Commit 4416cd97 authored by Markus Schedl

cleaned code to create clustering

parent d28ca3ef
@@ -18,19 +18,17 @@ import collections
import pickle as pkl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import cycle
from matplotlib import gridspec
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import OPTICS
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import silhouette_samples, silhouette_score
import sklearn.metrics.pairwise as pair_dist
from sklearn import metrics
from scipy import sparse
from itertools import cycle
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# GENERAL PARAMETERS USED FOR DATA FILTERING
@@ -49,18 +47,17 @@ DO_PCA = True # Before clustering, reduce country feature
CLUSTERING_TSNE = True # Perform t-SNE clustering
DO_OPTICS_FOR_TSNE = True # Find and visualize clusters using OPTICS after t-SNE has been created
CLUSTERING_AP = False # Perform clustering via Affinity Propagation
CLUSTERING_DBSCAN = False # Perform clustering via DBSCAN
DO_KMEANS_SILHOUETTE = False # Perform k-means and plot silhouettes for different number of clusters (k)
CREATE_PLOTS = True # Create all plots/visualizations
CREATE_PLOTS = False # Create all plots/visualizations
# PARAMETERS FOR PATHS
DATA_DIR = '../data/' #'../data_FiAI2020/' #'../data/' # directory containing the raw data
FEATURE_DIR = DATA_DIR + 'LEs_country_vectors_min' + str(MIN_LES_TRACK) + 'les/' #_artists # directory containing the preprocessed data (country feature vectors)
VISU_DIR = '../visu_FiAI2020/' #'../visu/' # output directory to write results/visualizations to
DATA_DIR = '../data_FiAI2020/' # Directory containing the raw data
FEATURE_DIR = DATA_DIR + 'LEs_country_vectors_min' + str(MIN_LES_TRACK) + 'les/' # Directory containing the preprocessed data (country feature vectors); can add "artists" if clustering is performed on artist level
VISU_DIR = '../visu_FiAI2020/' # Output directory to write results/visualizations to
# PARAMETERS FOR PREPROCESSING AND CLUSTERING ALGORITHMS
PCA_DIMS = 100 # Number of dimensions the data is reduced to by PCA
TSNE_PERPLEXITY = [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50] # List of perplexities to investigate for t-SNE #[3, 5] #[1, 2, 3, 5] #[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50] #[20, 25, 30]
TSNE_PERPLEXITY = [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40] # List of perplexities to investigate for t-SNE
OPTICS_MIN_CLUSTER_SIZE = 3 # Minimum number of data points per cluster for OPTICS
@@ -321,7 +318,9 @@ def compute_NPR(P, D, nn):
if __name__ == '__main__':
np.set_printoptions(precision=3, suppress=True)
# Perform data preprocessing
##############################
# Perform data preprocessing #
##############################
#
# Assumes that the following files are present in DATA_DIR:
# country_dist.txt: country_code \t no_les
@@ -342,7 +341,7 @@ if __name__ == '__main__':
t_les = pkl.load(fo)
else:
track_le = read_tracks_le(DATA_DIR + 'track_dist.txt')
# track_le = read_tracks_le(DATA_DIR + 'artist_dist.txt')
# track_le = read_tracks_le(DATA_DIR + 'artist_dist.txt') # To compute clusters on artist level instead of track level
print('Tracks read: %d' % len(track_le))
t_ids = np.array(list(track_le.keys()))
t_les = np.array(list(track_le.values()))
@@ -352,19 +351,19 @@ if __name__ == '__main__':
with open(DATA_DIR + 't_les.pkl', 'wb') as fo: # a_
pkl.dump(t_les, fo)
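# Illustrative sketch (an assumption, not part of this commit): read_tracks_le above
# presumably parses track_dist.txt, i.e. tab-separated lines of track id and LE count
# (analogous to the country_dist.txt format described in the header comment), into an
# ordered dict of track id -> number of LEs.
def read_tracks_le_sketch(fn):
    track_le = collections.OrderedDict()
    with open(fn, 'r') as f:
        for line in f:
            track_id, no_les = line.rstrip('\n').split('\t')
            track_le[track_id] = int(no_les)
    return track_le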
# Plot distribution of LEs per track
if CREATE_PLOTS:
print('Creating plot of track LE distribution')
handle_tpc_all, = plt.semilogy(range(len(t_les)), t_les, marker='.', color='b', markersize=1.0, linestyle='None')
plt.setp(handle_tpc_all, linewidth=.5)
plt.title('Distribution of listening events', fontsize=18)
plt.xlabel('Artist ID', fontsize=14) # Track ID
plt.xlabel('Track ID', fontsize=14)
#plt.xlabel('Artist ID', fontsize=14) # when computed on artist level
plt.ylabel('Number of listening events', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig(VISU_DIR + 'plot_artist_les.jpg', format='jpg', dpi=300)
#plt.savefig(VISU_DIR + 'plot_track_les.pdf', format='pdf', dpi=300)
plt.savefig(VISU_DIR + 'plot_track_les.pdf', format='pdf', dpi=300)
#plt.savefig(VISU_DIR + 'plot_artist_les.jpg', format='jpg', dpi=300) # when computed on artist level
plt.close()
# Create subset of tracks according to threshold MIN_LES_TRACK
@@ -374,29 +373,28 @@ if __name__ == '__main__':
t_ids_reduced = t_ids[idx_min_les]
print('No. tracks after filtering: %d' % len(t_les_reduced))
# Dump reduced ones as pkl files
with open(DATA_DIR + 't_ids_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo: # a_
with open(DATA_DIR + 't_ids_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo: # "a_ids_min" when computed on artist level
pkl.dump(t_ids_reduced, fo)
with open(DATA_DIR + 't_les_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo: # a_
with open(DATA_DIR + 't_les_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo: # "a_les_min" when computed on artist level
pkl.dump(t_les_reduced, fo)
# print(t_ids_reduced)
# print(t_les_reduced)
# print(t_ids_reduced)
# print(t_les_reduced)
# Read country track distribution file and filter with track-ids in subset (MIN_LES_TRACK)
country_track_le = read_filter_country_tracks_le(DATA_DIR + 'country_track_dist.txt', t_ids_reduced) # country_track_dist.txt # return dict (country, feature vector)
country_track_le = read_filter_country_tracks_le(DATA_DIR + 'country_track_dist.txt', t_ids_reduced)
# Store feature vector for each country in pkl file
for cty, feat_vec in country_track_le.items():
print('Storing feature vector of country %s; sum of LEs: %d' % (cty, np.sum(feat_vec)))
with open(DATA_DIR + 'les_' + cty + '_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo:
pkl.dump(feat_vec, fo)
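# Illustrative sketch (an assumption about the helper called above, whose definition is
# not shown in this diff): read_filter_country_tracks_le presumably aggregates
# country_track_dist.txt into one LE-count vector per country, aligned with the filtered
# track ids; the "country \t track_id \t no_les" line format is an assumption.
def read_filter_country_tracks_le_sketch(fn, track_ids):
    idx = {tid: i for i, tid in enumerate(track_ids)}
    country_vecs = {}
    with open(fn, 'r') as f:
        for line in f:
            cty, tid, no_les = line.rstrip('\n').split('\t')
            if tid in idx:
                if cty not in country_vecs:
                    country_vecs[cty] = np.zeros(len(track_ids), dtype=np.int64)
                country_vecs[cty][idx[tid]] += int(no_les)
    return country_vecs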
# exit()
# Read number of users for each country (for filtering)
cty_numusers = read_country_number_users(DATA_DIR + 'country_user_dist.txt')
#
# Perform clustering experiments
#
##################################
# Perform clustering experiments #
##################################
if DO_CLUSTERING:
# Load country feature vectors
feature_files = [f for f in glob.glob(FEATURE_DIR + "/*.pkl", recursive=True)]
@@ -413,11 +411,11 @@ if __name__ == '__main__':
country_features_list.append(pkl.load(fi))
print('Feature vector size: %d' % len(country_features_list[0]))
### Iterate through all elements in feature_vecs, normalize, and write to numpy array
# Iterate through all elements in feature vectors, normalize, and write to numpy array
# Only include countries that fulfill certain criteria:
# minimum number of LEs
# minimum number of unique tracks
# minimum number of users/listeners
# minimum number of LEs (MIN_LES_COUNTRY)
# minimum number of unique tracks (MIN_UNIQUE_TRACKS_COUNTRY)
# minimum number of users/listeners (MIN_USERS_COUNTRY)
country_ids_include = [] # to hold country ids fulfilling this
overall_le_sum = 0 # total sum of LEs of all countries
overall_le_sum_included_country = 0 # total sum of LEs of included countries
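# Illustrative sketch (an assumption; the filter loop itself lies partly outside this
# diff's context): the per-country inclusion test presumably combines the three
# thresholds on LEs, unique tracks, and users along these lines.
def country_passes_filter_sketch(feat_vec, num_users):
    le_sum = np.sum(feat_vec)
    uniq_tracks = np.count_nonzero(feat_vec)
    return (le_sum >= MIN_LES_COUNTRY
            and uniq_tracks >= MIN_UNIQUE_TRACKS_COUNTRY
            and num_users >= MIN_USERS_COUNTRY)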
@@ -434,22 +432,21 @@ if __name__ == '__main__':
print('Excluded country: %s; Users: %d; LE-sum: %d; unique tracks: %d' % (country_labels[c], num_users, le_sum, uniq_tracks))
print('Number of countries with #LE >= %d and #min LEs >=%d and #unique tracks >= %d and #users >= %d: %d' % (MIN_LES_COUNTRY, MIN_LES_TRACK, MIN_UNIQUE_TRACKS_COUNTRY, MIN_USERS_COUNTRY, len(country_ids_include)))
print('#LE: %d (all countries); %d, %.6f (included countries)' % (overall_le_sum, overall_le_sum_included_country, np.float32(overall_le_sum_included_country)/np.float32(overall_le_sum) ))
# exit()
# Init feature vector matrix
country_features = np.zeros([len(country_ids_include), len(country_features_list[0])], dtype=np.float32)
country_le_sums = np.zeros([len(country_ids_include)], dtype=np.float32)
#country_features = np.zeros([len(country_features_list), len(country_features_list[0])], dtype=np.float32)
#country_le_sums = np.zeros([len(country_features_list)], dtype=np.float32)
country_labels_full = country_labels
country_labels = []
# Construct feature vectors and normalize
for c in range(0, len(country_ids_include)):
# Sum LEs per country
le_sum = np.sum(country_features_list[country_ids_include[c]])
country_le_sums[c] = le_sum
# Construct features and normalize
# Construct and normalize feature vector for country
country_features[c, :] = country_features_list[country_ids_include[c]] / le_sum
country_labels.append(country_labels_full[country_ids_include[c]])
#print(country_features.shape)
for i in range(0, len(country_labels)):
for i in range(0, len(country_labels)): # Output feature vectors
print(country_labels[i], country_features[i], np.sum(country_features[i]))
# Plot distribution of LEs per country
@@ -465,7 +462,6 @@ if __name__ == '__main__':
plt.savefig(VISU_DIR + 'plot_country_les.jpg', format='jpg', dpi=300)
#plt.savefig(VISU_DIR + 'plot_country_les.pdf', format='pdf', dpi=300)
plt.close()
# Plot distribution of users per country
if CREATE_PLOTS:
print('Creating plot of country user distribution')
@@ -482,7 +478,6 @@ if __name__ == '__main__':
#plt.savefig(VISU_DIR + 'plot_country_les.pdf', format='pdf', dpi=300)
plt.close()
# Perform PCA if demanded
if DO_PCA:
print('Performing PCA / Truncated SVD to reduce number of dimensions to %d' % PCA_DIMS)
@@ -502,14 +497,14 @@ if __name__ == '__main__':
init = 'pca', random_state = 169, method = 'barnes_hut', angle = 0.5)
# init = 'random', random_state = 169, method = 'barnes_hut', angle = 0.5)
Y = tsne.fit_transform(X)
method = 'T-SNE (perp=' + str(perp) + '); PCA dim=' + str(PCA_DIMS) + ', LE item min=' + str(MIN_LES_TRACK) + ', LE country min=' + str(MIN_LES_COUNTRY) + ', users country min=' + str(MIN_USERS_COUNTRY) #', unique items-country min=' + str(MIN_UNIQUE_TRACKS_COUNTRY) +
method = 'T-SNE (perp=' + str(perp) + '); PCA dim=' + str(PCA_DIMS) + ', LE item min=' + str(MIN_LES_TRACK) + ', LE country min=' + str(MIN_LES_COUNTRY) + ', users country min=' + str(MIN_USERS_COUNTRY)
if CREATE_PLOTS:
plot_mapping(Y, method, country_labels, country_le_sums)
# Compute neighborhood preservation ratio
for nn in [1, 3, 5, 10, 25, 50, perp]: #, 100
for nn in [1, 3, 5, 10, 25, 50, perp]:
print('t-SNE (perplexity=%d), NPR(nn=%d): %.3f%%' % (perp, nn, compute_NPR(pair_dist.cosine_distances(X), pair_dist.cosine_distances(Y), nn)*100.0))
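# Illustrative sketch (an assumption; the actual compute_NPR is defined earlier in this
# file and not shown in this diff): a neighborhood preservation ratio typically measures,
# for each point, how many of its nn nearest neighbors under the high-dimensional
# distances P also appear among its nn nearest neighbors under the low-dimensional
# distances D, averaged over all points.
def compute_NPR_sketch(P, D, nn):
    n = P.shape[0]
    preserved = 0.0
    for i in range(n):
        nn_high = set(np.argsort(P[i])[1:nn + 1])  # skip the point itself at rank 0
        nn_low = set(np.argsort(D[i])[1:nn + 1])
        preserved += len(nn_high & nn_low) / float(nn)
    return preserved / n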
# Plot silhouette for different k's in k-means
# Plot silhouette for different values of k in k-means
if DO_KMEANS_SILHOUETTE:
if CREATE_PLOTS:
plot_silhouette(Y, method, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
@@ -517,31 +512,24 @@ if __name__ == '__main__':
# Perform OPTICS on top of t-SNE
if DO_OPTICS_FOR_TSNE:
optics = OPTICS(min_samples=OPTICS_MIN_CLUSTER_SIZE, max_eps=np.inf, metric='euclidean').fit(Y) # remaining (default) parameters: p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None
#optics = OPTICS(min_samples=OPTICS_MIN_CLUSTER_SIZE, max_eps=np.inf, metric='cosine').fit(Y) # alternative using cosine distance
reachability = optics.reachability_[optics.ordering_]
# Output clusters
labels = optics.labels_[optics.ordering_]
ordering = optics.ordering_
print(labels)
print(ordering)
# Output countries in each cluster
for cl in np.unique(labels):
print('Cluster %d: ' % cl, end="")
idx_cl = np.where(labels == cl)[0]
for i in idx_cl:
print(country_labels[ordering[i]] + ' ', end="")
print('')
# Compute silhouette coefficient
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
# clusterer = KMeans(n_clusters=10, random_state=10)
# cluster_labels = clusterer.fit_predict(X)
# Compute silhouette coefficient according to https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
silhouette_avg = silhouette_score(Y, optics.labels_) # use labels in the original sample order; the labels variable above is permuted by optics.ordering_
print('Silhouette score average: %.3f' % silhouette_avg)
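# Illustrative sketch (not part of this commit): per-cluster silhouette values can be
# inspected with silhouette_samples, which is already imported above; labels are taken
# in the original sample order (optics.labels_) so they line up with the rows of Y.
sample_silhouettes = silhouette_samples(Y, optics.labels_)
for cl in np.unique(optics.labels_):
    print('Cluster %d: mean silhouette %.3f' % (cl, np.mean(sample_silhouettes[optics.labels_ == cl])))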
if CREATE_PLOTS:
# Visualize t-SNE mapping including color-coded OPTICS clusters
datapoint_scaling = 1.0
# datapoint_scaling = 0.5
datapoint_scaling = 1.0 # 0.5
datapoint_size_min = 8 * datapoint_scaling
datapoint_size_max = 32 * datapoint_scaling
dp_max = np.max(country_le_sums)
@@ -552,27 +540,28 @@ if __name__ == '__main__':
colors_all = []
for cl in range(0, len(np.unique(labels))-1):
Xk = Y[optics.labels_ == cl]
c = next(colors) # Next color for every new class/label
c = next(colors) # next color for every new class/label
colors_cl.append(c)
plt.scatter(Xk[:, 0], Xk[:, 1], color=c, s=datapoint_size_min * datapoint_scaling * 0.25)
#plt.plot(Xk[:, 0], Xk[:, 1], '.', color=c) #, alpha=0.3)
plt.scatter(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], marker='+', color=c, s=datapoint_size_min * datapoint_scaling, alpha=0.1)
#plt.plot(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], 'k+', alpha=0.1)
# print(colors_cl)
for i in range(0, len(ordering)):
# plt.plot(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], 'k+', alpha=0.1)
# print(colors_cl)
for i in range(0, len(ordering)): # add country labels to plot
if labels[i] == -1:
color_cur = 'b'
alpha_cur = 0.1
else:
# print(labels[i])
# print(labels[i])
color_cur = colors_cl[labels[i]]
alpha_cur = 1.0
colors_all.append(color_cur)
plt.text(Y[ordering[i], 0], Y[ordering[i], 1], country_labels[ordering[i]], fontsize=dp_sizes[ordering[i]], color=color_cur, alpha=alpha_cur)
# Scale (use 2nd lowest and 2nd highest to drop outliers)
# plt.xticks(np.arange(np.sort(Y[:, 0])[1], np.sort(Y[:, 0])[-2], 1))
# plt.yticks(np.arange(np.sort(Y[:, 1])[1], np.sort(Y[:, 1])[-2], 1))
# Scale visualization (use 2nd lowest and 2nd highest value to drop outliers)
# NB: This is a hack to create "zoomed-in" plots for the article
# plt.xticks(np.arange(np.sort(Y[:, 0])[1], np.sort(Y[:, 0])[-2], 1))
# plt.yticks(np.arange(np.sort(Y[:, 1])[1], np.sort(Y[:, 1])[-2], 1))
plt.title('OPTICS (cluster size min=' + str(OPTICS_MIN_CLUSTER_SIZE) + '); ' + method)
plt.axis('Off')
plt.grid(False)
@@ -580,10 +569,10 @@ if __name__ == '__main__':
plt.savefig(VISU_DIR + 'OPTICS (cluster size min=' + str(OPTICS_MIN_CLUSTER_SIZE) + '); ' + method + ', datapoint scaling=' + str(datapoint_scaling) + '.pdf', format='pdf', dpi=300)
plt.close()
### Perform AP clustering
if CLUSTERING_AP:
af = AffinityPropagation().fit(X) #convergence_iter=50, max_iter=1000).fit(X)
# Create clustering
af = AffinityPropagation().fit(X) # additional parameters: convergence_iter=50, max_iter=1000
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
@@ -617,56 +606,18 @@ if __name__ == '__main__':
fig = plt.figure(figsize=(11.69, 8.27))
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
# print k, col
data = X[cluster_centers_indices[k]]
ax = fig.add_subplot(gs[k])
handle_tpc, = ax.plot(range(len(data)), data, marker='.', color=col, markersize=1.0, linestyle='None')
ax.set_title('Cluster %s' % (str(k)))
# ax.text(.5, .9, 'Prototype for cluster %s' % (str(k)), horizontalalignment='center', transform=ax.transAxes)
# ax.text(.5, .9, 'Prototype for cluster %s' % (str(k)), horizontalalignment='center', transform=ax.transAxes)
plt.setp(handle_tpc, linewidth=.5)
# plt.title('Prototype for cluster %s' % (str(k)), fontsize=18)
# plt.xlabel('Artists', fontsize=14)
# plt.ylabel(t, fontsize=14)
# plt.grid(True)
# plt.title('Prototype for cluster %s' % (str(k)), fontsize=18)
# plt.xlabel('Artists', fontsize=14)
# plt.ylabel(t, fontsize=14)
# plt.grid(True)
fig.suptitle('Country Prototypes', fontsize=14)
gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])
# fig.savefig(VISU_DIR + 'clusters_prototypes.pdf', format='pdf', dpi=300)
fig.savefig(VISU_DIR + 'clusters_prototypes_AP.jpg', format='jpg', dpi=600)
# fig.savefig(VISU_DIR + 'clusters_prototypes.pdf', format='pdf', dpi=300)
plt.close()
### Perform DBSCAN clustering
if CLUSTERING_DBSCAN:
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()