#
# This is a script to cluster countries according to the music listening preferences and behaviors of their inhabitants,
# using the LFM-1b dataset of listening events (LEs) created by Last.fm users.
#
# This code accompanies the following journal article. Please consider reading it for background information and to
# obtain better insights into the research behind it.
#
# Markus Schedl, Christine Bauer, Wolfgang Reisinger, Dominik Kowald, and Elisabeth Lex
# Listener Modeling and Context-aware Music Recommendation Based on Country Archetypes
# Frontiers in Artificial Intelligence - Machine Learning and Artificial Intelligence, 2020
#
__author__ = 'Markus Schedl'

import numpy as np
import os
import glob
import collections
import pickle as pkl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import cycle
from matplotlib import gridspec
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import OPTICS
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import silhouette_samples, silhouette_score
import sklearn.metrics.pairwise as pair_dist
from sklearn import metrics
from scipy import sparse


# GENERAL PARAMETERS USED FOR DATA FILTERING
MIN_LES_TRACK = 1000                # This is the most important parameter: minimum number of LEs per track to have the track included;
                                    # 100 results in 1,012,961 unique tracks,
                                    # 1000 results in 122,442 unique tracks
MIN_LES_COUNTRY = 80000             # Minimum number of LEs per country to be considered in the clustering (other suited values: 350000, 120000)
MIN_UNIQUE_TRACKS_COUNTRY = 1000    # Minimum number of unique tracks per country to be considered in the clustering
MIN_USERS_COUNTRY = 25 #150         # Minimum number of users per country to be considered in the clustering

# BOOLEAN PARAMETERS TO CONTROL THE STEPS CARRIED OUT BY THE SCRIPT
DO_PREPROCESSING = False            # Perform preprocessing (needs to be set to True when running the script for the first time;
                                    # assumes that the raw data is available in path DATA_DIR)
DO_CLUSTERING = True                # Conduct clustering experiments (assumes preprocessing has been performed already)
DO_PCA = True                       # Before clustering, reduce country feature matrix using PCA / Truncated SVD
CLUSTERING_TSNE = True              # Compute t-SNE embedding of the country feature space
DO_OPTICS_FOR_TSNE = True           # Find and visualize clusters using OPTICS after the t-SNE embedding has been created
CLUSTERING_AP = False               # Perform clustering via Affinity Propagation
DO_KMEANS_SILHOUETTE = False        # Perform k-means and plot silhouettes for different number of clusters (k)
CREATE_PLOTS = False                 # Create all plots/visualizations

# PARAMETERS FOR PATHS
DATA_DIR = '../data_FiAI2020/'      # Directory containing the raw data
FEATURE_DIR = DATA_DIR + 'LEs_country_vectors_min' + str(MIN_LES_TRACK) + 'les/'    # Directory containing the preprocessed data (country feature vectors); can add "artists" if clustering is performed on artist level
VISU_DIR = '../visu_FiAI2020/'      # Output directory to write results/visualizations to

# PARAMETERS FOR PREPROCESSING AND CLUSTERING ALGORITHMS
PCA_DIMS = 100                      # Number of dimensions the data is reduced to by PCA
TSNE_PERPLEXITY = [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40]      # List of perplexities to investigate for t-SNE
OPTICS_MIN_CLUSTER_SIZE = 3         # Minimum number of data points per cluster for OPTICS



# Read file fn containing pairs of tracks and listening events
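# An illustrative (hypothetical) input line, tab-separated as in track_dist.txt:
#   4711\t1234   -> track with id 4711 accumulated 1234 listening events in total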
def read_tracks_le(fn):
    track_le = {}
    # no_lines = sum(1 for line in open(fn, 'r'))
    # no_lines = len(open(fn).read().splitlines())
    # print(no_lines)
    counter = 0
    with open(fn, 'r') as f:
        for line in f:
            content = line.strip().split('\t')
            if len(content) == 2:
                track_le[np.int32(content[0])] = np.int32(content[1])
            else:
                print('Problem encountered with ', content)
            if np.mod(counter, 100000) == 0:
                print('%d lines read from %s' % (counter, fn))
            counter += 1

    return collections.OrderedDict(track_le)

# Read file fn that contains the number of users for each country
def read_country_number_users(fn):
    country_users = {}
    counter = 0
    with open(fn, 'r') as f:
        for line in f:
            content = line.strip().split('\t')
            if len(content) == 2:
                country = content[0]
                # Parse lines
                uid_les_all = content[1].split(';')
                # print('Number of users for country %s: %d' % (content[0], len(uid_les_all)))
                country_users[country] = len(uid_les_all)-1
                for idx_uid_les in range(0, len(uid_les_all) - 1):          # All users in current country
                    uid_les = uid_les_all[idx_uid_les]
                    # print(uid_les)
                    uid_les_split = uid_les.split(' ')
                    uid = np.int32(uid_les_split[0])
                    les = np.int32(uid_les_split[1])
                # print(uid, les)
            else:
                print('Problem encountered with ', content)
            if np.mod(counter, 100000) == 0:
                print('%d lines read from %s' % (counter, fn))
            counter += 1

    return collections.OrderedDict(country_users)

# Read file fn containing country, tracks, and LEs; and filter data using the list of track-ids in filter_tids
def read_filter_country_tracks_le(fn, filter_tids):
    country_track_le = {}
    with open(fn, 'r') as f:
        for line in f:                              # For all countries
            # Initialize country-specific feature vector (containing LEs)
            country_feature_vec = np.zeros([len(filter_tids)], dtype=np.int32)
            # Start parsing
            content = line.strip().split('\t')
            if len(content) == 2:
                cty = content[0]
                # Parse lines
                tid_les_all = content[1].split(';')
                print('Number of tracks for country %s: %d' % (content[0], len(tid_les_all)))
                t_counter = 0
                for idx_tid_les in range(0, len(tid_les_all)-1):
                    tid_les = tid_les_all[idx_tid_les]
                    #print(tid_les)
                    tid_les_split = tid_les.split(' ')
                    tid = np.int32(tid_les_split[0])
                    les = np.int32(tid_les_split[1])
                    # Filter using track-ids in filter_tids
                    if tid in filter_tids:          # Only keep entries whose track-id is in the filtered track-id list
                        # Output track-id, country-wise, and global LEs
                        idx_tid = np.where(tid == filter_tids)
                        #print('t_id: %d \t le_country: %d \t le_global: %d' % (tid, les, t_les_reduced[idx_tid]))
                        country_feature_vec[idx_tid] = les
                    if np.mod(t_counter, 10000) == 0:
                        print('%d tracks processed of country %s' % (t_counter, cty))
                    t_counter += 1

                # Store country feature vector into dictionary
                country_track_le[cty] = country_feature_vec
                print('Number of tracks for country %s after filtering: %d' % (cty, len(np.where(country_feature_vec > 0)[0])))
            else:
                print('Problem encountered with ', content)

    return country_track_le

# Creates visualization of clustering
# Y: coordinates of data points (as output of t-SNE, for instance)
# method: used as title of the created plot
# labels: labels to show next to each data point (here: country labels)
# datapoint_sizes: list of sizes for each data point (if omitted, a fixed size is used)
# colors: list of colors for each data point (if omitted, all points are plotted in blue)
def plot_mapping(Y, method, labels = [], datapoint_sizes = [], colors = []):
    tsne_datapoint_size = 40            # fixed data point size if no information in datapoint_sizes is provided
    tsne_datapoint_size_min = 8
    tsne_datapoint_size_max = 40

    # Create figure
    plt.figure(figsize=(11.69, 8.27))

    # If data point size is given -> scale values
    if len(datapoint_sizes) > 0:
        dp_max = np.max(datapoint_sizes)
        dp_sizes = tsne_datapoint_size_min + ((datapoint_sizes / dp_max) * (tsne_datapoint_size_max - tsne_datapoint_size_min))
    else:   # If not, use fixed size for all data points
        dp_sizes = np.repeat(tsne_datapoint_size, len(labels))

    # Plot results
    if len(labels) > 0:        # If we have labels, print data items with same label in same color
        # Iterate over all classes and plot respective items with corresponding class color
        # Print text / country labels
        for y_idx in range(0, len(labels)):
            if len(colors) == len(labels):          # if color mapping provided, use it
                color_cur = colors[y_idx]
            else:
                color_cur = 'b'
            plt.scatter(Y[y_idx, 0], Y[y_idx, 1], color=color_cur, s=dp_sizes[y_idx])  # Add scatter plot
            plt.text(Y[y_idx, 0], Y[y_idx, 1], labels[y_idx], fontsize=dp_sizes[y_idx], color=color_cur) # colors_defined[labels[y_idx]])
    else:                       # No labels -> all data points in same color
        plt.scatter(Y[:, 0], Y[:, 1], color='b')
        #OR: plt.plot(Y[:, 0], Y[:, 1], 'o', color='r')

    # Add title and legend
    plt.title(method)
    #plt.legend(fontsize=8)
    # Save as PDF (without axes and grid)
    plt.axis('off')
    plt.grid(False)
    plt.tight_layout()
    plt.savefig(VISU_DIR + method + '.pdf', format='pdf', dpi=300)
    plt.close()
    # plt.show()
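
# Hypothetical usage sketch for plot_mapping (not executed by this script);
# Y can be any 2-D embedding, e.g. the output of TSNE().fit_transform(X):
#   Y_demo = np.random.RandomState(0).randn(4, 2)
#   plot_mapping(Y_demo, 'demo-mapping', labels=['AT', 'BR', 'JP', 'US'],
#                datapoint_sizes=np.array([10., 20., 30., 40.]))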

# Plot silhouette to determine the number of clusters for clustering algorithms (k-means)
# X: input data
# method: string used in file name for plot
# range_n_clusters: range of numbers of clusters to evaluate
def plot_silhouette(X, method='', range_n_clusters=[2, 3, 4, 5, 6]):
    # Code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.savefig(VISU_DIR + 'kmeans-silhouette (k=' + str(n_clusters) +'); ' + method + '.jpg', format='jpg', dpi=300)
        plt.close()
        #plt.show()

# Compute neighborhood preservation ratio (NPR)
# P: proximity matrix (input space)
# D: distance matrix (output space)
# nn: number of nearest neighbors to consider
def compute_NPR(P, D, nn):
    # Return -1 if size of P and D is different
    if P.shape != D.shape:
        return -1

    npr = 0
    # For all data points
    for i in range(0, P.shape[0]):
        idx_sorted_P = np.argsort(P[i])
        idx_sorted_D = np.argsort(D[i])
        nn_P = idx_sorted_P[1:nn+1]         # Get nn nearest items to P_i
        nn_D = idx_sorted_D[1:nn+1]       # Get nn nearest items to D_i

        # Compute NPR for current data point
        npr_i = len(np.intersect1d(nn_P, nn_D)) / np.float32(nn)
        npr = npr + npr_i / P.shape[0]

    return npr
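
# Illustrative sanity check for compute_NPR (hypothetical, not executed by this script):
# if the output space preserves all neighborhoods exactly, NPR equals 1.0, e.g.:
#   pts = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
#   P = pair_dist.euclidean_distances(pts)
#   compute_NPR(P, P, 1)   # -> 1.0 (identical proximity and distance matrices)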


# Main script
if __name__ == '__main__':
    np.set_printoptions(precision=3, suppress=True)

    ##############################
    # Perform data preprocessing #
    ##############################
    #
    # Assumes that the following files are present in DATA_DIR:
    # country_dist.txt: country_code \t no_les
    # country_track_dist.txt: country_code \t track_id SPACE no_les ; track_id SPACE no_les ; ...
    # country_user_dist.txt: country_code \t user_id SPACE no_les ; user_id SPACE no_les ; ...
    #
    # The following files are optional:
    # user_dist.txt: user_id \t no_les
    # track_dist.txt: track_id \t no_les
    # user_track_dist.txt: user_id \t track_id SPACE no_les ; track_id SPACE no_les ; ...
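    #
    # Illustrative (hypothetical) sample lines for the three required files
    # (note the trailing ';' the parsers below expect):
    #   country_dist.txt:        AT\t1548761
    #   country_track_dist.txt:  AT\t4711 123;4712 98;
    #   country_user_dist.txt:   AT\t31337 2048;31338 512;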
    if DO_PREPROCESSING:
        # Load or create tracks and LEs file
        if os.path.exists(DATA_DIR + 't_ids.pkl'):  # a_
            print('Loading tracks and LEs from pickle files')
            with open(DATA_DIR + 't_ids.pkl', 'rb') as fo:  # a_
                t_ids = pkl.load(fo)
            with open(DATA_DIR + 't_les.pkl', 'rb') as fo:  # a_
                t_les = pkl.load(fo)
        else:
            track_le = read_tracks_le(DATA_DIR + 'track_dist.txt')
#            track_le = read_tracks_le(DATA_DIR + 'artist_dist.txt')            # To compute clusters on artist level instead of track level
            print('Tracks read: %d' % len(track_le))
            t_ids = np.array(list(track_le.keys()))
            t_les = np.array(list(track_le.values()))
            # ...and dump as pkl file
            with open(DATA_DIR + 't_ids.pkl', 'wb') as fo:      # a_
                pkl.dump(t_ids, fo)
            with open(DATA_DIR + 't_les.pkl', 'wb') as fo:      # a_
                pkl.dump(t_les, fo)

        # Plot distribution of LEs per user
        if CREATE_PLOTS:
            print('Creating plot of track LE distribution')
            handle_tpc_all, = plt.semilogy(range(len(t_les)), t_les, marker='.', color='b', markersize=1.0, linestyle='None')
            plt.setp(handle_tpc_all, linewidth=.5)
            plt.title('Distribution of listening events', fontsize=18)
            plt.xlabel('Track ID', fontsize=14)
            #plt.xlabel('Artist ID', fontsize=14)                                   # when computed on artist level
            plt.ylabel('Number of listening events', fontsize=14)
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(VISU_DIR + 'plot_track_les.pdf', format='pdf', dpi=300)
            #plt.savefig(VISU_DIR + 'plot_artist_les.jpg', format='jpg', dpi=300)   # when computed on artist level
            plt.close()

        # Create subset of tracks according to threshold MIN_LES_TRACK
        print('Creating subset of tracks with LE >= %d' % MIN_LES_TRACK)
        idx_min_les = np.where(t_les >= MIN_LES_TRACK)
        t_les_reduced = t_les[idx_min_les]
        t_ids_reduced = t_ids[idx_min_les]
        print('No. tracks after filtering: %d' % len(t_les_reduced))
        # Dump reduced ones as pkl files
        with open(DATA_DIR + 't_ids_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo:     # "a_ids_min" when computed on artist level
            pkl.dump(t_ids_reduced, fo)
        with open(DATA_DIR + 't_les_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo:     # "a_les_min" when computed on artist level
            pkl.dump(t_les_reduced, fo)
        # print(t_ids_reduced)
        # print(t_les_reduced)

        # Read country track distribution file and filter with track-ids in subset (MIN_LES_TRACK)
        country_track_le = read_filter_country_tracks_le(DATA_DIR + 'country_track_dist.txt', t_ids_reduced)
        # Store feature vector for each country in pkl file
        for cty, feat_vec in country_track_le.items():
            print('Storing feature vector of country %s; sum of LEs: %d' % (cty, np.sum(feat_vec)))
            with open(DATA_DIR + 'les_' + cty + '_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo:
                pkl.dump(feat_vec, fo)

    # Read number of users for each country (for filtering)
    cty_numusers = read_country_number_users(DATA_DIR + 'country_user_dist.txt')


    ##################################
    # Perform clustering experiments #
    ##################################
    if DO_CLUSTERING:
        # Load country feature vectors
        feature_files = [f for f in glob.glob(FEATURE_DIR + "/*.pkl", recursive=True)]
        feature_vecs = {}
        #        print('Loading tracks and LEs from pickle files')
        #        print(FEATURE_DIR)
        country_labels = []
        country_features_list = []
        for f in feature_files:
            print('Loading tracks and LEs from pickle file %s' % f)
            with open(f, 'rb') as fi:
                cty = os.path.basename(os.path.normpath(f)).split('_')[1]  # extract country name from file name
                country_labels.append(cty)
                country_features_list.append(pkl.load(fi))
            print('Feature vector size: %d' % len(country_features_list[-1]))

        # Iterate through all elements in feature vectors, normalize, and write to numpy array
        # Only include countries that fulfill certain criteria:
        # minimum number of LEs (MIN_LES_COUNTRY)
        # minimum number of unique tracks (MIN_UNIQUE_TRACKS_COUNTRY)
        # minimum number of users/listeners (MIN_USERS_COUNTRY)
        country_ids_include = []        # to hold ids of countries fulfilling these criteria
        overall_le_sum = 0                      # total sum of LEs of all countries
        overall_le_sum_included_country = 0     # total sum of LEs of included countries
        for c in range(0, len(country_features_list)):
            le_sum = np.sum(country_features_list[c])
            uniq_tracks = np.sum(np.sign(country_features_list[c]))
            num_users = cty_numusers[country_labels[c]]
            overall_le_sum += le_sum
            if le_sum >= MIN_LES_COUNTRY and uniq_tracks >= MIN_UNIQUE_TRACKS_COUNTRY and num_users >= MIN_USERS_COUNTRY:
                print('Included country: %s; Users: %d; LE-sum: %d; unique tracks: %d' % (country_labels[c], num_users, le_sum, uniq_tracks))
                country_ids_include.append(c)
                overall_le_sum_included_country += le_sum
            else:
                print('Excluded country: %s; Users: %d; LE-sum: %d; unique tracks: %d' % (country_labels[c], num_users, le_sum, uniq_tracks))
        print('Number of countries with #LE >= %d and #min LEs >=%d and #unique tracks >= %d and #users >= %d: %d' % (MIN_LES_COUNTRY, MIN_LES_TRACK, MIN_UNIQUE_TRACKS_COUNTRY, MIN_USERS_COUNTRY, len(country_ids_include)))
        print('#LE: %d (all countries); %d, %.6f (included countries)' % (overall_le_sum, overall_le_sum_included_country, np.float32(overall_le_sum_included_country)/np.float32(overall_le_sum) ))

        # Init feature vector matrix
        country_features = np.zeros([len(country_ids_include), len(country_features_list[0])], dtype=np.float32)
        country_le_sums = np.zeros([len(country_ids_include)], dtype=np.float32)
        country_labels_full = country_labels
        country_labels = []
        # Construct feature vectors and normalize
        for c in range(0, len(country_ids_include)):
            # Sum LEs per country
            le_sum = np.sum(country_features_list[country_ids_include[c]])
            country_le_sums[c] = le_sum
            # Construct and normalize feature vector for country
            country_features[c, :] = country_features_list[country_ids_include[c]] / le_sum
            country_labels.append(country_labels_full[country_ids_include[c]])
        for i in range(0, len(country_labels)):                 # Output feature vectors
            print(country_labels[i], country_features[i], np.sum(country_features[i]))

        # Plot distribution of LEs per country
        if CREATE_PLOTS:
            print('Creating plot of country LE distribution')
            handle_tpc_all, = plt.semilogy(range(len(country_le_sums)), np.sort(country_le_sums)[::-1], marker='.', color='b', markersize=1.0, linestyle='None')
            plt.setp(handle_tpc_all, linewidth=.5)
            plt.title('Distribution of listening events', fontsize=18)
            plt.xlabel('Country ID', fontsize=14)
            plt.ylabel('Number of listening events', fontsize=14)
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(VISU_DIR + 'plot_country_les.jpg', format='jpg', dpi=300)
            #plt.savefig(VISU_DIR + 'plot_country_les.pdf', format='pdf', dpi=300)
            plt.close()
        # Plot distribution of users per country
        if CREATE_PLOTS:
            print('Creating plot of country user distribution')
            country_numusers = np.fromiter(cty_numusers.values(), dtype=np.int32)
            handle_tpc_all, = plt.semilogy(range(len(country_numusers)), np.sort(country_numusers)[::-1], marker='.', color='b', markersize=1.0, linestyle='None')
            print(np.sort(country_numusers)[::-1])
            plt.setp(handle_tpc_all, linewidth=.5)
            plt.title('Distribution of users', fontsize=18)
            plt.xlabel('Country ID', fontsize=14)
            plt.ylabel('Number of users', fontsize=14)
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(VISU_DIR + 'plot_country_users.jpg', format='jpg', dpi=300)
            #plt.savefig(VISU_DIR + 'plot_country_les.pdf', format='pdf', dpi=300)
            plt.close()

        # Perform PCA if demanded
        if DO_PCA:
            print('Performing PCA / Truncated SVD to reduce number of dimensions to %d' % PCA_DIMS)
            svd = TruncatedSVD(n_components=PCA_DIMS, n_iter=10, random_state=169)
            #svd = PCA(n_components=PCA_DIMS, random_state=169)
            X = svd.fit_transform(country_features)
            print('Explained variance: ', svd.explained_variance_ratio_)
            print('Sum of explained variance: ', svd.explained_variance_ratio_.sum())
        else:
            X = country_features

        ### Compute t-SNE embedding (clusters in the embedding are identified below via OPTICS)
        if CLUSTERING_TSNE:
            for perp in TSNE_PERPLEXITY:
                tsne = TSNE(n_components=2, verbose=0, perplexity=perp, early_exaggeration=12.0, learning_rate=200.0,
                            n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='cosine',
                            init = 'pca', random_state = 169, method = 'barnes_hut', angle = 0.5)
                            # init = 'random', random_state = 169, method = 'barnes_hut', angle = 0.5)
                Y = tsne.fit_transform(X)
                method = 'T-SNE (perp=' + str(perp) + '); PCA dim=' + str(PCA_DIMS) + ', LE item min=' + str(MIN_LES_TRACK) + ', LE country min=' + str(MIN_LES_COUNTRY) + ', users country min=' + str(MIN_USERS_COUNTRY)
                if CREATE_PLOTS:
                    plot_mapping(Y, method, country_labels, country_le_sums)
                # Compute neighborhood preservation ratio
                for nn in [1, 3, 5, 10, 25, 50, perp]:
                    print('t-SNE (perplexity=%d), NPR(nn=%d): %.3f%%' % (perp, nn, compute_NPR(pair_dist.cosine_distances(X), pair_dist.cosine_distances(Y), nn)*100.0))

                # Plot silhouette for different ks in k-means
                if DO_KMEANS_SILHOUETTE:
                    if CREATE_PLOTS:
                        plot_silhouette(Y, method, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])

                # Perform OPTICS on top of t-SNE
                if DO_OPTICS_FOR_TSNE:
                    optics = OPTICS(min_samples=OPTICS_MIN_CLUSTER_SIZE, max_eps=np.inf, metric='euclidean').fit(Y)
                    # Remaining defaults: p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05,
                    # predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None
                    reachability = optics.reachability_[optics.ordering_]
                    # Output clusters
                    labels = optics.labels_[optics.ordering_]
                    ordering = optics.ordering_
                    # Output countries in each cluster
                    for cl in np.unique(labels):
                        print('Cluster %d: ' % cl, end="")
                        idx_cl = np.where(labels == cl)[0]
                        for i in idx_cl:
                            print(country_labels[ordering[i]] + ' ', end="")
                        print('')
                    # Compute silhouette coefficient according to https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
                    silhouette_avg = silhouette_score(Y, labels)
                    print('Silhouette score average: %.3f' % silhouette_avg)

                    if CREATE_PLOTS:
                        # Visualize t-SNE mapping including color-coded OPTICS clusters
                        datapoint_scaling = 1.0     # 0.5
                        datapoint_size_min = 8 * datapoint_scaling
                        datapoint_size_max = 32 * datapoint_scaling
                        dp_max = np.max(country_le_sums)
                        dp_sizes = datapoint_size_min + ((country_le_sums / dp_max) * (datapoint_size_max - datapoint_size_min))
                        plt.figure(figsize=(11.69, 8.27))
                        colors = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(labels)))))
                        colors_cl = []
                        colors_all = []
                        for cl in range(0, len(np.unique(labels))-1):
                            Xk = Y[optics.labels_ == cl]
                            c = next(colors)        # next color for every new class/label
                            colors_cl.append(c)
                            plt.scatter(Xk[:, 0], Xk[:, 1], color=c, s=datapoint_size_min * datapoint_scaling * 0.25)
                            #plt.plot(Xk[:, 0], Xk[:, 1], '.', color=c) #, alpha=0.3)
                        plt.scatter(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], marker='+', color='k', s=datapoint_size_min * datapoint_scaling, alpha=0.1)   # noise points in black
                        # plt.plot(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], 'k+', alpha=0.1)
                        # print(colors_cl)
                        for i in range(0, len(ordering)):           # add country labels to plot
                            if labels[i] == -1:
                                color_cur = 'b'
                                alpha_cur = 0.1
                            else:
                                # print(labels[i])
                                color_cur = colors_cl[labels[i]]
                                alpha_cur = 1.0
                            colors_all.append(color_cur)
                            plt.text(Y[ordering[i], 0], Y[ordering[i], 1], country_labels[ordering[i]], fontsize=dp_sizes[ordering[i]], color=color_cur, alpha=alpha_cur)

                        # Scale visualization (use 2nd lowest and 2nd highest value to drop outliers)
                        # NB: This is a hack to create "zoomed-in" plots for the article
                        # plt.xticks(np.arange(np.sort(Y[:, 0])[1], np.sort(Y[:, 0])[-2], 1))
                        # plt.yticks(np.arange(np.sort(Y[:, 1])[1], np.sort(Y[:, 1])[-2], 1))
                        plt.title('OPTICS (cluster size min=' + str(OPTICS_MIN_CLUSTER_SIZE) + '); ' + method)
                        plt.axis('off')
                        plt.grid(False)
                        plt.tight_layout()
                        plt.savefig(VISU_DIR + 'OPTICS (cluster size min=' + str(OPTICS_MIN_CLUSTER_SIZE) + '); ' + method + ', datapoint scaling=' + str(datapoint_scaling) + '.pdf', format='pdf', dpi=300)
                        plt.close()

        ### Perform AP clustering
        if CLUSTERING_AP:
            # Create clustering
            af = AffinityPropagation().fit(X)       # optionally with additional parameters, e.g. AffinityPropagation(convergence_iter=50, max_iter=1000).fit(X)
            cluster_centers_indices = af.cluster_centers_indices_
            labels = af.labels_
            n_clusters_ = len(cluster_centers_indices)
            print('Estimated number of clusters: %d' % n_clusters_)

            # Visualize clustering results
            plt.close('all')
            plt.figure(figsize=(11.69, 8.27))
            plt.clf()
            colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
            for k, col in zip(range(n_clusters_), colors):
                class_members = labels == k
                cluster_center = X[cluster_centers_indices[k]]
                label = 'Cluster ' + str(k) + ': ' + ', '.join(np.asarray(country_labels)[np.where(class_members)])
                handle, = plt.plot(X[class_members, 0], X[class_members, 1], col + '.', label=label)
                plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
                for x in X[class_members]:
                    plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
            plt.title('Affinity Propagation (%d clusters)' % (n_clusters_))
            plt.legend(fontsize=12)
            plt.axis('off')
            plt.grid(False)
            #plt.tight_layout()
            plt.savefig(VISU_DIR + 'clusters_AP.pdf', format='pdf', dpi=300)
            plt.close()

            # Plot prototype cluster centers
            cols = 6
            rows = int(np.ceil(n_clusters_ / np.float32(cols)))
            gs = gridspec.GridSpec(rows, cols)
            fig = plt.figure(figsize=(11.69, 8.27))
            colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
            for k, col in zip(range(n_clusters_), colors):
                data = X[cluster_centers_indices[k]]
                ax = fig.add_subplot(gs[k])
                handle_tpc, = ax.plot(range(len(data)), data, marker='.', color=col, markersize=1.0, linestyle='None')
                ax.set_title('Cluster %s' % (str(k)))
                # ax.text(.5, .9, 'Prototype for cluster %s' % (str(k)), horizontalalignment='center', transform=ax.transAxes)
                plt.setp(handle_tpc, linewidth=.5)
                # plt.title('Prototype for cluster %s' % (str(k)), fontsize=18)
                # plt.xlabel('Artists', fontsize=14)
                # plt.ylabel(t, fontsize=14)
                # plt.grid(True)
            fig.suptitle('Country Prototypes', fontsize=14)
            gs.tight_layout(fig, rect=[0, 0.03, 1, 0.95])
            fig.savefig(VISU_DIR + 'clusters_prototypes_AP.jpg', format='jpg', dpi=600)
            # fig.savefig(VISU_DIR + 'clusters_prototypes.pdf', format='pdf', dpi=300)
            plt.close()