Commit 72c1a4cf authored by Alessandro Melchiorre
Browse files

# Sorted data

# No multiple machines
parent 471c635c
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -4,14 +4,18 @@ import os
import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEYS, API_SECRETS
from crawling_settings import api_key_idx, this_crawler
from crawling_settings import api_key_idx
lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
names=["uid", "username", "country"])
lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
# columns are: uid, username, country, max_ts
lfm1b = pd.merge(lfm1b_users, lfm1b_ts)
lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
names=['uid', 'username', 'pc'])
# columns are: uid, username, country, max_ts, pc
lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
lfm1b = lfm1b.sort_values(by='pc')
print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
......@@ -50,9 +54,9 @@ if not work_on_failures:
# number_of_crawlers and this_crawler are used to select only the subset of users for the current execution (uid modulo number_of_crawlers)
# this_crawler is included in crawling_settings
number_of_crawlers = 3
# number_of_crawlers = 3
lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
#lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
print("Number of users to crawl is: {}".format(len(lfm1b)))
# api_key_idx is included in crawling_settings
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment