Commit 72c1a4cf authored by Alessandro Melchiorre
Browse files

# Sorted data

# No multiple machines
parent 471c635c
This diff is collapsed.
...@@ -4,14 +4,18 @@ import os ...@@ -4,14 +4,18 @@ import os
import pandas as pd import pandas as pd
from LastFMCrawler import LastFMCrawler from LastFMCrawler import LastFMCrawler
from conf import API_KEYS, API_SECRETS from conf import API_KEYS, API_SECRETS
from crawling_settings import api_key_idx, this_crawler from crawling_settings import api_key_idx
lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3], lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
names=["uid", "username", "country"]) names=["uid", "username", "country"])
lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"]) lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
# columns are: uid, username, country, max_ts lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
lfm1b = pd.merge(lfm1b_users, lfm1b_ts) names=['uid', 'username', 'pc'])
# columns are: uid, username, country, max_ts, pc
lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
lfm1b = lfm1b.sort_values(by='pc')
print(lfm1b.head()) print(lfm1b.head())
print("Number of users in LFM-1b is: {}".format(len(lfm1b))) print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
...@@ -50,9 +54,9 @@ if not work_on_failures: ...@@ -50,9 +54,9 @@ if not work_on_failures:
# number_of_crawlers and this crawler are used to find only subset of users for the current execution (modulo) # number_of_crawlers and this crawler are used to find only subset of users for the current execution (modulo)
# this_crawler is included in crawling_settings.py # this_crawler is included in crawling_settings.py
number_of_crawlers = 3 # number_of_crawlers = 3
lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler] #lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
print("Number of users to crawl is: {}".format(len(lfm1b))) print("Number of users to crawl is: {}".format(len(lfm1b)))
# api_key_idx is included in crawling_settings # api_key_idx is included in crawling_settings
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment