Commit e7a91b70 authored by Alessandro Melchiorre

- better way to write to the output

- split on modulo
parent 045783ff
@@ -80,7 +80,7 @@ class LastFMCrawler:
         if self.to_ts:
             basic_url_vars["to"] = self.to_ts

-        for _, user in tqdm(users.iterrows()):
+        for user in tqdm(users):
             username = user["username"]
             max_ts = user["max_ts"]
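The loop change above pairs with the .to_dict("records") calls in the script below: instead of DataFrame.iterrows(), which yields (index, Series) pairs, the crawler now iterates a plain list of dicts. A minimal sketch of the two styles, using a hypothetical users frame in place of the crawler's real argument:

import pandas as pd
from tqdm import tqdm

# Hypothetical stand-in for the crawler's `users` argument.
users_df = pd.DataFrame({"username": ["alice", "bob"], "max_ts": [1500000000, 1510000000]})

# Old style: iterrows() yields (index, Series) pairs; per-row Series access is slow.
for _, user in tqdm(users_df.iterrows()):
    print(user["username"], user["max_ts"])

# New style: convert once to plain dicts; the loop body stays identical.
for user in tqdm(users_df.to_dict("records")):
    print(user["username"], user["max_ts"])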
@@ -2,9 +2,8 @@ import csv
 import os
 import pandas as pd

-from src.LastFMCrawler import LastFMCrawler
-from src.conf import API_KEY_1, API_SECRET_1
+from LastFMCrawler import LastFMCrawler
+from conf import API_KEY_2, API_SECRET_2

 lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
                           names=["uid", "username", "country"])
@@ -14,6 +13,8 @@ lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=No
 lfm1b = pd.merge(lfm1b_users, lfm1b_ts)
 print(lfm1b.head())
+print("Number of users in LFM-1b is: {}".format(len(lfm1b)))

 # timestamp for 20/03/2020 12:00:00 GMT
 to_ts = 1584705600
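As a side note (not part of the commit), the hard-coded cutoff can be reproduced with the standard library, which confirms the comment:

from datetime import datetime, timezone

# 20/03/2020 12:00:00 GMT as a Unix timestamp.
print(int(datetime(2020, 3, 20, 12, tzinfo=timezone.utc).timestamp()))  # 1584705600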
@@ -30,6 +31,7 @@ if not os.path.isdir(folder_name):
 else:
     # Do not crawl already crawled users
     crawled = [x.replace(".json", "") for x in os.listdir(folder_name)]
+    print("Number of users already crawled is: {}".format(len(crawled)))
     lfm1b = lfm1b[~lfm1b.username.isin(crawled)]

 # If True, tries to re-fetch users for which previous attempts had failed.
@@ -42,14 +44,18 @@ if not work_on_failures:
     with open(failed_file, "r") as inf:
         reader = csv.reader(inf)
         failed_users = [x[0] for x in reader]
+        print("Number of users failed is: {}".format(len(failed_users)))
     lfm1b = lfm1b[~lfm1b.username.isin(failed_users)]

+# number_of_crawlers and this_crawler select only a subset of users for the current execution (modulo)
+number_of_crawlers = 3
+this_crawler = 2
+lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
+
 print("Number of users to crawl is: {}".format(len(lfm1b)))

-crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, to_ts=to_ts)
+crawler = LastFMCrawler(api_key=API_KEY_2, api_secret=API_SECRET_2, to_ts=to_ts)

 if work_on_failures:
-    crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file)
+    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file)
 else:
-    crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file, failed_file=failed_file)
+    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, failed_file=failed_file)
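The new modulo split shards the user base across independent crawler processes: each instance runs with the same number_of_crawlers but its own this_crawler index in [0, number_of_crawlers), so the subsets are disjoint and together cover every uid. A minimal sketch with hypothetical uids:

import pandas as pd

users = pd.DataFrame({"uid": range(10)})
number_of_crawlers = 3

# Each crawler instance keeps only the uids congruent to its index modulo 3.
for this_crawler in range(number_of_crawlers):
    shard = users[users.uid % number_of_crawlers == this_crawler]
    print(this_crawler, shard.uid.tolist())
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7]
# 2 [2, 5, 8]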