- better way to write to the output

- split the users across crawlers by uid modulo the number of crawlers
parent 045783ff
......@@ -80,7 +80,7 @@ class LastFMCrawler:
if self.to_ts:
basic_url_vars["to"] = self.to_ts
for _, user in tqdm(users.iterrows()):
for user in tqdm(users):
username = user["username"]
max_ts = user["max_ts"]
......@@ -2,9 +2,8 @@ import csv
import os
import pandas as pd
from src.LastFMCrawler import LastFMCrawler
from src.conf import API_KEY_1, API_SECRET_1
from LastFMCrawler import LastFMCrawler
from conf import API_KEY_2, API_SECRET_2
lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
names=["uid", "username", "country"])
......@@ -14,6 +13,8 @@ lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=No
lfm1b = pd.merge(lfm1b_users, lfm1b_ts)
print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
# timestamp for 20/03/2020 12:00:00 GMT
to_ts = 1584705600
......@@ -30,6 +31,7 @@ if not os.path.isdir(folder_name):
# Do not crawl already crawled users
crawled = [x.replace(".json", "") for x in os.listdir(folder_name)]
print("Number of users already crawled is: {}".format(len(crawled)))
lfm1b = lfm1b[~lfm1b.username.isin(crawled)]
# If True, tries to re-fetch users for which previous attempts had failed.
......@@ -42,14 +44,18 @@ if not work_on_failures:
with open(failed_file, "r") as inf:
reader = csv.reader(inf)
failed_users = [x[0] for x in reader]
print("Number of users failed is: {}".format(len(failed_users)))
lfm1b = lfm1b[~lfm1b.username.isin(failed_users)]
# number_of_crawlers and this_crawler select the subset of users handled by the current execution (uid modulo number_of_crawlers)
number_of_crawlers = 3
this_crawler = 2
lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
print("Number of users to crawl is: {}".format(len(lfm1b)))
crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, to_ts=to_ts)
crawler = LastFMCrawler(api_key=API_KEY_2, api_secret=API_SECRET_2, to_ts=to_ts)
if work_on_failures:
crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file)
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file)
crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file, failed_file=failed_file)
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, failed_file=failed_file)
