Commit 6d2df439 authored by Alessandro Melchiorre's avatar Alessandro Melchiorre

# Crawling settings are in main file now

# It doesn't log empty files
# Crawled users are logged in (timerange)-log_file.csv
parent 72c1a4cf
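The per-run log file introduced in this commit is a plain CSV with one row per successfully crawled user: the username and the number of fetched listening events (tot_les). A minimal sketch of reading it back, assuming that row format; read_crawled_users is a hypothetical helper, not part of the crawler:

```python
import csv

def read_crawled_users(log_file):
    # One "username,tot_les" row per successfully crawled user,
    # matching writer.writerow([username, tot_les]) in the diff below.
    crawled = {}
    with open(log_file, "r") as inf:
        for row in csv.reader(inf):
            if row:  # skip blank lines
                crawled[row[0]] = int(row[1])
    return crawled

# e.g. read_crawled_users("./data/max_ts-1234567890-log_file.csv")
```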
@@ -66,7 +66,7 @@ class LastFMCrawler:
track["artist"].pop("image", None)
return json
def crawl(self, users, json_folder, error_file, failed_file=None):
def crawl(self, users, json_folder, error_file, log_file, failed_file=None):
self.last_call = 0
@@ -131,8 +131,13 @@ class LastFMCrawler:
continue
try:
with open(os.path.join(json_folder, username + ".json"), "w") as outfile:
json.dump(usr_data, outfile)
if tot_les > 0:
# Save it only if there are new events
with open(os.path.join(json_folder, username + ".json"), "w") as outfile:
json.dump(usr_data, outfile)
with open(log_file, 'a+') as out:
writer = csv.writer(out)
writer.writerow([username, tot_les])
except Exception as e:
print("User {} not saved!".format(username))
this_crawler = 1
api_key_idx = 0
@@ -4,7 +4,6 @@ import os
import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEYS, API_SECRETS
from crawling_settings import api_key_idx
lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
names=["uid", "username", "country"])
@@ -29,15 +28,19 @@ folder_name = "./data/max_ts-{}".format(to_ts)
failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
# error_file, logs all errors that happened during saving
error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
# log_file, logs all successfully retrieved users
log_file = './data/max_ts-{}-log_file.csv'.format(to_ts)
if not os.path.isdir(folder_name):
# Create the folder if it does not exist
os.mkdir(folder_name)
else:
# Do not crawl already crawled users
crawled = [x.replace(".json", "") for x in os.listdir(folder_name)]
print("Number of users already crawled is: {}".format(len(crawled)))
lfm1b = lfm1b[~lfm1b.username.isin(crawled)]
if os.path.isfile(log_file):
with open(log_file, "r") as inf:
reader = csv.reader(inf)
log_users = [x[0].replace('.json', '') for x in reader]
print("Number of users already crawled is: {}".format(len(log_users)))
lfm1b = lfm1b[~lfm1b.username.isin(log_users)]
# If True, tries to re-fetch users for which previous attempts had failed.
# If False, it sets a log file in which it stores the usernames to skip
@@ -54,15 +57,30 @@ if not work_on_failures:
# number_of_crawlers and this_crawler are used to select only a subset of users for the current execution (modulo)
# this_crawler is included in crawling_settings.py
# number_of_crawlers = 3
# Don't change this!
number_of_crawlers = 3
### troete
this_crawler = 0
api_key_idx = 1
### kara
# this_crawler = 1
# api_key_idx = 2
### dragonforce
# this_crawler = 2
# api_key_idx = 3
#lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
lfm1b = lfm1b.reset_index(drop=True) # Generates new index
lfm1b = lfm1b[lfm1b.index % number_of_crawlers == this_crawler]
print("Number of users to crawl is: {}".format(len(lfm1b)))
# api_key_idx is included in crawling_settings
crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], to_ts=to_ts)
if work_on_failures:
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file)
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file)
else:
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, failed_file=failed_file)
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file,
failed_file=failed_file)