Commit 13d7659a authored by Alessandro Melchiorre

Continue_crawling

parent b4d93d80
certifi==2020.4.5.1
faster-than-requests==0.9.8
mkl-fft==1.0.15
mkl-random==1.1.0
mkl-service==2.3.0
numpy==1.18.1
pandas==1.0.3
python-dateutil==2.8.1
pytz==2019.3
six==1.14.0
tqdm==4.46.0
@@ -85,12 +85,12 @@ class LastFMCrawler:
for user in tqdm(users):
username = user["username"]
max_ts = user["max_ts"]
# max_ts = user["max_ts"]
url_vars = basic_url_vars.copy()
url_vars["user"] = username
if "from" not in url_vars:
url_vars["from"] = int(max_ts + 1) # Adding one second
# if "from" not in url_vars:
# url_vars["from"] = int(max_ts + 1) # Adding one second
usr_url = self.base_url + urllib.parse.urlencode(url_vars)
API_KEY_1 = 'e300f86cc2d7820b568185ac42563e96'
API_SECRET_1 = 'ea6cdb065438f166ff7a2f9d561bc62e'
# Insert the api key and the api secret in the relative list as STRINGS!
# e.g. API_KEYS = ['JKAUF92JFHSJDHSJ']
API_KEY_2 = '982ecb74a29dad923a09d0217c01ff94'
API_SECRET_2 = '59d4c1c67c3fad02f00456dad5df8f22'
API_KEY_3 = 'd6a28f74f0f8a8dc7d01c594a60c716c'
API_SECRET_3 = '5e051193161130c797cd97eefe3170d7'
API_KEY_4 = '5f89b409e9b3efad06bd212ca56d5aad'
API_SECRET_4 = '1998115447a6483c4edb8d8ad75d8670'
API_KEYS = [API_KEY_1, API_KEY_2, API_KEY_3, API_KEY_4]
API_SECRETS = [API_SECRET_1, API_SECRET_2, API_SECRET_3, API_SECRET_4]
API_KEYS = []
API_SECRETS = []
@@ -5,16 +5,17 @@ import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEYS, API_SECRETS
lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
names=["uid", "username", "country"])
lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
lfm1b = pd.read_csv('./data/user_recrawl.csv', names=['username'], skiprows=1)
# lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
# names=["uid", "username", "country"])
# lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
names=['uid', 'username', 'pc'])
# lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
# names=['uid', 'username', 'pc'])
# columns are: uid, username, country, max_ts, pc
lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
lfm1b = lfm1b.sort_values(by='pc')
# lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
# lfm1b = lfm1b.sort_values(by='pc')
print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
@@ -23,40 +24,42 @@ print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
# Don't change this!
number_of_crawlers = 4
### troete
this_crawler = 0
api_key_idx = 1
### new_computer
this_crawler = 3
api_key_idx = 0
### kara
# this_crawler = 1
# this_crawler = 0
# api_key_idx = 2
### dragonforce
# this_crawler = 2
# this_crawler = 1
# api_key_idx = 3
### passionpit
# this_crawler = 3
# this_crawler = 2
# api_key_idx = 0
lfm1b = lfm1b.reset_index(drop=True) # Generates new index
# lfm1b = lfm1b.reset_index(drop=True) # Generates new index
lfm1b = lfm1b[lfm1b.index % number_of_crawlers == this_crawler]
print(lfm1b.head())
print("Number of users in the split is: {}".format(len(lfm1b)))
# timestamp for 20/03/2020 12:00:00 GMT
to_ts = 1584705600
from_ts = 1584705600
# timestamp for 26/09/2020 00:00:00 GMT
to_ts = 1601078400
# folder name
folder_name = "./data/max_ts-{}".format(to_ts)
folder_name = "./data/{}-{}".format(from_ts, to_ts)
# failed users (csv file)
failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
# error_file, logs all errors happened during the saving
error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
# log_file, logs all successfully retrieved users
log_file = './data/max_ts-{}-log_file.csv'.format(to_ts)
log_file = './data/{}-{}-log_file.csv'.format(from_ts, to_ts)
if not os.path.isdir(folder_name):
# Create Folder if not exists
@@ -85,10 +88,10 @@ if not work_on_failures:
print("Number of users to crawl is: {}".format(len(lfm1b)))
# api_key_idx is included in crawling_settings
crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], to_ts=to_ts)
crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], from_ts=from_ts, to_ts=to_ts)
if work_on_failures:
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file)
crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file)
else:
crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file,
crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file,
failed_file=failed_file)