Commit 13d7659a authored by Alessandro Melchiorre

Continue_crawling

parent b4d93d80
certifi==2020.4.5.1
faster-than-requests==0.9.8
mkl-fft==1.0.15
mkl-random==1.1.0
mkl-service==2.3.0
numpy==1.18.1
pandas==1.0.3
python-dateutil==2.8.1
pytz==2019.3
six==1.14.0
tqdm==4.46.0
@@ -85,12 +85,12 @@ class LastFMCrawler:
         for user in tqdm(users):
             username = user["username"]
-            max_ts = user["max_ts"]
+            # max_ts = user["max_ts"]
             url_vars = basic_url_vars.copy()
             url_vars["user"] = username
-            if "from" not in url_vars:
-                url_vars["from"] = int(max_ts + 1)  # Adding one second
+            # if "from" not in url_vars:
+            #     url_vars["from"] = int(max_ts + 1)  # Adding one second
             usr_url = self.base_url + urllib.parse.urlencode(url_vars)
......
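The hunk above stops deriving a per-user `from` timestamp from `max_ts` when building the request URL. Below is a minimal, self-contained sketch of how such a URL is typically assembled with `urllib.parse.urlencode`; the base URL, method name, and parameter values are assumptions for illustration and are not taken from this repository.

```python
# Illustrative sketch only: the real self.base_url and basic_url_vars used by
# LastFMCrawler are not shown in this diff.
import urllib.parse

base_url = "https://ws.audioscrobbler.com/2.0/?"  # assumed Last.fm API root
basic_url_vars = {
    "method": "user.getrecenttracks",  # assumed endpoint for listening histories
    "format": "json",
    "limit": 200,
    "api_key": "<API_KEY>",            # placeholder credential
}

url_vars = basic_url_vars.copy()
url_vars["user"] = "some_username"     # hypothetical username
# After this commit the per-user "from" (max_ts + 1) is no longer added here;
# a shared from_ts/to_ts window is configured on the crawler instead.
usr_url = base_url + urllib.parse.urlencode(url_vars)
print(usr_url)
```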
-API_KEY_1 = 'e300f86cc2d7820b568185ac42563e96'
-API_SECRET_1 = 'ea6cdb065438f166ff7a2f9d561bc62e'
 # Insert the api key and the api secret in the relative list as STRINGS!
 # e.g API_KEYS = ['JKAUF92JFHSJDHSJ']
-API_KEY_2 = '982ecb74a29dad923a09d0217c01ff94'
-API_SECRET_2 = '59d4c1c67c3fad02f00456dad5df8f22'
-API_KEY_3 = 'd6a28f74f0f8a8dc7d01c594a60c716c'
-API_SECRET_3 = '5e051193161130c797cd97eefe3170d7'
-API_KEY_4 = '5f89b409e9b3efad06bd212ca56d5aad'
-API_SECRET_4 = '1998115447a6483c4edb8d8ad75d8670'
-API_KEYS = [API_KEY_1, API_KEY_2, API_KEY_3, API_KEY_4]
-API_SECRETS = [API_SECRET_1, API_SECRET_2, API_SECRET_3, API_SECRET_4]
+API_KEYS = []
+API_SECRETS = []
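The block above removes the hard-coded credentials from the config module (imported later as `from conf import API_KEYS, API_SECRETS`) and leaves empty lists to be filled locally. One hedged way to populate them without committing secrets is sketched below; the environment-variable names are hypothetical, not part of the repository.

```python
# Hypothetical local conf.py: read comma-separated keys/secrets from the
# environment instead of hard-coding them. The variable names are assumptions.
import os

API_KEYS = [k for k in os.environ.get("LASTFM_API_KEYS", "").split(",") if k]
API_SECRETS = [s for s in os.environ.get("LASTFM_API_SECRETS", "").split(",") if s]

# The crawling script indexes both lists with the same api_key_idx,
# so they must stay aligned.
assert len(API_KEYS) == len(API_SECRETS), "each API key needs a matching secret"
```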
@@ -5,16 +5,17 @@ import pandas as pd
 from LastFMCrawler import LastFMCrawler
 from conf import API_KEYS, API_SECRETS
-lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
-                          names=["uid", "username", "country"])
-lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
+lfm1b = pd.read_csv('./data/user_recrawl.csv', names=['username'], skiprows=1)
+# lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
+#                           names=["uid", "username", "country"])
+# lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
-lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
-                          names=['uid', 'username', 'pc'])
+# lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
+#                           names=['uid', 'username', 'pc'])
 # columns are: uid, username, country, max_ts, pc
-lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
-lfm1b = lfm1b.sort_values(by='pc')
+# lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
+# lfm1b = lfm1b.sort_values(by='pc')
 print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
@@ -23,40 +24,42 @@ print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
 # Don't change this!
 number_of_crawlers = 4
-### troete
-this_crawler = 0
-api_key_idx = 1
+### new_computer
+this_crawler = 3
+api_key_idx = 0
 ### kara
-# this_crawler = 1
+# this_crawler = 0
 # api_key_idx = 2
 ### dragonforce
-# this_crawler = 2
+# this_crawler = 1
 # api_key_idx = 3
 ### passionpit
-# this_crawler = 3
+# this_crawler = 2
 # api_key_idx = 0
-lfm1b = lfm1b.reset_index(drop=True)  # Generates new index
+# lfm1b = lfm1b.reset_index(drop=True)  # Generates new index
 lfm1b = lfm1b[lfm1b.index % number_of_crawlers == this_crawler]
 print(lfm1b.head())
 print("Number of users in the split is: {}".format(len(lfm1b)))
 # timestamp for 20/03/2020 12:00:00 GMT
-to_ts = 1584705600
+from_ts = 1584705600
+# timestamp for 26/09/2020 00:00:00 GMT
+to_ts = 1601078400
 # folder name
-folder_name = "./data/max_ts-{}".format(to_ts)
+folder_name = "./data/{}-{}".format(from_ts, to_ts)
 # failed users (csv file)
-failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
+failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
 # error_file, logs all errors happened during the saving
-error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
+error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
 # log_file, logs all successfully retrieved users
-log_file = './data/max_ts-{}-log_file.csv'.format(to_ts)
+log_file = './data/{}-{}-log_file.csv'.format(from_ts, to_ts)
 if not os.path.isdir(folder_name):
     # Create Folder if not exists
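The crawling window and the per-machine split introduced above can be checked with a short standalone snippet: 1584705600 is 20/03/2020 12:00:00 GMT and 1601078400 is 26/09/2020 00:00:00 GMT, and each crawler keeps the rows whose positional index matches its id modulo the number of crawlers. The usernames below are made up.

```python
from datetime import datetime, timezone

import pandas as pd

# Crawling window used by the script.
from_ts = int(datetime(2020, 3, 20, 12, 0, tzinfo=timezone.utc).timestamp())
to_ts = int(datetime(2020, 9, 26, 0, 0, tzinfo=timezone.utc).timestamp())
print(from_ts, to_ts)  # 1584705600 1601078400

# Work split: each machine keeps every number_of_crawlers-th row,
# offset by its own crawler id.
number_of_crawlers = 4
this_crawler = 3  # the "new_computer" split selected in this commit
lfm1b = pd.DataFrame({"username": ["user_{}".format(i) for i in range(10)]})
split = lfm1b[lfm1b.index % number_of_crawlers == this_crawler]
print(split)  # rows 3 and 7 for this_crawler == 3
```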
@@ -85,10 +88,10 @@ if not work_on_failures:
 print("Number of users to crawl is: {}".format(len(lfm1b)))
 # api_key_idx is included in crawling_settings
-crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], to_ts=to_ts)
+crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], from_ts=from_ts, to_ts=to_ts)
 if work_on_failures:
-    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file)
+    crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file)
 else:
-    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file,
+    crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file,
                   failed_file=failed_file)
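The last hunk passes only usernames to `crawl`, matching the crawler change that no longer reads `user["max_ts"]`. A small sketch of the payload shape before and after this commit, with invented values:

```python
import pandas as pd

# Invented users; before this commit each record also carried a max_ts resume point.
lfm1b = pd.DataFrame({"username": ["user_a", "user_b"],
                      "max_ts": [1584000000, 1584100000]})

old_records = lfm1b[["username", "max_ts"]].to_dict("records")
# [{'username': 'user_a', 'max_ts': 1584000000}, ...]

new_records = lfm1b[["username"]].to_dict("records")
# [{'username': 'user_a'}, {'username': 'user_b'}] -- the shared from_ts/to_ts
# window now replaces the per-user max_ts.
print(old_records)
print(new_records)
```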