Commit 13d7659a authored by Alessandro Melchiorre

Continue_crawling

parent b4d93d80
certifi==2020.4.5.1
faster-than-requests==0.9.8
mkl-fft==1.0.15
mkl-random==1.1.0
mkl-service==2.3.0
numpy==1.18.1
pandas==1.0.3
python-dateutil==2.8.1
pytz==2019.3
six==1.14.0
tqdm==4.46.0
@@ -85,12 +85,12 @@ class LastFMCrawler:
         for user in tqdm(users):
             username = user["username"]
-            max_ts = user["max_ts"]
+            # max_ts = user["max_ts"]
             url_vars = basic_url_vars.copy()
             url_vars["user"] = username
-            if "from" not in url_vars:
-                url_vars["from"] = int(max_ts + 1)  # Adding one second
+            # if "from" not in url_vars:
+            #     url_vars["from"] = int(max_ts + 1)  # Adding one second
             usr_url = self.base_url + urllib.parse.urlencode(url_vars)
...
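This hunk drops the per-user resume point (max_ts + 1, one second past the user's last known scrobble) in favor of a fixed window shared by all users. For context, here is a minimal sketch of how the request URL is presumably assembled; the values of base_url and basic_url_vars are assumptions based on the public Last.fm user.getRecentTracks API, not taken from the repository:

import urllib.parse

# Assumed values -- the real ones live inside LastFMCrawler.
base_url = "http://ws.audioscrobbler.com/2.0/?"
basic_url_vars = {
    "method": "user.getrecenttracks",
    "api_key": "YOUR_API_KEY",  # placeholder
    "format": "json",
    "limit": 200,               # Last.fm caps the page size at 200
    "from": 1584705600,         # window start, now set once for every user
    "to": 1601078400,           # window end
}

url_vars = basic_url_vars.copy()
url_vars["user"] = "some_username"  # hypothetical user
usr_url = base_url + urllib.parse.urlencode(url_vars)
print(usr_url)

With "from" always present in basic_url_vars, the commented-out per-user branch would never have fired anyway, so removing it makes the new behavior explicit.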
-API_KEY_1 = 'e300f86cc2d7820b568185ac42563e96'
-API_SECRET_1 = 'ea6cdb065438f166ff7a2f9d561bc62e'
-API_KEY_2 = '982ecb74a29dad923a09d0217c01ff94'
-API_SECRET_2 = '59d4c1c67c3fad02f00456dad5df8f22'
-API_KEY_3 = 'd6a28f74f0f8a8dc7d01c594a60c716c'
-API_SECRET_3 = '5e051193161130c797cd97eefe3170d7'
-API_KEY_4 = '5f89b409e9b3efad06bd212ca56d5aad'
-API_SECRET_4 = '1998115447a6483c4edb8d8ad75d8670'
-API_KEYS = [API_KEY_1, API_KEY_2, API_KEY_3, API_KEY_4]
-API_SECRETS = [API_SECRET_1, API_SECRET_2, API_SECRET_3, API_SECRET_4]
+# Insert the api key and the api secret in the relative list as STRINGS!
+# e.g API_KEYS = ['JKAUF92JFHSJDHSJ']
+API_KEYS = []
+API_SECRETS = []
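The hardcoded credentials are replaced with empty lists that each machine fills locally (the file is presumably conf.py, given the `from conf import ...` below). Since api_key_idx later indexes API_KEYS and API_SECRETS in parallel, a small guard -- hypothetical, not part of the repository -- can catch mismatched or empty lists early:

from conf import API_KEYS, API_SECRETS

# Hypothetical sanity check before any crawling starts.
assert len(API_KEYS) == len(API_SECRETS) > 0, \
    "Fill API_KEYS and API_SECRETS in conf.py with the same number of entries"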
@@ -5,16 +5,17 @@ import pandas as pd
 from LastFMCrawler import LastFMCrawler
 from conf import API_KEYS, API_SECRETS

-lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
-                          names=["uid", "username", "country"])
-lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
-lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
-                          names=['uid', 'username', 'pc'])
+lfm1b = pd.read_csv('./data/user_recrawl.csv', names=['username'], skiprows=1)
+# lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
+#                           names=["uid", "username", "country"])
+# lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
+# lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
+#                           names=['uid', 'username', 'pc'])
 # columns are: uid, username, country, max_ts, pc
-lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
-lfm1b = lfm1b.sort_values(by='pc')
+# lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
+# lfm1b = lfm1b.sort_values(by='pc')

 print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
@@ -23,40 +24,42 @@ print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
 # Don't change this!
 number_of_crawlers = 4

-### troete
-this_crawler = 0
-api_key_idx = 1
+### new_computer
+this_crawler = 3
+api_key_idx = 0

 ### kara
-# this_crawler = 1
+# this_crawler = 0
 # api_key_idx = 2

 ### dragonforce
-# this_crawler = 2
+# this_crawler = 1
 # api_key_idx = 3

 ### passionpit
-# this_crawler = 3
+# this_crawler = 2
 # api_key_idx = 0

-lfm1b = lfm1b.reset_index(drop=True)  # Generates new index
+# lfm1b = lfm1b.reset_index(drop=True)  # Generates new index
 lfm1b = lfm1b[lfm1b.index % number_of_crawlers == this_crawler]

 print(lfm1b.head())
 print("Number of users in the split is: {}".format(len(lfm1b)))

 # timestamp for 20/03/2020 12:00:00 GMT
-to_ts = 1584705600
+from_ts = 1584705600
+# timestamp for 26/09/2020 00:00:00 GMT
+to_ts = 1601078400

 # folder name
-folder_name = "./data/max_ts-{}".format(to_ts)
+folder_name = "./data/{}-{}".format(from_ts, to_ts)
 # failed users (csv file)
-failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
+failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
 # error_file, logs all errors that happened during saving
-error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
+error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
 # log_file, logs all successfully retrieved users
-log_file = './data/max_ts-{}-log_file.csv'.format(to_ts)
+log_file = './data/{}-{}-log_file.csv'.format(from_ts, to_ts)

 if not os.path.isdir(folder_name):
     # Create Folder if not exists
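Each machine keeps every fourth user: index % number_of_crawlers == this_crawler yields four disjoint, interleaved splits that together cover the whole list, one per host ("new_computer", "kara", "dragonforce", "passionpit"). A small illustration with hypothetical usernames:

import pandas as pd

users = pd.DataFrame({'username': ['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7']})
number_of_crawlers = 4

# Same filter as the script, applied for every crawler id.
for c in range(number_of_crawlers):
    split = users[users.index % number_of_crawlers == c]
    print(c, split['username'].tolist())
# 0 ['u0', 'u4']
# 1 ['u1', 'u5']
# 2 ['u2', 'u6']
# 3 ['u3', 'u7']

Since pd.read_csv already produces a fresh RangeIndex, the now-commented reset_index call is redundant here, which is presumably why it was dropped.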
@@ -85,10 +88,10 @@ if not work_on_failures:
     print("Number of users to crawl is: {}".format(len(lfm1b)))

 # api_key_idx is included in crawling_settings
-crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], to_ts=to_ts)
+crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], from_ts=from_ts, to_ts=to_ts)

 if work_on_failures:
-    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file)
+    crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file)
 else:
-    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file,
+    crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file,
                   failed_file=failed_file)
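With max_ts dropped from the selection, each element passed to crawler.crawl is now a plain {'username': ...} dict; the crawl window travels through the constructor's from_ts/to_ts instead. A quick sketch of the record shape (hypothetical usernames):

import pandas as pd

lfm1b = pd.DataFrame({'username': ['alice', 'bob']})
records = lfm1b[["username"]].to_dict("records")
print(records)  # [{'username': 'alice'}, {'username': 'bob'}]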