Commit 13d7659a authored by Alessandro Melchiorre

Continue_crawling

parent b4d93d80
Four source diffs could not be displayed because they are too large; the blobs can be viewed instead.
certifi==2020.4.5.1
faster-than-requests==0.9.8
mkl-fft==1.0.15
mkl-random==1.1.0
mkl-service==2.3.0
numpy==1.18.1
pandas==1.0.3
python-dateutil==2.8.1
pytz==2019.3
six==1.14.0
tqdm==4.46.0
@@ -85,12 +85,12 @@ class LastFMCrawler:
         for user in tqdm(users):
             username = user["username"]
-            max_ts = user["max_ts"]
+            # max_ts = user["max_ts"]
             url_vars = basic_url_vars.copy()
             url_vars["user"] = username
-            if "from" not in url_vars:
-                url_vars["from"] = int(max_ts + 1)  # Adding one second
+            # if "from" not in url_vars:
+            #     url_vars["from"] = int(max_ts + 1)  # Adding one second
             usr_url = self.base_url + urllib.parse.urlencode(url_vars)
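For context, a minimal sketch of how such a per-user request URL is assembled. The endpoint, parameter names, and values below are assumptions (Last.fm's user.getRecentTracks API); the crawler's real self.base_url and basic_url_vars are defined outside this hunk:

import urllib.parse

# Assumed stand-ins; the real base_url / basic_url_vars live elsewhere in LastFMCrawler.
base_url = "http://ws.audioscrobbler.com/2.0/?"
basic_url_vars = {
    "method": "user.getrecenttracks",
    "api_key": "YOUR_API_KEY",   # placeholder, taken from conf.py in the real crawler
    "format": "json",
    "limit": 200,
    "from": 1584705600,          # crawl window start, now fixed instead of per-user max_ts + 1
    "to": 1601078400,            # crawl window end
}
url_vars = basic_url_vars.copy()
url_vars["user"] = "some_username"
usr_url = base_url + urllib.parse.urlencode(url_vars)
print(usr_url)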
-API_KEY_1 = 'e300f86cc2d7820b568185ac42563e96'
-API_SECRET_1 = 'ea6cdb065438f166ff7a2f9d561bc62e'
-API_KEY_2 = '982ecb74a29dad923a09d0217c01ff94'
-API_SECRET_2 = '59d4c1c67c3fad02f00456dad5df8f22'
-API_KEY_3 = 'd6a28f74f0f8a8dc7d01c594a60c716c'
-API_SECRET_3 = '5e051193161130c797cd97eefe3170d7'
-API_KEY_4 = '5f89b409e9b3efad06bd212ca56d5aad'
-API_SECRET_4 = '1998115447a6483c4edb8d8ad75d8670'
-API_KEYS = [API_KEY_1, API_KEY_2, API_KEY_3, API_KEY_4]
-API_SECRETS = [API_SECRET_1, API_SECRET_2, API_SECRET_3, API_SECRET_4]
+# Insert the api key and the api secret in the relative list as STRINGS!
+# e.g API_KEYS = ['JKAUF92JFHSJDHSJ']
+API_KEYS = []
+API_SECRETS = []
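With this change conf.py no longer ships credentials; each crawling machine fills the two lists locally, for example (placeholder strings, not real keys):

# conf.py on a crawling machine; placeholder values only
API_KEYS = ['JKAUF92JFHSJDHSJ', 'ANOTHERPLACEHOLDERKEY']
API_SECRETS = ['PLACEHOLDERSECRET1', 'PLACEHOLDERSECRET2']
assert len(API_KEYS) == len(API_SECRETS)  # one secret per key, both indexed by api_key_idx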
@@ -5,16 +5,17 @@ import pandas as pd
 from LastFMCrawler import LastFMCrawler
 from conf import API_KEYS, API_SECRETS
 
-lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
-                          names=["uid", "username", "country"])
-lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
-lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
-                          names=['uid', 'username', 'pc'])
+lfm1b = pd.read_csv('./data/user_recrawl.csv', names=['username'], skiprows=1)
+# lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
+#                           names=["uid", "username", "country"])
+# lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
+# lfm1b_us_pc = pd.read_csv('./data/LFM-1b_users_playcounts_filtered.txt', delimiter='\t', skiprows=[0],
+#                           names=['uid', 'username', 'pc'])
 # columns are: uid, username, country, max_ts, pc
-lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
-lfm1b = lfm1b.sort_values(by='pc')
+# lfm1b = pd.merge(lfm1b_users, lfm1b_ts).merge(lfm1b_us_pc)
+# lfm1b = lfm1b.sort_values(by='pc')
 print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
@@ -23,40 +24,42 @@ print("Number of users in LFM-1b is: {}".format(len(lfm1b)))
 # Don't change this!
 number_of_crawlers = 4
 
-### troete
-this_crawler = 0
-api_key_idx = 1
+### new_computer
+this_crawler = 3
+api_key_idx = 0
 
 ### kara
-# this_crawler = 1
+# this_crawler = 0
 # api_key_idx = 2
 
 ### dragonforce
-# this_crawler = 2
+# this_crawler = 1
 # api_key_idx = 3
 
 ### passionpit
-# this_crawler = 3
+# this_crawler = 2
 # api_key_idx = 0
 
-lfm1b = lfm1b.reset_index(drop=True)  # Generates new index
+# lfm1b = lfm1b.reset_index(drop=True)  # Generates new index
 lfm1b = lfm1b[lfm1b.index % number_of_crawlers == this_crawler]
 print(lfm1b.head())
 print("Number of users in the split is: {}".format(len(lfm1b)))
 
 # timestamp for 20/03/2020 12:00:00 GMT
-to_ts = 1584705600
+from_ts = 1584705600
+# timestamp for 26/09/2020 12:00:00 GMT
+to_ts = 1601078400
 
 # folder name
-folder_name = "./data/max_ts-{}".format(to_ts)
+folder_name = "./data/{}-{}".format(from_ts, to_ts)
 # failed users (csv file)
-failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
+failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
 # error_file, logs all errors happened during the saving
-error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
+error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
 # log_file, logs all successfully retrieved users
-log_file = './data/max_ts-{}-log_file.csv'.format(to_ts)
+log_file = './data/{}-{}-log_file.csv'.format(from_ts, to_ts)
 
 if not os.path.isdir(folder_name):
     # Create Folder if not exists
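A quick, standalone check of the two window boundaries used above (assuming GMT/UTC throughout):

from datetime import datetime, timezone

from_dt = datetime(2020, 3, 20, 12, 0, 0, tzinfo=timezone.utc)
print(int(from_dt.timestamp()))               # 1584705600 -> from_ts

to_dt = datetime.fromtimestamp(1601078400, tz=timezone.utc)
print(to_dt.isoformat())                      # 2020-09-26T00:00:00+00:00 -> to_ts

Note that 1601078400 is midnight GMT on 26/09/2020; noon on that day would be 1601121600.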
@@ -85,10 +88,10 @@ if not work_on_failures:
     print("Number of users to crawl is: {}".format(len(lfm1b)))
 
 # api_key_idx is included in crawling_settings
-crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], to_ts=to_ts)
+crawler = LastFMCrawler(api_key=API_KEYS[api_key_idx], api_secret=API_SECRETS[api_key_idx], from_ts=from_ts, to_ts=to_ts)
 
 if work_on_failures:
-    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file)
+    crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file)
 else:
-    crawler.crawl(lfm1b[["username", "max_ts"]].to_dict("records"), folder_name, error_file, log_file,
+    crawler.crawl(lfm1b[["username"]].to_dict("records"), folder_name, error_file, log_file,
                   failed_file=failed_file)
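crawl() now receives only usernames; to_dict("records") turns the single-column frame into a list of per-user dicts, e.g.:

import pandas as pd

# Hypothetical usernames, just to show the record format passed to crawl()
users = pd.DataFrame({"username": ["alice", "bob"]})
print(users[["username"]].to_dict("records"))
# [{'username': 'alice'}, {'username': 'bob'}]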