Commit 045783ff authored by Alessandro Melchiorre
Browse files

- max-ts file showing the maximum timestamp for each user

- code modified accordingly
parent f033a127
This diff is collapsed.
...@@ -80,10 +80,15 @@ class LastFMCrawler: ...@@ -80,10 +80,15 @@ class LastFMCrawler:
if self.to_ts: if self.to_ts:
basic_url_vars["to"] = self.to_ts basic_url_vars["to"] = self.to_ts
for username in tqdm(users): for _, user in tqdm(users.iterrows()):
username = user["username"]
max_ts = user["max_ts"]
url_vars = basic_url_vars.copy() url_vars = basic_url_vars.copy()
url_vars["user"] = username url_vars["user"] = username
if "from" not in url_vars:
url_vars["from"] = int(max_ts + 1) # Adding one second
usr_url = self.base_url + urllib.parse.urlencode(url_vars) usr_url = self.base_url + urllib.parse.urlencode(url_vars)
usr_data = [] usr_data = []
......
...@@ -2,27 +2,27 @@ import csv ...@@ -2,27 +2,27 @@ import csv
import os import os
import pandas as pd import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEY_1, API_SECRET_1
lfm1b = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[1, 3], from src.LastFMCrawler import LastFMCrawler
names=["username", "country"]) from src.conf import API_KEY_1, API_SECRET_1
# Only users with country information lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
lfm1b = lfm1b[~lfm1b["country"].isna()] names=["uid", "username", "country"])
lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
# columns are: uid, username, country, max_ts
lfm1b = pd.merge(lfm1b_users, lfm1b_ts)
print(lfm1b.head()) print(lfm1b.head())
# timestamp for 01/01/2020 00:00:00 GMT (Greenwich Mean Time)
from_ts = 1577836800
# timestamp for 20/03/2020 12:00:00 GMT # timestamp for 20/03/2020 12:00:00 GMT
to_ts = 1584705600 to_ts = 1584705600
# folder name # folder name
folder_name = "./data/{}-{}".format(from_ts, to_ts) folder_name = "./data/max_ts-{}".format(to_ts)
# failed users (csv file) # failed users (csv file)
failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts) failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
# error_file, logs all errors happened during the saving # error_file, logs all errors happened during the saving
error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts) error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
if not os.path.isdir(folder_name): if not os.path.isdir(folder_name):
# Create Folder if not exists # Create Folder if not exists
...@@ -47,10 +47,9 @@ if not work_on_failures: ...@@ -47,10 +47,9 @@ if not work_on_failures:
print("Number of users to crawl is: {}".format(len(lfm1b))) print("Number of users to crawl is: {}".format(len(lfm1b)))
crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, from_ts=from_ts, to_ts=to_ts) crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, to_ts=to_ts)
print(failed_file)
if work_on_failures: if work_on_failures:
crawler.crawl(lfm1b["username"], folder_name, error_file) crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file)
else: else:
crawler.crawl(lfm1b["username"], folder_name, error_file, failed_file=failed_file) crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file, failed_file=failed_file)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment