- max-ts file showing the maximum timestamp for each user

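- code modified accordingly: when no explicit "from" timestamp is set, each user's crawl starts one second after that user's max_ts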
parent f033a127
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -80,10 +80,15 @@ class LastFMCrawler:
if self.to_ts:
basic_url_vars["to"] = self.to_ts
for username in tqdm(users):
for _, user in tqdm(users.iterrows()):
username = user["username"]
max_ts = user["max_ts"]
url_vars = basic_url_vars.copy()
url_vars["user"] = username
if "from" not in url_vars:
url_vars["from"] = int(max_ts + 1) # Adding one second
usr_url = self.base_url + urllib.parse.urlencode(url_vars)
usr_data = []
......@@ -2,27 +2,27 @@ import csv
import os
import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEY_1, API_SECRET_1
lfm1b = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[1, 3],
names=["username", "country"])
from src.LastFMCrawler import LastFMCrawler
from src.conf import API_KEY_1, API_SECRET_1
# Only users with country information
lfm1b = lfm1b[~lfm1b["country"].isna()]
lfm1b_users = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[0, 1, 3],
names=["uid", "username", "country"])
lfm1b_ts = pd.read_csv("./data/LFM-1b_UID_MAX_TS.txt", delimiter="\t", header=None, names=["uid", "max_ts"])
# columns are: uid, username, country, max_ts
lfm1b = pd.merge(lfm1b_users, lfm1b_ts)
# timestamp for 01/01/2020 00:00:00 GMT (Greenwich Mean Time)
from_ts = 1577836800
# timestamp for 20/03/2020 12:00:00 GMT
to_ts = 1584705600
# folder name
folder_name = "./data/{}-{}".format(from_ts, to_ts)
folder_name = "./data/max_ts-{}".format(to_ts)
# failed users (csv file)
failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
failed_file = "./data/max_ts-{}-failed_users.csv".format(to_ts)
# error_file: logs all errors that occurred while saving
error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
error_file = "./data/max_ts-{}-error_file.csv".format(to_ts)
if not os.path.isdir(folder_name):
# Create the folder if it does not exist
......@@ -47,10 +47,9 @@ if not work_on_failures:
print("Number of users to crawl is: {}".format(len(lfm1b)))
crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, from_ts=from_ts, to_ts=to_ts)
crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, to_ts=to_ts)
if work_on_failures:
crawler.crawl(lfm1b["username"], folder_name, error_file)
crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file)
crawler.crawl(lfm1b["username"], folder_name, error_file, failed_file=failed_file)
crawler.crawl(lfm1b[["username", "max_ts"]], folder_name, error_file, failed_file=failed_file)
