Commit 116dd88a authored by Alessandro Melchiorre

- init

LastFMCrawler.py

import csv
import json
import os
import time
import urllib.parse

import faster_than_requests as requests  # third-party HTTP client used in place of "requests"
from tqdm import tqdm
class LastFMCrawler:

    def __init__(self, api_key, api_secret, from_ts=None, to_ts=None):
        self.base_url = "https://ws.audioscrobbler.com/2.0/?"
        self.api_key = api_key
        self.api_secret = api_secret
        self.from_ts = from_ts
        self.to_ts = to_ts
        self.last_call = 0
    def _connect_url(self, url):
        # Connects to the url given as argument. If it still fails after three retries,
        # the exception is re-raised.
        n_trials = 1

        # Throttle: keep at least ~0.3 seconds between consecutive API calls.
        time_passed = time.time() - self.last_call
        if time_passed <= 0.3:
            time.sleep(0.3 - time_passed)

        while True:
            try:
                rs = requests.get(url)
                self.last_call = time.time()
                status = rs["status"]
                if status != "200 OK":
                    raise Exception(status)
                break
            except Exception as e:
                if n_trials > 3:
                    print("Number of trials exceeded!")
                    raise e
                print("Failed attempt. 0.8 second sleep")
                time.sleep(0.8)
                n_trials += 1

        return json.loads(rs["body"])
    def _slimmer(self, recenttracks):
        # Gets rid of the useless image URLs to keep the dumps small.
        tracks = recenttracks["track"]
        for track in tracks:
            track.pop("image", None)
            if "artist" in track:
                track["artist"].pop("image", None)
        return recenttracks
    def crawl(self, users, json_folder, error_file, failed_file=None):
        self.last_call = 0

        # Preparing the basic query arguments shared by every request
        basic_url_vars = {
            "method": "user.getrecenttracks",
            "limit": "200",
            "format": "json",
            "extended": "1",
            "api_key": self.api_key,
        }
        if self.from_ts:
            basic_url_vars["from"] = self.from_ts
        if self.to_ts:
            basic_url_vars["to"] = self.to_ts

        for username in tqdm(users):
            url_vars = basic_url_vars.copy()
            url_vars["user"] = username
            usr_url = self.base_url + urllib.parse.urlencode(url_vars)

            usr_data = []
            try:
                rs = self._connect_url(usr_url)
                info = rs['recenttracks']['@attr']
                tot_pages = int(info['totalPages'])
                tot_les = int(info['total'])

                if tot_les > 0:
                    usr_data.append(self._slimmer(rs['recenttracks']))

                    # Fetch other pages, if any
                    for page in tqdm(range(2, tot_pages + 1)):
                        page_vars = url_vars.copy()
                        page_vars["page"] = str(page)
                        page_url = self.base_url + urllib.parse.urlencode(page_vars)

                        rs = self._connect_url(page_url)
                        usr_data.append(self._slimmer(rs['recenttracks']))

            except Exception as e:
                print('Failure while working on user {}'.format(username))
                print(str(e))
                # Log the failed username, if a failures file was provided
                if failed_file:
                    with open(failed_file, "a+") as out:
                        writer = csv.writer(out)
                        writer.writerow([username, str(e)])
                continue

            try:
                with open(os.path.join(json_folder, username + ".json"), "w") as outfile:
                    json.dump(usr_data, outfile)
            except Exception as e:
                print("User {} not saved!".format(username))
                with open(error_file, "a+") as out:
                    writer = csv.writer(out)
                    writer.writerow([username, len(usr_data), str(e)])
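For reference, a minimal sketch of the response shape that crawl() and _slimmer() expect from user.getrecenttracks. Only the fields the class above actually touches are shown; the values are placeholders, and real Last.fm responses carry many more fields.

from LastFMCrawler import LastFMCrawler

# Stub page in the shape the crawler parses (illustrative values only).
sample_page = {
    "recenttracks": {
        "@attr": {"page": "1", "perPage": "200", "totalPages": "1", "total": "1"},
        "track": [{
            "name": "Some Track",                           # placeholder
            "image": [{"size": "small", "#text": ""}],      # removed by _slimmer()
            "artist": {
                "name": "Some Artist",                      # placeholder
                "image": [{"size": "small", "#text": ""}],  # removed by _slimmer()
            },
        }],
    }
}

crawler = LastFMCrawler(api_key="<api key>", api_secret="<api secret>")
slim = crawler._slimmer(sample_page["recenttracks"])
assert "image" not in slim["track"][0]
assert "image" not in slim["track"][0]["artist"]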
conf.py

API_KEY_1 = 'e300f86cc2d7820b568185ac42563e96'
API_SECRET_1 = 'ea6cdb065438f166ff7a2f9d561bc62e'
API_KEY_2 = '982ecb74a29dad923a09d0217c01ff94'
API_SECRET_2 = '59d4c1c67c3fad02f00456dad5df8f22'
API_KEY_3 = 'd6a28f74f0f8a8dc7d01c594a60c716c'
API_SECRET_3 = '5e051193161130c797cd97eefe3170d7'
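Three key/secret pairs are defined, but the driver script below uses only the first. A minimal sketch, purely as an assumption about how the remaining pairs could be used, splitting the user list so each credential drives its own crawler; nothing in this commit does this.

from LastFMCrawler import LastFMCrawler
from conf import (API_KEY_1, API_SECRET_1, API_KEY_2, API_SECRET_2,
                  API_KEY_3, API_SECRET_3)

# Hypothetical: one crawler per credential, each handling an interleaved slice of the users.
credentials = [(API_KEY_1, API_SECRET_1), (API_KEY_2, API_SECRET_2), (API_KEY_3, API_SECRET_3)]

def shard(usernames, n_shards):
    # Round-robin split of a username list into n_shards roughly equal chunks.
    return [usernames[i::n_shards] for i in range(n_shards)]

crawlers = [LastFMCrawler(api_key=k, api_secret=s) for k, s in credentials]

The driver script below sticks to API_KEY_1 and API_SECRET_1 only.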
import csv
import os
import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEY_1, API_SECRET_1
lfm1b = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[1, 3],
                    names=["username", "country"])
# Only users with country information
lfm1b = lfm1b[~lfm1b["country"].isna()]
print(lfm1b.head())
# timestamp for 01/01/2020 00:00:00 GMT (Greenwich Mean Time)
from_ts = 1577836800
# timestamp for 20/03/2020 12:00:00 GMT
to_ts = 1584705600
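# These two constants correspond to datetime(2020, 1, 1, tzinfo=timezone.utc) and
# datetime(2020, 3, 20, 12, tzinfo=timezone.utc); int(dt.timestamp()) on those UTC
# datetimes reproduces 1577836800 and 1584705600 if the crawl window ever needs to change.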
# folder name
folder_name = "./data/{}-{}".format(from_ts, to_ts)
# failed users (csv file)
failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
# error_file logs all errors that happened while saving the dumps
error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
if not os.path.isdir(folder_name):
    # Create the folder if it does not exist
    os.mkdir(folder_name)
else:
    # Do not crawl already-crawled users
    crawled = [x.replace(".json", "") for x in os.listdir(folder_name)]
    lfm1b = lfm1b[~lfm1b.username.isin(crawled)]
# If True, tries to re-fetch users for which previous attempts failed.
# If False, skips the users listed in the failed-users log; new failures are appended to it.
work_on_failures = False

if not work_on_failures:
    # Do not crawl users that already failed
    if os.path.isfile(failed_file):
        with open(failed_file, "r") as inf:
            reader = csv.reader(inf)
            failed_users = [x[0] for x in reader]
        lfm1b = lfm1b[~lfm1b.username.isin(failed_users)]
print("Number of users to crawl is: {}".format(len(lfm1b)))
crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, from_ts=from_ts, to_ts=to_ts)
print(failed_file)
if work_on_failures:
    crawler.crawl(lfm1b["username"], folder_name, error_file)
else:
    crawler.crawl(lfm1b["username"], folder_name, error_file, failed_file=failed_file)