Commit 116dd88a authored by Alessandro Melchiorre

- init

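# LastFMCrawler.py (module name as imported by the driver script below)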
import csv
import json
import os
import time
import urllib.parse
import faster_than_requests as requests
from tqdm import tqdm

class LastFMCrawler:
    def __init__(self, api_key, api_secret, from_ts=None, to_ts=None):
        self.base_url = "https://ws.audioscrobbler.com/2.0/?"
        self.api_key = api_key
        self.api_secret = api_secret
        self.from_ts = from_ts
        self.to_ts = to_ts
        self.last_call = 0
    def _connect_url(self, url):
        # Connects to the URL given as argument; retries up to three times on failure
        # before re-raising the exception.
        n_trials = 1
        # Rate limiting: keep roughly 0.3 seconds between consecutive API calls.
        time_passed = time.time() - self.last_call
        if time_passed <= 0.3:
            time.sleep(0.3 - time_passed)
        while True:
            try:
                rs = requests.get(url)
                self.last_call = time.time()
                status = rs["status"]
                if status != "200 OK":
                    raise Exception(status)
                break
            except Exception as e:
                if n_trials > 3:
                    print("Number of trials exceeded!")
                    raise e
                print("Failed attempt. Sleeping 0.8 seconds")
                time.sleep(0.8)
                n_trials += 1
        return json.loads(rs["body"])
    def _slimmer(self, page):
        # Gets rid of the image URLs, which are not needed.
        tracks = page["track"]
        for track in tracks:
            track.pop("image", None)
            if "artist" in track:
                track["artist"].pop("image", None)
        return page
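    # Sketch of the response shape this class relies on (inferred from the fields
    # accessed in _slimmer() and crawl(); the actual Last.fm payload may contain
    # additional fields):
    #
    # {
    #     "recenttracks": {
    #         "@attr": {"totalPages": "...", "total": "..."},
    #         "track": [
    #             {"image": [...], "artist": {"image": [...], ...}, ...},
    #             ...
    #         ]
    #     }
    # }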
    def crawl(self, users, json_folder, error_file, failed_file=None):
        self.last_call = 0
        # Prepare the basic query arguments.
        basic_url_vars = {
            "method": "user.getrecenttracks",
            "limit": "200",
            "format": "json",
            "extended": "1",
            "api_key": self.api_key,
        }
        if self.from_ts:
            basic_url_vars["from"] = self.from_ts
        if self.to_ts:
            basic_url_vars["to"] = self.to_ts

        for username in tqdm(users):
            url_vars = basic_url_vars.copy()
            url_vars["user"] = username
            usr_url = self.base_url + urllib.parse.urlencode(url_vars)

            usr_data = []
            try:
                rs = self._connect_url(usr_url)
                info = rs['recenttracks']['@attr']
                tot_pages = int(info['totalPages'])
                tot_les = int(info['total'])  # total number of listening events
                if tot_les > 0:
                    usr_data.append(self._slimmer(rs['recenttracks']))
                    # Fetch the other pages, if any.
                    for page in tqdm(range(2, tot_pages + 1)):
                        page_vars = url_vars.copy()
                        page_vars["page"] = str(page)
                        page_url = self.base_url + urllib.parse.urlencode(page_vars)
                        rs = self._connect_url(page_url)
                        usr_data.append(self._slimmer(rs['recenttracks']))
            except Exception as e:
                print('Failure while working on user {}'.format(username))
                print(str(e))
                # Log the failed username, if a failure file was given.
                if failed_file:
                    with open(failed_file, "a+") as out:
                        writer = csv.writer(out)
                        writer.writerow([username, str(e)])
                continue

            try:
                with open(os.path.join(json_folder, username + ".json"), "w") as outfile:
                    json.dump(usr_data, outfile)
            except Exception as e:
                print("User {} not saved!".format(username))
                with open(error_file, "a+") as out:
                    writer = csv.writer(out)
                    writer.writerow([username, len(usr_data), str(e)])
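# conf.py (module name as imported by the driver script below): Last.fm API credentials.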
API_KEY_1 = 'e300f86cc2d7820b568185ac42563e96'
API_SECRET_1 = 'ea6cdb065438f166ff7a2f9d561bc62e'
API_KEY_2 = '982ecb74a29dad923a09d0217c01ff94'
API_SECRET_2 = '59d4c1c67c3fad02f00456dad5df8f22'
API_KEY_3 = 'd6a28f74f0f8a8dc7d01c594a60c716c'
API_SECRET_3 = '5e051193161130c797cd97eefe3170d7'
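# Driver script: crawls the recent listening events of LFM-1b users in the given time window.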
import csv
import os
import pandas as pd
from LastFMCrawler import LastFMCrawler
from conf import API_KEY_1, API_SECRET_1
lfm1b = pd.read_csv("./data/LFM-1b_users.txt", delimiter="\t", header=None, usecols=[1, 3],
                    names=["username", "country"])
# Only users with country information
lfm1b = lfm1b[~lfm1b["country"].isna()]
print(lfm1b.head())
# timestamp for 01/01/2020 00:00:00 GMT (Greenwich Mean Time)
from_ts = 1577836800
# timestamp for 20/03/2020 12:00:00 GMT
to_ts = 1584705600
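# (Sketch, not in the original script: the two epoch values above can be reproduced
# with the standard library, e.g.
#     from datetime import datetime, timezone
#     int(datetime(2020, 1, 1, tzinfo=timezone.utc).timestamp())       # -> 1577836800
#     int(datetime(2020, 3, 20, 12, tzinfo=timezone.utc).timestamp())  # -> 1584705600
# )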
# folder name
folder_name = "./data/{}-{}".format(from_ts, to_ts)
# failed users (csv file)
failed_file = "./data/{}-{}-failed_users.csv".format(from_ts, to_ts)
# error file: logs all errors that happen while saving the crawled data
error_file = "./data/{}-{}-error_file.csv".format(from_ts, to_ts)
if not os.path.isdir(folder_name):
    # Create the folder if it does not exist.
    os.mkdir(folder_name)
else:
    # Do not crawl users that have already been crawled.
    crawled = [x.replace(".json", "") for x in os.listdir(folder_name)]
    lfm1b = lfm1b[~lfm1b.username.isin(crawled)]

# If True, tries to re-fetch users for which previous attempts had failed.
# If False, previously failed usernames are read from the log file and skipped,
# and new failures are appended to it.
work_on_failures = False
if not work_on_failures:
    # Do not crawl users that failed previously.
    if os.path.isfile(failed_file):
        with open(failed_file, "r") as inf:
            reader = csv.reader(inf)
            failed_users = [x[0] for x in reader]
        lfm1b = lfm1b[~lfm1b.username.isin(failed_users)]
print("Number of users to crawl is: {}".format(len(lfm1b)))
crawler = LastFMCrawler(api_key=API_KEY_1, api_secret=API_SECRET_1, from_ts=from_ts, to_ts=to_ts)
print(failed_file)
if work_on_failures:
    crawler.crawl(lfm1b["username"], folder_name, error_file)
else:
    crawler.crawl(lfm1b["username"], folder_name, error_file, failed_file=failed_file)
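A minimal sketch (not part of the commit) of how the per-user JSON files written by the crawler could be read back. It assumes the structure produced above: each file holds a list of slimmed 'recenttracks' pages, and each page's "track" field is a list, as the crawler's _slimmer() method itself assumes.

import json
import os

json_folder = "./data/1577836800-1584705600"  # folder written by the crawl above
for fname in os.listdir(json_folder):
    with open(os.path.join(json_folder, fname)) as f:
        pages = json.load(f)  # list of slimmed "recenttracks" pages
    n_events = sum(len(page["track"]) for page in pages)
    print("{}: {} listening events".format(fname.replace(".json", ""), n_events))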