import csv
import json
import os
import time
import urllib.parse

import requests
from tqdm import tqdm


class LastFMCrawler:

    def __init__(self, api_key, api_secret, from_ts=None, to_ts=None):

        self.base_url = "https://ws.audioscrobbler.com/2.0/?"

        self.api_key = api_key
        self.api_secret = api_secret

        self.from_ts = from_ts
        self.to_ts = to_ts

        self.last_call = 0

    def _connect_url(self, url):
        """GET *url* and return its parsed JSON payload.

        Enforces a minimum spacing of ~0.3 s between consecutive API calls
        and retries up to 3 extra times (0.8 s apart) before giving up.

        Args:
            url: fully-encoded Last.fm API URL.

        Returns:
            The decoded JSON dict; guaranteed to contain 'recenttracks'.

        Raises:
            Exception: the last error seen once the retry budget is spent.
        """
        n_trials = 1

        # Rate limiting: guarantee at least 0.3 s between consecutive calls.
        # BUG FIX: the original slept for the time *already elapsed*
        # (time_passed) instead of the time *remaining* until the 0.3 s
        # spacing is reached, so requests were not actually throttled.
        time_passed = time.time() - self.last_call
        if time_passed <= 0.3:
            time.sleep(0.3 - time_passed)

        content = None

        while True:
            try:
                rs = requests.get(url)
                self.last_call = time.time()

                # requests.Response is falsy for 4xx/5xx status codes.
                if not rs:
                    raise Exception(rs.status_code)

                content = rs.json()

                # Crash test: a valid answer must carry 'recenttracks';
                # anything else (error payload) triggers a retry.
                _ = content['recenttracks']

                break

            except Exception as e:
                if n_trials > 3:
                    print("Number of trials exceeded!")
                    raise e
                print("Failed attempt. 0.8 Second Sleep")
                time.sleep(0.8)
                n_trials += 1

        return content

    def _slimmer(self, json):
        # Get rids of the useless images

        tracks = json["track"]

        for track in tracks:
            track.pop("image", None)
            if "artist" in track:
                track["artist"].pop("image", None)
        return json

68
    def crawl(self, users, json_folder, error_file, log_file, failed_file=None):
        """Download the recent-track history of each user and persist it.

        For every user, pages through ``user.getrecenttracks`` (200 events
        per page), slims the payloads, and writes them to
        ``<json_folder>/<username>.json``. Progress and failures are
        appended to CSV log files.

        Args:
            users: iterable of dicts with 'username' and 'max_ts' keys.
            json_folder: directory where per-user JSON files are written.
            error_file: CSV appended with users whose data failed to save.
            log_file: CSV appended with (username, #events) on success.
            failed_file: optional CSV appended with users whose crawl failed.
        """
        self.last_call = 0

        # Query parameters shared by every request.
        basic_url_vars = {
            "method": "user.getrecenttracks",
            "limit": "200",
            "format": "json",
            "extended": "1",
            "api_key": self.api_key,
        }

        if self.from_ts:
            basic_url_vars["from"] = self.from_ts
        if self.to_ts:
            basic_url_vars["to"] = self.to_ts

        for user in tqdm(users):
            username = user["username"]
            max_ts = user["max_ts"]

            url_vars = basic_url_vars.copy()
            url_vars["user"] = username

            # Without an explicit crawl window, resume right after the
            # user's last known event (+1 s avoids re-fetching it).
            if "from" not in url_vars:
                url_vars["from"] = int(max_ts + 1)

            usr_url = self.base_url + urllib.parse.urlencode(url_vars)

            usr_data = []

            try:
                rs = self._connect_url(usr_url)

                info = rs['recenttracks']['@attr']
                tot_pages = int(info['totalPages'])
                tot_les = int(info['total'])

                if tot_les > 0:
                    usr_data.append(self._slimmer(rs['recenttracks']))

                    # Fetch the remaining pages, if any.
                    for page in tqdm(range(2, tot_pages + 1)):
                        page_vars = url_vars.copy()
                        page_vars["page"] = str(page)
                        page_url = self.base_url + urllib.parse.urlencode(page_vars)

                        rs = self._connect_url(page_url)

                        usr_data.append(self._slimmer(rs['recenttracks']))

            except Exception as e:
                print('Failure while working on user {}'.format(username))
                print(str(e))

                # Record the failed user if a failure log was requested.
                if failed_file:
                    # BUG FIX: csv.writer targets must be opened with
                    # newline="" (per the csv module docs) or blank rows
                    # appear on Windows. Applied to all three CSV logs.
                    with open(failed_file, "a+", newline="") as out:
                        writer = csv.writer(out)
                        writer.writerow([username, str(e)])

                continue

            try:
                if tot_les > 0:
                    # Save only if there are new events.
                    with open(os.path.join(json_folder, username + ".json"), "w") as outfile:
                        json.dump(usr_data, outfile)
                with open(log_file, 'a+', newline="") as out:
                    writer = csv.writer(out)
                    writer.writerow([username, tot_les])

            except Exception as e:
                print("User {} not saved!".format(username))
                with open(error_file, "a+", newline="") as out:
                    writer = csv.writer(out)
                    writer.writerow([username, len(usr_data), str(e)])