Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Alessandro Melchiorre
last_fm_crawler
Commits
045783ff
Commit
045783ff
authored
Mar 24, 2020
by
Alessandro Melchiorre
Browse files
- max-ts file showing the maximum timestamp for each user
- code modified accordingly
parent
f033a127
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
data/LFM-1b_UID_MAX_TS.txt
0 → 100644
View file @
045783ff
This diff is collapsed.
Click to expand it.
src/LastFMCrawler.py
View file @
045783ff
...
...
@@ -80,10 +80,15 @@ class LastFMCrawler:
if
self
.
to_ts
:
basic_url_vars
[
"to"
]
=
self
.
to_ts
for
username
in
tqdm
(
users
):
for
_
,
user
in
tqdm
(
users
.
iterrows
()):
username
=
user
[
"username"
]
max_ts
=
user
[
"max_ts"
]
url_vars
=
basic_url_vars
.
copy
()
url_vars
[
"user"
]
=
username
if
"from"
not
in
url_vars
:
url_vars
[
"from"
]
=
int
(
max_ts
+
1
)
# Adding one second
usr_url
=
self
.
base_url
+
urllib
.
parse
.
urlencode
(
url_vars
)
usr_data
=
[]
...
...
src/main.py
View file @
045783ff
...
...
@@ -2,27 +2,27 @@ import csv
import
os
import
pandas
as
pd
from
LastFMCrawler
import
LastFMCrawler
from
conf
import
API_KEY_1
,
API_SECRET_1
lfm1b
=
pd
.
read_csv
(
"./data/LFM-1b_users.txt"
,
delimiter
=
"
\t
"
,
header
=
None
,
usecols
=
[
1
,
3
],
names
=
[
"username"
,
"country"
])
from
src.LastFMCrawler
import
LastFMCrawler
from
src.conf
import
API_KEY_1
,
API_SECRET_1
# Only users with country information
lfm1b
=
lfm1b
[
~
lfm1b
[
"country"
].
isna
()]
lfm1b_users
=
pd
.
read_csv
(
"./data/LFM-1b_users.txt"
,
delimiter
=
"
\t
"
,
header
=
None
,
usecols
=
[
0
,
1
,
3
],
names
=
[
"uid"
,
"username"
,
"country"
])
lfm1b_ts
=
pd
.
read_csv
(
"./data/LFM-1b_UID_MAX_TS.txt"
,
delimiter
=
"
\t
"
,
header
=
None
,
names
=
[
"uid"
,
"max_ts"
])
# columns are: uid, username, country, max_ts
lfm1b
=
pd
.
merge
(
lfm1b_users
,
lfm1b_ts
)
print
(
lfm1b
.
head
())
# timestamp for 01/01/2020 00:00:00 GMT (Greenwich Mean Time)
from_ts
=
1577836800
# timestamp for 20/03/2020 12:00:00 GMT
to_ts
=
1584705600
# folder name
folder_name
=
"./data/
{}
-{}"
.
format
(
from_ts
,
to_ts
)
folder_name
=
"./data/
max_ts
-{}"
.
format
(
to_ts
)
# failed users (csv file)
failed_file
=
"./data/
{}
-{}-failed_users.csv"
.
format
(
from_ts
,
to_ts
)
failed_file
=
"./data/
max_ts
-{}-failed_users.csv"
.
format
(
to_ts
)
# error_file, logs all errors happened during the saving
error_file
=
"./data/
{}
-{}-error_file.csv"
.
format
(
from_ts
,
to_ts
)
error_file
=
"./data/
max_ts
-{}-error_file.csv"
.
format
(
to_ts
)
if
not
os
.
path
.
isdir
(
folder_name
):
# Create Folder if not exists
...
...
@@ -47,10 +47,9 @@ if not work_on_failures:
print
(
"Number of users to crawl is: {}"
.
format
(
len
(
lfm1b
)))
crawler
=
LastFMCrawler
(
api_key
=
API_KEY_1
,
api_secret
=
API_SECRET_1
,
from_ts
=
from_ts
,
to_ts
=
to_ts
)
crawler
=
LastFMCrawler
(
api_key
=
API_KEY_1
,
api_secret
=
API_SECRET_1
,
to_ts
=
to_ts
)
print
(
failed_file
)
if
work_on_failures
:
crawler
.
crawl
(
lfm1b
[
"username"
],
folder_name
,
error_file
)
crawler
.
crawl
(
lfm1b
[
[
"username"
,
"max_ts"
]
],
folder_name
,
error_file
)
else
:
crawler
.
crawl
(
lfm1b
[
"username"
],
folder_name
,
error_file
,
failed_file
=
failed_file
)
crawler
.
crawl
(
lfm1b
[
[
"username"
,
"max_ts"
]
],
folder_name
,
error_file
,
failed_file
=
failed_file
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment