Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Alessandro Melchiorre
last_fm_crawler
Commits
72c1a4cf
Commit
72c1a4cf
authored
Jun 03, 2020
by
Alessandro Melchiorre
Browse files
# Sorted data
# No multiple machines
parent
471c635c
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
data/LFM-1b_users_playcounts_filtered.txt
0 → 100644
View file @
72c1a4cf
This diff is collapsed.
Click to expand it.
src/main.py
View file @
72c1a4cf
...
...
@@ -4,14 +4,18 @@ import os
import
pandas
as
pd
from
LastFMCrawler
import
LastFMCrawler
from
conf
import
API_KEYS
,
API_SECRETS
from
crawling_settings
import
api_key_idx
,
this_crawler
from
crawling_settings
import
api_key_idx
lfm1b_users
=
pd
.
read_csv
(
"./data/LFM-1b_users.txt"
,
delimiter
=
"
\t
"
,
header
=
None
,
usecols
=
[
0
,
1
,
3
],
names
=
[
"uid"
,
"username"
,
"country"
])
lfm1b_ts
=
pd
.
read_csv
(
"./data/LFM-1b_UID_MAX_TS.txt"
,
delimiter
=
"
\t
"
,
header
=
None
,
names
=
[
"uid"
,
"max_ts"
])
# columns are: uid, username, country, max_ts
lfm1b
=
pd
.
merge
(
lfm1b_users
,
lfm1b_ts
)
lfm1b_us_pc
=
pd
.
read_csv
(
'./data/LFM-1b_users_playcounts_filtered.txt'
,
delimiter
=
'
\t
'
,
skiprows
=
[
0
],
names
=
[
'uid'
,
'username'
,
'pc'
])
# columns are: uid, username, country, max_ts, pc
lfm1b
=
pd
.
merge
(
lfm1b_users
,
lfm1b_ts
).
merge
(
lfm1b_us_pc
)
lfm1b
=
lfm1b
.
sort_values
(
by
=
'pc'
)
print
(
lfm1b
.
head
())
print
(
"Number of users in LFM-1b is: {}"
.
format
(
len
(
lfm1b
)))
...
...
@@ -50,9 +54,9 @@ if not work_on_failures:
# number_of_crawlers and this crawler are used to find only subset of users for the current execution (modulo)
# this_crawler is included in crawling_settings.py
number_of_crawlers
=
3
#
number_of_crawlers = 3
lfm1b
=
lfm1b
[
lfm1b
.
uid
%
number_of_crawlers
==
this_crawler
]
#
lfm1b = lfm1b[lfm1b.uid % number_of_crawlers == this_crawler]
print
(
"Number of users to crawl is: {}"
.
format
(
len
(
lfm1b
)))
# api_key_idx is included in crawling_settings
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment