Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Switch to GitLab Next
Sign in / Register
Toggle navigation
Open sidebar
Recommend.Games
Board Game Scraper
Commits
6661ddb4
Verified
Commit
6661ddb4
authored
Nov 14, 2020
by
Markus Shepherd
🙈
Browse files
Merge branch '79-ranking-by-type' of gitlab.com:recommend.games/board-game-scraper into update
parents
eab9a8fe
c682e3e9
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
226 additions
and
17 deletions
+226
-17
board_game_scraper/__main__.py
board_game_scraper/__main__.py
+6
-3
board_game_scraper/full_merge.py
board_game_scraper/full_merge.py
+2
-0
board_game_scraper/items.py
board_game_scraper/items.py
+7
-0
board_game_scraper/settings.py
board_game_scraper/settings.py
+1
-0
board_game_scraper/spiders/bgg.py
board_game_scraper/spiders/bgg.py
+23
-3
board_game_scraper/spiders/bgg_rankings.py
board_game_scraper/spiders/bgg_rankings.py
+27
-11
docker-compose.yaml
docker-compose.yaml
+160
-0
No files found.
board_game_scraper/__main__.py
View file @
6661ddb4
...
...
@@ -85,7 +85,7 @@ def _parse_args():
help
=
"log level (repeat for more verbosity)"
,
)
return
parser
.
parse_args
()
return
parser
.
parse_
known_
args
()
def
main
():
...
...
@@ -94,8 +94,9 @@ def main():
settings
=
get_project_settings
()
configure_logging
(
settings
)
args
=
_parse_args
()
args
,
remainder
=
_parse_args
()
LOGGER
.
info
(
args
)
LOGGER
.
info
(
remainder
)
base_dir
=
Path
(
settings
[
"BASE_DIR"
]).
resolve
()
cache_dir
=
base_dir
/
".scrapy"
/
"httpcache"
...
...
@@ -179,7 +180,9 @@ def main():
f
"JOBDIR=
{
curr_job
}
"
,
"--set"
,
f
"DONT_RUN_BEFORE_FILE=
{
dont_run_before_file
}
"
,
]
]
+
remainder
LOGGER
.
info
(
"Executing command %r"
,
command
)
try
:
execute
(
argv
=
command
)
...
...
board_game_scraper/full_merge.py
View file @
6661ddb4
...
...
@@ -106,6 +106,7 @@ def merge_configs(spider, full=False):
else
(
"published_at"
,
"rank"
,
"add_rank"
,
"bgg_id"
,
"name"
,
"year"
,
...
...
@@ -131,6 +132,7 @@ def merge_configs(spider, full=False):
"published_at"
,
"bgg_id"
,
"rank"
,
"add_rank"
,
"name"
,
"year"
,
"num_votes"
,
...
...
board_game_scraper/items.py
View file @
6661ddb4
...
...
@@ -342,6 +342,13 @@ class GameItem(TypedItem):
input_processor
=
POS_INT_PROCESSOR
,
default
=
None
,
)
add_rank
=
Field
(
dtype
=
list
,
input_processor
=
IDENTITY
,
output_processor
=
IDENTITY
,
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
,
)
num_votes
=
Field
(
dtype
=
int
,
dtype_convert
=
parse_int
,
...
...
board_game_scraper/settings.py
View file @
6661ddb4
...
...
@@ -67,6 +67,7 @@ FEED_EXPORT_FIELDS = (
"implementation"
,
"integration"
,
"rank"
,
"add_rank"
,
"num_votes"
,
"avg_rating"
,
"stddev_rating"
,
...
...
board_game_scraper/spiders/bgg.py
View file @
6661ddb4
...
...
@@ -10,7 +10,7 @@ from functools import partial
from
itertools
import
repeat
from
urllib.parse
import
urlencode
from
pytility
import
batchify
,
clear_list
,
normalize_space
,
parse_int
from
pytility
import
batchify
,
clear_list
,
normalize_space
,
parse_float
,
parse_int
from
scrapy
import
signals
from
scrapy
import
Request
,
Spider
from
scrapy.utils.misc
import
arg_to_iter
...
...
@@ -87,10 +87,18 @@ def _value_id(items, sep=":"):
yield
f
"
{
value
}{
sep
}{
id_
}
"
if
id_
else
value
def
_remove_rank
(
value
):
return
(
value
[:
-
5
]
if
value
and
isinstance
(
value
,
str
)
and
value
.
lower
().
endswith
(
" rank"
)
else
value
)
def
_value_id_rank
(
items
,
sep
=
":"
):
for
item
in
arg_to_iter
(
items
):
value
=
item
.
xpath
(
"@friendlyname"
).
extract_first
()
or
""
value
=
value
[:
-
5
]
if
value
and
value
.
lower
().
endswith
(
" rank"
)
else
value
value
=
_remove_rank
(
value
)
id_
=
item
.
xpath
(
"@id"
).
extract_first
()
or
""
yield
f
"
{
value
}{
sep
}{
id_
}
"
if
id_
else
value
...
...
@@ -372,7 +380,7 @@ class BggSpider(Spider):
min_players_best max_players_best
\
min_age min_age_rec min_time max_time
\
game_type category mechanic cooperative compilation family expansion
\
rank num_votes avg_rating stddev_rating
\
rank
add_rank
num_votes avg_rating stddev_rating
\
bayes_rating worst_rating best_rating
\
complexity easiest_complexity hardest_complexity
\
language_dependency lowest_language_dependency highest_language_dependency
\
...
...
@@ -570,6 +578,18 @@ class BggSpider(Spider):
),
)
for
rank
in
game
.
xpath
(
'statistics/ratings/ranks/rank[@type = "family"]'
):
add_rank
=
{
"game_type"
:
rank
.
xpath
(
"@name"
).
extract_first
(),
"game_type_id"
:
parse_int
(
rank
.
xpath
(
"@id"
).
extract_first
()),
"name"
:
_remove_rank
(
rank
.
xpath
(
"@friendlyname"
).
extract_first
()),
"rank"
:
parse_int
(
rank
.
xpath
(
"@value"
).
extract_first
()),
"bayes_rating"
:
parse_float
(
rank
.
xpath
(
"@bayesaverage"
).
extract_first
()
),
}
ldr
.
add_value
(
"add_rank"
,
add_rank
)
yield
ldr
.
load_item
()
def
parse_collection
(
self
,
response
):
...
...
board_game_scraper/spiders/bgg_rankings.py
View file @
6661ddb4
...
...
@@ -6,10 +6,12 @@ import os
import
re
from
datetime
import
datetime
,
timezone
from
itertools
import
product
from
random
import
randint
from
pytility
import
normalize_space
,
parse_date
,
parse_int
from
scrapy
import
Request
,
Spider
from
scrapy.utils.misc
import
arg_to_iter
from
..items
import
GameItem
from
..loaders
import
GameLoader
...
...
@@ -65,6 +67,22 @@ def _extract_bgg_id(url):
return
extract_bgg_id
(
url
)
def
_start_urls
(
paths
,
bgg_domains
=
(
"http://boardgamegeek.com/"
,
"https://boardgamegeek.com/"
,
"http://www.boardgamegeek.com/"
,
"https://www.boardgamegeek.com/"
,
),
prefix_urls
=
(
"https://web.archive.org/web/{date}/"
,
""
),
):
for
prefix_url
,
bgg_domain
,
path
in
product
(
arg_to_iter
(
prefix_urls
),
arg_to_iter
(
bgg_domains
),
arg_to_iter
(
paths
)
):
yield
prefix_url
+
bgg_domain
+
path
def
_parse_date
(
date
,
tzinfo
=
timezone
.
utc
,
format_str
=
WEB_ARCHIVE_DATE_FORMAT
):
try
:
date
=
datetime
.
strptime
(
date
,
format_str
)
...
...
@@ -110,16 +128,7 @@ class BggRankingsSpider(Spider):
"top50.php3"
,
"topn.php3?count=50"
,
)
bgg_urls
=
(
tuple
(
f
"http://boardgamegeek.com/
{
path
}
"
for
path
in
bgg_paths
)
+
tuple
(
f
"https://boardgamegeek.com/
{
path
}
"
for
path
in
bgg_paths
)
+
tuple
(
f
"http://www.boardgamegeek.com/
{
path
}
"
for
path
in
bgg_paths
)
+
tuple
(
f
"https://www.boardgamegeek.com/
{
path
}
"
for
path
in
bgg_paths
)
)
start_urls
=
(
tuple
(
f
"https://web.archive.org/web/{{date}}/
{
url
}
"
for
url
in
bgg_urls
)
+
bgg_urls
)
start_urls
=
tuple
(
_start_urls
(
bgg_paths
))
item_classes
=
(
GameItem
,)
custom_settings
=
{
...
...
@@ -157,7 +166,14 @@ class BggRankingsSpider(Spider):
start_date_str
=
start_date
.
strftime
(
WEB_ARCHIVE_DATE_FORMAT
)
for
start_url
in
self
.
start_urls
:
start_urls
=
(
tuple
(
_start_urls
(
self
.
bgg_path
))
if
hasattr
(
self
,
"bgg_path"
)
and
self
.
bgg_path
else
self
.
start_urls
)
self
.
logger
.
info
(
"Start URLs: %s"
,
start_urls
)
for
start_url
in
start_urls
:
yield
Request
(
url
=
start_url
.
format
(
date
=
start_date_str
),
callback
=
self
.
parse
,
...
...
docker-compose.yaml
View file @
6661ddb4
...
...
@@ -67,6 +67,166 @@ services:
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-abstract
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-abstract
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_abstract'
,
'
-a'
,
'
bgg_path=abstracts/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-children
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-children
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_children'
,
'
-a'
,
'
bgg_path=childrensgames/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-customizable
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-customizable
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_customizable'
,
'
-a'
,
'
bgg_path=cgs/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-family
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-family
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_family'
,
'
-a'
,
'
bgg_path=familygames/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-party
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-party
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_party'
,
'
-a'
,
'
bgg_path=partygames/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-strategy
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-strategy
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_strategy'
,
'
-a'
,
'
bgg_path=strategygames/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-thematic
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-thematic
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_thematic'
,
'
-a'
,
'
bgg_path=thematic/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-rankings-war
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-rankings-war
build
:
'
.'
command
:
[
'
python'
,
'
-m'
,
'
board_game_scraper'
,
'
bgg_rankings'
,
'
--feeds-subdir'
,
'
bgg_rankings_war'
,
'
-a'
,
'
bgg_path=wargames/browse/boardgame'
,
]
env_file
:
.env
environment
:
CLOSESPIDER_TIMEOUT
:
36000
# 10 hours
DONT_RUN_BEFORE_SEC
:
21600
# 6 hours
volumes
:
-
./feeds:/app/feeds
-
./images:/app/images
restart
:
always
stop_grace_period
:
15m
stop_signal
:
SIGINT
bgg-geeklist
:
image
:
registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name
:
bg-scraper-bgg-geeklist
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment