Verified Commit 6661ddb4 authored by Markus Shepherd's avatar Markus Shepherd 🙈
Browse files

Merge branch '79-ranking-by-type' of gitlab.com:recommend.games/board-game-scraper into update

parents eab9a8fe c682e3e9
......@@ -85,7 +85,7 @@ def _parse_args():
help="log level (repeat for more verbosity)",
)
return parser.parse_args()
return parser.parse_known_args()
def main():
......@@ -94,8 +94,9 @@ def main():
settings = get_project_settings()
configure_logging(settings)
args = _parse_args()
args, remainder = _parse_args()
LOGGER.info(args)
LOGGER.info(remainder)
base_dir = Path(settings["BASE_DIR"]).resolve()
cache_dir = base_dir / ".scrapy" / "httpcache"
......@@ -179,7 +180,9 @@ def main():
f"JOBDIR={curr_job}",
"--set",
f"DONT_RUN_BEFORE_FILE={dont_run_before_file}",
]
] + remainder
LOGGER.info("Executing command %r", command)
try:
execute(argv=command)
......
......@@ -106,6 +106,7 @@ def merge_configs(spider, full=False):
else (
"published_at",
"rank",
"add_rank",
"bgg_id",
"name",
"year",
......@@ -131,6 +132,7 @@ def merge_configs(spider, full=False):
"published_at",
"bgg_id",
"rank",
"add_rank",
"name",
"year",
"num_votes",
......
......@@ -342,6 +342,13 @@ class GameItem(TypedItem):
input_processor=POS_INT_PROCESSOR,
default=None,
)
add_rank = Field(
dtype=list,
input_processor=IDENTITY,
output_processor=IDENTITY,
serializer=JSON_SERIALIZER,
parser=parse_json,
)
num_votes = Field(
dtype=int,
dtype_convert=parse_int,
......
......@@ -67,6 +67,7 @@ FEED_EXPORT_FIELDS = (
"implementation",
"integration",
"rank",
"add_rank",
"num_votes",
"avg_rating",
"stddev_rating",
......
......@@ -10,7 +10,7 @@ from functools import partial
from itertools import repeat
from urllib.parse import urlencode
from pytility import batchify, clear_list, normalize_space, parse_int
from pytility import batchify, clear_list, normalize_space, parse_float, parse_int
from scrapy import signals
from scrapy import Request, Spider
from scrapy.utils.misc import arg_to_iter
......@@ -87,10 +87,18 @@ def _value_id(items, sep=":"):
yield f"{value}{sep}{id_}" if id_ else value
def _remove_rank(value):
return (
value[:-5]
if value and isinstance(value, str) and value.lower().endswith(" rank")
else value
)
def _value_id_rank(items, sep=":"):
    """Yield ``"<name><sep><id>"`` strings for BGG rank selectors.

    *items* are selectors exposing ``@friendlyname`` and ``@id``
    attributes; the trailing " Rank" suffix is stripped from the name.
    Items without an ID yield the bare (cleaned) name.
    """
    for item in arg_to_iter(items):
        # e.g. "Strategy Game Rank" -> "Strategy Game"; the inline
        # suffix-stripping was replaced by the _remove_rank helper —
        # the old inline line is removed here so stripping happens once
        value = _remove_rank(item.xpath("@friendlyname").extract_first() or "")
        id_ = item.xpath("@id").extract_first() or ""
        yield f"{value}{sep}{id_}" if id_ else value
......@@ -372,7 +380,7 @@ class BggSpider(Spider):
min_players_best max_players_best \
min_age min_age_rec min_time max_time \
game_type category mechanic cooperative compilation family expansion \
rank num_votes avg_rating stddev_rating \
rank add_rank num_votes avg_rating stddev_rating \
bayes_rating worst_rating best_rating \
complexity easiest_complexity hardest_complexity \
language_dependency lowest_language_dependency highest_language_dependency \
......@@ -570,6 +578,18 @@ class BggSpider(Spider):
),
)
for rank in game.xpath('statistics/ratings/ranks/rank[@type = "family"]'):
add_rank = {
"game_type": rank.xpath("@name").extract_first(),
"game_type_id": parse_int(rank.xpath("@id").extract_first()),
"name": _remove_rank(rank.xpath("@friendlyname").extract_first()),
"rank": parse_int(rank.xpath("@value").extract_first()),
"bayes_rating": parse_float(
rank.xpath("@bayesaverage").extract_first()
),
}
ldr.add_value("add_rank", add_rank)
yield ldr.load_item()
def parse_collection(self, response):
......
......@@ -6,10 +6,12 @@ import os
import re
from datetime import datetime, timezone
from itertools import product
from random import randint
from pytility import normalize_space, parse_date, parse_int
from scrapy import Request, Spider
from scrapy.utils.misc import arg_to_iter
from ..items import GameItem
from ..loaders import GameLoader
......@@ -65,6 +67,22 @@ def _extract_bgg_id(url):
return extract_bgg_id(url)
def _start_urls(
    paths,
    bgg_domains=(
        "http://boardgamegeek.com/",
        "https://boardgamegeek.com/",
        "http://www.boardgamegeek.com/",
        "https://www.boardgamegeek.com/",
    ),
    prefix_urls=("https://web.archive.org/web/{date}/", ""),
):
    """Yield every prefix+domain+path combination as a full start URL.

    The ``{date}`` placeholder in the Web Archive prefix is filled in
    later by the caller; the empty prefix yields the live-site URLs.
    """
    # materialize the inner iterables so generators survive re-iteration
    domains = tuple(arg_to_iter(bgg_domains))
    all_paths = tuple(arg_to_iter(paths))
    for prefix in arg_to_iter(prefix_urls):
        for domain in domains:
            for path in all_paths:
                yield f"{prefix}{domain}{path}"
def _parse_date(date, tzinfo=timezone.utc, format_str=WEB_ARCHIVE_DATE_FORMAT):
try:
date = datetime.strptime(date, format_str)
......@@ -110,16 +128,7 @@ class BggRankingsSpider(Spider):
"top50.php3",
"topn.php3?count=50",
)
bgg_urls = (
tuple(f"http://boardgamegeek.com/{path}" for path in bgg_paths)
+ tuple(f"https://boardgamegeek.com/{path}" for path in bgg_paths)
+ tuple(f"http://www.boardgamegeek.com/{path}" for path in bgg_paths)
+ tuple(f"https://www.boardgamegeek.com/{path}" for path in bgg_paths)
)
start_urls = (
tuple(f"https://web.archive.org/web/{{date}}/{url}" for url in bgg_urls)
+ bgg_urls
)
start_urls = tuple(_start_urls(bgg_paths))
item_classes = (GameItem,)
custom_settings = {
......@@ -157,7 +166,14 @@ class BggRankingsSpider(Spider):
start_date_str = start_date.strftime(WEB_ARCHIVE_DATE_FORMAT)
for start_url in self.start_urls:
start_urls = (
tuple(_start_urls(self.bgg_path))
if hasattr(self, "bgg_path") and self.bgg_path
else self.start_urls
)
self.logger.info("Start URLs: %s", start_urls)
for start_url in start_urls:
yield Request(
url=start_url.format(date=start_date_str),
callback=self.parse,
......
......@@ -67,6 +67,166 @@ services:
stop_grace_period: 15m
stop_signal: SIGINT
# NOTE(review): indentation appears stripped by the diff scrape — in the real
# compose file these services are nested under the `services:` key; confirm
# against the repository before reuse. All eight services run the same
# bgg_rankings spider, differing only in feeds subdir and bgg_path category.
# BGG "Abstract Games" ranking scraper.
bgg-rankings-abstract:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-abstract
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_abstract',
'-a', 'bgg_path=abstracts/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "Children's Games" ranking scraper.
bgg-rankings-children:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-children
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_children',
'-a', 'bgg_path=childrensgames/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "Customizable Games" (CGS) ranking scraper.
bgg-rankings-customizable:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-customizable
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_customizable',
'-a', 'bgg_path=cgs/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "Family Games" ranking scraper.
bgg-rankings-family:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-family
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_family',
'-a', 'bgg_path=familygames/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "Party Games" ranking scraper.
bgg-rankings-party:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-party
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_party',
'-a', 'bgg_path=partygames/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "Strategy Games" ranking scraper.
bgg-rankings-strategy:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-strategy
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_strategy',
'-a', 'bgg_path=strategygames/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "Thematic" ranking scraper.
bgg-rankings-thematic:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-thematic
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_thematic',
'-a', 'bgg_path=thematic/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
# BGG "War Games" ranking scraper.
bgg-rankings-war:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-rankings-war
build: '.'
command: [
'python', '-m', 'board_game_scraper', 'bgg_rankings',
'--feeds-subdir', 'bgg_rankings_war',
'-a', 'bgg_path=wargames/browse/boardgame',
]
env_file: .env
environment:
CLOSESPIDER_TIMEOUT: 36000 # 10 hours
DONT_RUN_BEFORE_SEC: 21600 # 6 hours
volumes:
- ./feeds:/app/feeds
- ./images:/app/images
restart: always
stop_grace_period: 15m
stop_signal: SIGINT
bgg-geeklist:
image: registry.gitlab.com/recommend.games/board-game-scraper:${LIBRARY_VERSION}
container_name: bg-scraper-bgg-geeklist
......
Markdown is supported
Attach a file by drag &amp; drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment