Commit 60fadded authored by Markus Shepherd

Merge branch '78-limit' into 'master'

Resolve "Limit the number of images to be scraped for a single item"

Closes #78

See merge request !54
parents 4adb022d 7a9941be
@@ -16,3 +16,11 @@ PULL_QUEUE_INTERVAL=300
# AWS credentials if you need access to S3
AWS_ACCESS_KEY_ID=<access-key>
AWS_SECRET_ACCESS_KEY=<secret-access-key>
+# limit downloaded images per game and spider
+# use 0 to disable download; -1 for all images
+LIMIT_IMAGES_TO_DOWNLOAD_BGA=0
+LIMIT_IMAGES_TO_DOWNLOAD_BGG=0
+LIMIT_IMAGES_TO_DOWNLOAD_DBPEDIA=0
+LIMIT_IMAGES_TO_DOWNLOAD_LUDING=0
+LIMIT_IMAGES_TO_DOWNLOAD_SPIELEN=0
+LIMIT_IMAGES_TO_DOWNLOAD_WIKIDATA=0
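Each of these variables is read by the matching spider further down in this merge request and falls back to 0 (no image downloads) when unset. A minimal sketch of that wiring, assuming pytility's parse_int returns None for missing or non-numeric input (the BGA variable is just the example here):

import os

from pytility import parse_int

# parse_int(None) is None, so an unset variable falls back to 0 via `or`;
# an explicit "-1" passes through and keeps every image URL
limit = parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGA")) or 0
custom_settings = {"LIMIT_IMAGES_TO_DOWNLOAD": limit}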
@@ -170,6 +170,7 @@ class GameItem(TypedItem):
serializer=JSON_SERIALIZER,
parser=parse_json,
)
+    image_url_download = Field(serializer=JSON_SERIALIZER, parser=parse_json)
image_file = Field(serializer=JSON_SERIALIZER, parser=parse_json)
video_url = Field(
dtype=list,
@@ -508,6 +509,7 @@ class UserItem(TypedItem):
serializer=JSON_SERIALIZER,
parser=parse_json,
)
+    image_url_download = Field(serializer=JSON_SERIALIZER, parser=parse_json)
image_file = Field(serializer=JSON_SERIALIZER, parser=parse_json)
published_at = Field(
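The two hunks above add the same field to GameItem and UserItem: image_url keeps the complete list of scraped URLs for the dataset, while image_url_download only ever holds the capped subset that the images pipeline should actually fetch. A stripped-down, hypothetical stand-in to illustrate the split (the real items carry many more fields and custom serializers):

from scrapy import Field, Item

class GameItemSketch(Item):  # hypothetical, heavily reduced stand-in
    image_url = Field()           # every scraped image URL
    image_url_download = Field()  # capped subset handed to ImagesPipeline
    image_file = Field()          # download results written by ImagesPipeline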
@@ -6,7 +6,9 @@ import logging
import math
import re
+from itertools import islice
from urllib.parse import quote, unquote_plus
+from typing import Optional
import jmespath
@@ -179,3 +181,58 @@ class ResolveImagePipeline:
if item.get(field):
item[field] = clear_list(map(self._parse_url, arg_to_iter(item[field])))
return item
+
+
+class LimitImagesPipeline:
+    """Copy a limited number of image URLs to be downloaded from source to target."""
+
+    source_field: str
+    target_field: str
+    limit: Optional[int] = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Init from crawler."""
+
+        source_field = crawler.settings.get("LIMIT_IMAGES_URLS_FIELD")
+        target_field = crawler.settings.get("IMAGES_URLS_FIELD")
+
+        if not source_field or not target_field:
+            raise NotConfigured
+
+        limit = crawler.settings.getint("LIMIT_IMAGES_TO_DOWNLOAD", -1)
+
+        return cls(
+            source_field=source_field,
+            target_field=target_field,
+            limit=limit,
+        )
+
+    def __init__(
+        self, source_field: str, target_field: str, limit: Optional[int] = None
+    ):
+        self.source_field = source_field
+        self.target_field = target_field
+        self.limit = limit
+
+    # pylint: disable=unused-argument
+    def process_item(self, item, spider):
+        """Copy a limited number of image URLs to be downloaded from source to target."""
+
+        # adding the target field to an item that doesn't define it would
+        # result in an error; return item as-is
+        if hasattr(item, "fields") and self.target_field not in item.fields:
+            return item
+
+        if self.limit is None or self.limit < 0:  # copy through everything
+            item[self.target_field] = list(arg_to_iter(item.get(self.source_field)))
+            return item
+
+        if not self.limit:  # limit is zero
+            item[self.target_field] = []
+            return item
+
+        # actual limit
+        item[self.target_field] = list(
+            islice(arg_to_iter(item.get(self.source_field)), self.limit)
+        )
+        return item
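The new pipeline has three paths: a negative or missing limit copies everything, zero yields an empty list (downloads disabled), and a positive limit keeps only the first N URLs via islice. A quick, illustrative run through all three branches, importing the class from the path registered in ITEM_PIPELINES below:

from board_game_scraper.pipelines import LimitImagesPipeline

item = {"image_url": ["a.jpg", "b.jpg", "c.jpg"]}

copy_all = LimitImagesPipeline("image_url", "image_url_download", limit=-1)
print(copy_all.process_item(dict(item), spider=None)["image_url_download"])
# ['a.jpg', 'b.jpg', 'c.jpg']

no_images = LimitImagesPipeline("image_url", "image_url_download", limit=0)
print(no_images.process_item(dict(item), spider=None)["image_url_download"])
# []

first_two = LimitImagesPipeline("image_url", "image_url_download", limit=2)
print(first_two.process_item(dict(item), spider=None)["image_url_download"])
# ['a.jpg', 'b.jpg']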
@@ -201,7 +201,8 @@ ITEM_PIPELINES = {
"scrapy_extensions.ValidatePipeline": 200,
"board_game_scraper.pipelines.ResolveLabelPipeline": 300,
"board_game_scraper.pipelines.ResolveImagePipeline": 400,
"scrapy.pipelines.images.ImagesPipeline": None,
"board_game_scraper.pipelines.LimitImagesPipeline": 500,
"scrapy.pipelines.images.ImagesPipeline": 600,
"scrapy.pipelines.images.FilesPipeline": None,
}
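Scrapy runs item pipelines in ascending order of their priority value, and a value of None (as for FilesPipeline above) disables a pipeline entirely. The limiter at 500 therefore always processes an item before the downloader at 600, which is what makes the copy-then-download scheme work. An illustration, with the names taken from the dict above:

pipelines = {
    "scrapy_extensions.ValidatePipeline": 200,
    "board_game_scraper.pipelines.ResolveLabelPipeline": 300,
    "board_game_scraper.pipelines.ResolveImagePipeline": 400,
    "board_game_scraper.pipelines.LimitImagesPipeline": 500,
    "scrapy.pipelines.images.ImagesPipeline": 600,
}

# items flow through the enabled pipelines from lowest to highest priority
for name, priority in sorted(pipelines.items(), key=lambda kv: kv[1]):
    print(priority, name)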
@@ -264,12 +265,16 @@ DONT_RUN_BEFORE_DATE = os.getenv("DONT_RUN_BEFORE_DATE")
MEDIA_ALLOW_REDIRECTS = True
+# LimitImagesPipeline
+LIMIT_IMAGES_TO_DOWNLOAD = 0
+LIMIT_IMAGES_URLS_FIELD = "image_url"
# Image processing
IMAGES_STORE = os.path.join(BASE_DIR, "images")
-IMAGES_URLS_FIELD = "image_url"
+IMAGES_URLS_FIELD = "image_url_download"
IMAGES_RESULT_FIELD = "image_file"
-IMAGES_EXPIRES = 180
-IMAGES_THUMBS = {"thumb": (1024, 1024)}
+IMAGES_EXPIRES = 360
+# IMAGES_THUMBS = {"thumb": (1024, 1024)}
# File processing
FILES_STORE = os.path.join(BASE_DIR, "rules")
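Taken together, these settings define the image flow: LimitImagesPipeline reads LIMIT_IMAGES_URLS_FIELD ("image_url"), writes the capped list into IMAGES_URLS_FIELD ("image_url_download"), and the stock ImagesPipeline downloads from that field and records its results in IMAGES_RESULT_FIELD ("image_file"). A hypothetical trace of a single item under a limit of 2:

item = {"image_url": ["a.jpg", "b.jpg", "c.jpg"]}  # as scraped by a spider

# LimitImagesPipeline (priority 500) copies at most two URLs across
item["image_url_download"] = item["image_url"][:2]

# ImagesPipeline (priority 600) then fetches only those two URLs and would
# store its result dicts (url, path, checksum) under item["image_file"]
print(item["image_url_download"])  # ['a.jpg', 'b.jpg']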
@@ -2,6 +2,8 @@
""" Board Game Atlas spider """
+import os
from functools import partial
from itertools import chain
from urllib.parse import urlencode
@@ -53,6 +55,8 @@ class BgaSpider(Spider):
"DOWNLOAD_DELAY": 30,
"CONCURRENT_REQUESTS_PER_DOMAIN": 4,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGA"))
or 0,
}
@classmethod
@@ -2,6 +2,7 @@
""" BoardGameGeek spider """
+import os
import re
import statistics
@@ -120,6 +121,8 @@ class BggSpider(Spider):
"DELAYED_RETRY_DELAY": 5.0,
"AUTOTHROTTLE_HTTP_CODES": (429, 503, 504),
"PULL_QUEUE_ENABLED": True,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGG"))
or 0,
}
scrape_ratings = False
@@ -2,9 +2,11 @@
""" DBpedia spider """
+import os
from urllib.parse import urlencode
-from pytility import batchify, normalize_space
+from pytility import batchify, parse_int, normalize_space
from scrapy import Request, Spider
from scrapy.utils.misc import arg_to_iter
@@ -50,6 +52,10 @@ class DBpediaSpider(Spider):
"DOWNLOAD_DELAY": 20,
"CONCURRENT_REQUESTS_PER_DOMAIN": 4,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_DBPEDIA")
)
or 0,
}
game_types = (
@@ -2,9 +2,11 @@
""" Luding spider """
+import os
import re
import string
+from pytility import parse_int
from scrapy import Spider
from scrapy.utils.misc import arg_to_iter
@@ -28,6 +30,10 @@ class LudingSpider(Spider):
"DOWNLOAD_DELAY": 2,
"CONCURRENT_REQUESTS_PER_DOMAIN": 8,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_LUDING")
)
or 0,
}
def parse(self, response):
@@ -2,9 +2,10 @@
""" Spielen.de spider """
+import os
import re
-from pytility import clear_list
+from pytility import clear_list, parse_int
from scrapy import Spider
from ..items import GameItem
@@ -48,6 +49,10 @@ class SpielenSpider(Spider):
"DOWNLOAD_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 1,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_SPIELEN")
)
or 0,
}
def parse(self, response):
@@ -3,10 +3,11 @@
""" Wikidata spider """
import json
+import os
from urllib.parse import urlencode
-from pytility import batchify, normalize_space
+from pytility import batchify, normalize_space, parse_int
from scrapy import Request, Spider
from scrapy.loader.processors import MapCompose
from scrapy.utils.misc import arg_to_iter
@@ -36,6 +37,10 @@ class WikidataSpider(Spider):
"DOWNLOAD_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 4,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_WIKIDATA")
)
or 0,
}
game_types = (