Verified Commit faecf492 authored by Markus Shepherd

first version

parent 4adb022d
@@ -170,6 +170,7 @@ class GameItem(TypedItem):
        serializer=JSON_SERIALIZER,
        parser=parse_json,
    )
    image_url_download = Field(serializer=JSON_SERIALIZER, parser=parse_json)
    image_file = Field(serializer=JSON_SERIALIZER, parser=parse_json)
    video_url = Field(
        dtype=list,
@@ -508,6 +509,7 @@ class UserItem(TypedItem):
        serializer=JSON_SERIALIZER,
        parser=parse_json,
    )
    image_url_download = Field(serializer=JSON_SERIALIZER, parser=parse_json)
    image_file = Field(serializer=JSON_SERIALIZER, parser=parse_json)
    published_at = Field(
@@ -6,7 +6,9 @@ import logging
import math
import re
from itertools import islice
from urllib.parse import quote, unquote_plus
from typing import Optional
import jmespath
@@ -179,3 +181,53 @@ class ResolveImagePipeline:
        if item.get(field):
            item[field] = clear_list(map(self._parse_url, arg_to_iter(item[field])))
    return item

class LimitImagesPipeline:
    """Copy a limited number of image URLs from a source field to a target field."""

    source_field: str
    target_field: str
    limit: Optional[int] = None

    @classmethod
    def from_crawler(cls, crawler):
        """Initialise the pipeline from the crawler settings."""
        source_field = crawler.settings.get("LIMIT_IMAGES_URLS_FIELD")
        target_field = crawler.settings.get("IMAGES_URLS_FIELD")
        if not source_field or not target_field:
            raise NotConfigured
        # Note: getint() falls back to 0 when the setting is absent, so the
        # limit=None branch below is only reachable by constructing directly.
        limit = crawler.settings.getint("LIMIT_IMAGES_TO_DOWNLOAD")
        return cls(
            source_field=source_field,
            target_field=target_field,
            limit=limit,
        )

    def __init__(
        self, source_field: str, target_field: str, limit: Optional[int] = None
    ):
        self.source_field = source_field
        self.target_field = target_field
        self.limit = limit

    def process_item(self, item, spider):
        """Copy at most `limit` image URLs from the source to the target field."""
        if self.limit is None:  # no limit: copy everything through
            item[self.target_field] = list(arg_to_iter(item.get(self.source_field)))
            return item

        if not self.limit:  # limit is zero: download no images
            item[self.target_field] = []
            return item

        # positive limit: keep at most `limit` URLs
        item[self.target_field] = list(
            islice(arg_to_iter(item.get(self.source_field)), self.limit)
        )
        return item
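A quick behavioural sketch of the three limit cases (hypothetical, not part of the commit; plain dicts stand in for Scrapy items, and NotConfigured is assumed to be imported from scrapy.exceptions elsewhere in the module):

# Hypothetical sketch of the three limit cases, using plain dicts as items:
pipeline = LimitImagesPipeline(
    source_field="image_url", target_field="image_url_download"
)
item = {"image_url": ["a.jpg", "b.jpg", "c.jpg"]}

# limit=None copies every URL through
assert pipeline.process_item(dict(item), None)["image_url_download"] == ["a.jpg", "b.jpg", "c.jpg"]

# limit=0 empties the download field
pipeline.limit = 0
assert pipeline.process_item(dict(item), None)["image_url_download"] == []

# a positive limit keeps only the first N URLs
pipeline.limit = 1
assert pipeline.process_item(dict(item), None)["image_url_download"] == ["a.jpg"]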
@@ -201,7 +201,8 @@ ITEM_PIPELINES = {
"scrapy_extensions.ValidatePipeline": 200,
"board_game_scraper.pipelines.ResolveLabelPipeline": 300,
"board_game_scraper.pipelines.ResolveImagePipeline": 400,
"scrapy.pipelines.images.ImagesPipeline": None,
"board_game_scraper.pipelines.LimitImagesPipeline": 500,
"scrapy.pipelines.images.ImagesPipeline": 600,
"scrapy.pipelines.images.FilesPipeline": None,
}
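Note: Scrapy runs item pipelines in ascending priority order, so LimitImagesPipeline (500) fills the download field before ImagesPipeline (600) reads it; mapping a pipeline to None, as with FilesPipeline above, disables it.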
@@ -264,12 +265,16 @@ DONT_RUN_BEFORE_DATE = os.getenv("DONT_RUN_BEFORE_DATE")
MEDIA_ALLOW_REDIRECTS = True
# LimitImagesPipeline
LIMIT_IMAGES_TO_DOWNLOAD = 0
LIMIT_IMAGES_URLS_FIELD = "image_url"
# Image processing
IMAGES_STORE = os.path.join(BASE_DIR, "images")
IMAGES_URLS_FIELD = "image_url"
IMAGES_URLS_FIELD = "image_url_dowload"
IMAGES_RESULT_FIELD = "image_file"
IMAGES_EXPIRES = 180
IMAGES_THUMBS = {"thumb": (1024, 1024)}
IMAGES_EXPIRES = 360
# IMAGES_THUMBS = {"thumb": (1024, 1024)}
# File processing
FILES_STORE = os.path.join(BASE_DIR, "rules")
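Taken together, these settings form a chain: LimitImagesPipeline reads URLs from image_url (LIMIT_IMAGES_URLS_FIELD) and copies at most LIMIT_IMAGES_TO_DOWNLOAD of them into image_url_download (IMAGES_URLS_FIELD), which ImagesPipeline then downloads, writing the results to image_file (IMAGES_RESULT_FIELD). With the global limit at 0, no images are downloaded unless a spider overrides it.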
@@ -120,6 +120,7 @@ class BggSpider(Spider):
"DELAYED_RETRY_DELAY": 5.0,
"AUTOTHROTTLE_HTTP_CODES": (429, 503, 504),
"PULL_QUEUE_ENABLED": True,
"LIMIT_IMAGES_TO_DOWNLOAD": 1,
}
scrape_ratings = False
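Because custom_settings overrides the project settings per spider, BggSpider opts back in to image downloads with at most one image per item, while spiders that don't override the limit keep the global default of 0.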