Verified Commit a0bc3032 authored by Markus Shepherd's avatar Markus Shepherd 🙈
Browse files

Limit image downloads per spider

parent b3df378b
......@@ -16,3 +16,11 @@ PULL_QUEUE_INTERVAL=300
# AWS credentials if you need access to S3
AWS_ACCESS_KEY_ID=<access-key>
AWS_SECRET_ACCESS_KEY=<secret-access-key>
# limit downloaded images per game and spider
# use 0 to disable download; -1 for all images
LIMIT_IMAGES_TO_DOWNLOAD_BGA=0
LIMIT_IMAGES_TO_DOWNLOAD_BGG=0
LIMIT_IMAGES_TO_DOWNLOAD_DBPEDIA=0
LIMIT_IMAGES_TO_DOWNLOAD_LUDING=0
LIMIT_IMAGES_TO_DOWNLOAD_SPIELEN=0
LIMIT_IMAGES_TO_DOWNLOAD_WIKIDATA=0
......@@ -215,6 +215,7 @@ class LimitImagesPipeline:
self.target_field = target_field
self.limit = limit
# pylint: disable=unused-argument
def process_item(self, item, spider):
"""Copy a limited number of image URLs to be downloaded from source to target."""
......
......@@ -2,6 +2,8 @@
""" Board Game Atlas spider """
import os
from functools import partial
from itertools import chain
from urllib.parse import urlencode
......@@ -53,6 +55,8 @@ class BgaSpider(Spider):
"DOWNLOAD_DELAY": 30,
"CONCURRENT_REQUESTS_PER_DOMAIN": 4,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGA"))
or 0,
}
@classmethod
......
......@@ -2,6 +2,7 @@
""" BoardGameGeek spider """
import os
import re
import statistics
......@@ -120,7 +121,8 @@ class BggSpider(Spider):
"DELAYED_RETRY_DELAY": 5.0,
"AUTOTHROTTLE_HTTP_CODES": (429, 503, 504),
"PULL_QUEUE_ENABLED": True,
"LIMIT_IMAGES_TO_DOWNLOAD": 1,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_BGG"))
or 0,
}
scrape_ratings = False
......
......@@ -2,9 +2,11 @@
""" DBpedia spider """
import os
from urllib.parse import urlencode
from pytility import batchify, normalize_space
from pytility import batchify, parse_int, normalize_space
from scrapy import Request, Spider
from scrapy.utils.misc import arg_to_iter
......@@ -50,6 +52,10 @@ class DBpediaSpider(Spider):
"DOWNLOAD_DELAY": 20,
"CONCURRENT_REQUESTS_PER_DOMAIN": 4,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_DBPEDIA")
)
or 0,
}
game_types = (
......
......@@ -2,9 +2,11 @@
""" Luding spider """
import os
import re
import string
from pytility import parse_int
from scrapy import Spider
from scrapy.utils.misc import arg_to_iter
......@@ -28,6 +30,10 @@ class LudingSpider(Spider):
"DOWNLOAD_DELAY": 2,
"CONCURRENT_REQUESTS_PER_DOMAIN": 8,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_LUDING")
)
or 0,
}
def parse(self, response):
......
......@@ -2,9 +2,10 @@
""" Spielen.de spider """
import os
import re
from pytility import clear_list
from pytility import clear_list, parse_int
from scrapy import Spider
from ..items import GameItem
......@@ -48,6 +49,10 @@ class SpielenSpider(Spider):
"DOWNLOAD_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 1,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_SPIELEN")
)
or 0,
}
def parse(self, response):
......
......@@ -3,10 +3,11 @@
""" Wikidata spider """
import json
import os
from urllib.parse import urlencode
from pytility import batchify, normalize_space
from pytility import batchify, normalize_space, parse_int
from scrapy import Request, Spider
from scrapy.loader.processors import MapCompose
from scrapy.utils.misc import arg_to_iter
......@@ -36,6 +37,10 @@ class WikidataSpider(Spider):
"DOWNLOAD_DELAY": 10,
"CONCURRENT_REQUESTS_PER_DOMAIN": 4,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 2,
"LIMIT_IMAGES_TO_DOWNLOAD": parse_int(
os.getenv("LIMIT_IMAGES_TO_DOWNLOAD_WIKIDATA")
)
or 0,
}
game_types = (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment