Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Switch to GitLab Next
Sign in / Register
Toggle navigation
Open sidebar
Recommend.Games
Board Game Scraper
Commits
faecf492
Verified
Commit
faecf492
authored
Oct 27, 2020
by
Markus Shepherd
🙈
Browse files
first version
parent
4adb022d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
64 additions
and
4 deletions
+64
-4
board_game_scraper/items.py
board_game_scraper/items.py
+2
-0
board_game_scraper/pipelines.py
board_game_scraper/pipelines.py
+52
-0
board_game_scraper/settings.py
board_game_scraper/settings.py
+9
-4
board_game_scraper/spiders/bgg.py
board_game_scraper/spiders/bgg.py
+1
-0
No files found.
board_game_scraper/items.py
View file @
faecf492
...
...
@@ -170,6 +170,7 @@ class GameItem(TypedItem):
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
,
)
image_url_dowload
=
Field
(
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
)
image_file
=
Field
(
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
)
video_url
=
Field
(
dtype
=
list
,
...
...
@@ -508,6 +509,7 @@ class UserItem(TypedItem):
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
,
)
image_url_dowload
=
Field
(
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
)
image_file
=
Field
(
serializer
=
JSON_SERIALIZER
,
parser
=
parse_json
)
published_at
=
Field
(
...
...
board_game_scraper/pipelines.py
View file @
faecf492
...
...
@@ -6,7 +6,9 @@ import logging
import
math
import
re
from
itertools
import
islice
from
urllib.parse
import
quote
,
unquote_plus
from
typing
import
Optional
import
jmespath
...
...
@@ -179,3 +181,53 @@ class ResolveImagePipeline:
if
item
.
get
(
field
):
item
[
field
]
=
clear_list
(
map
(
self
.
_parse_url
,
arg_to_iter
(
item
[
field
])))
return
item
class LimitImagesPipeline:
    """Copy at most ``limit`` image URLs from one item field to another.

    Sits in front of Scrapy's ``ImagesPipeline``: it reads the raw URL list
    from ``source_field`` (setting ``LIMIT_IMAGES_URLS_FIELD``) and writes a
    truncated copy to ``target_field`` (setting ``IMAGES_URLS_FIELD``), so the
    downloader only fetches the first ``limit`` images.

    ``limit`` semantics:
    * ``None`` — copy every URL through unchanged,
    * ``0`` — copy nothing (empty list),
    * positive ``n`` — copy the first ``n`` URLs.
    """

    source_field: str
    target_field: str
    limit: Optional[int] = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from crawler settings.

        Raises:
            NotConfigured: if either of the two field-name settings is unset,
                which disables this pipeline.
        """
        source_field = crawler.settings.get("LIMIT_IMAGES_URLS_FIELD")
        target_field = crawler.settings.get("IMAGES_URLS_FIELD")

        if not source_field or not target_field:
            raise NotConfigured

        limit = crawler.settings.getint("LIMIT_IMAGES_TO_DOWNLOAD")

        return cls(
            source_field=source_field,
            target_field=target_field,
            limit=limit,
        )

    def __init__(self, source_field: str, target_field: str, limit: Optional[int] = None):
        self.source_field = source_field
        self.target_field = target_field
        self.limit = limit

    def process_item(self, item, spider):
        """Write the (possibly truncated) URL list to the target field.

        ``islice(it, None)`` yields everything and ``islice(it, 0)`` yields
        nothing, so one call covers all three ``limit`` cases described on
        the class.
        """
        urls = arg_to_iter(item.get(self.source_field))
        item[self.target_field] = list(islice(urls, self.limit))
        return item
board_game_scraper/settings.py
View file @
faecf492
...
...
@@ -201,7 +201,8 @@ ITEM_PIPELINES = {
"scrapy_extensions.ValidatePipeline"
:
200
,
"board_game_scraper.pipelines.ResolveLabelPipeline"
:
300
,
"board_game_scraper.pipelines.ResolveImagePipeline"
:
400
,
"scrapy.pipelines.images.ImagesPipeline"
:
None
,
"board_game_scraper.pipelines.LimitImagesPipeline"
:
500
,
"scrapy.pipelines.images.ImagesPipeline"
:
600
,
"scrapy.pipelines.images.FilesPipeline"
:
None
,
}
...
...
@@ -264,12 +265,16 @@ DONT_RUN_BEFORE_DATE = os.getenv("DONT_RUN_BEFORE_DATE")
MEDIA_ALLOW_REDIRECTS
=
True
# LimitImagesPipeline
LIMIT_IMAGES_TO_DOWNLOAD
=
0
LIMIT_IMAGES_URLS_FIELD
=
"image_url"
# Image processing
IMAGES_STORE
=
os
.
path
.
join
(
BASE_DIR
,
"images"
)
IMAGES_URLS_FIELD
=
"image_url"
IMAGES_URLS_FIELD
=
"image_url
_dowload
"
IMAGES_RESULT_FIELD
=
"image_file"
IMAGES_EXPIRES
=
18
0
IMAGES_THUMBS
=
{
"thumb"
:
(
1024
,
1024
)}
IMAGES_EXPIRES
=
36
0
#
IMAGES_THUMBS = {"thumb": (1024, 1024)}
# File processing
FILES_STORE
=
os
.
path
.
join
(
BASE_DIR
,
"rules"
)
...
...
board_game_scraper/spiders/bgg.py
View file @
faecf492
...
...
@@ -120,6 +120,7 @@ class BggSpider(Spider):
"DELAYED_RETRY_DELAY"
:
5.0
,
"AUTOTHROTTLE_HTTP_CODES"
:
(
429
,
503
,
504
),
"PULL_QUEUE_ENABLED"
:
True
,
"LIMIT_IMAGES_TO_DOWNLOAD"
:
1
,
}
scrape_ratings
=
False
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment