Scraping Amazon books

import requests
import re
from bs4 import BeautifulSoup
import IPython  # debug helper: drop into a shell with IPython.embed()
import time
import random

# TODO verify this before running
LAST_PAGE = 15
URL = 'https://www.amazon.com.br/s/ref=sr_pg_{0}?fst=as%3Aoff&rh=n%3A6740748011%2Cp_28%3Adevops%2Cp_n_feature_nine_browse-bin%3A8529758011&page={0}&bbn=6740748011&sort=relevanceexprank&unfiltered=1&ie=UTF8'
BOOKS_PER_PAGE = 16


def url_for(page_number):
    # The template uses the positional field {0} twice, so one argument fills both.
    return URL.format(page_number)


class AccessError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


class Visitor:
    def __init__(self):
        self.years = []
        self.last_page = LAST_PAGE  # could be discovered automatically

    def visit_pages(self):
        for page_number in range(1, self.last_page + 1):
            self.visit(page_number)
            # Pause between pages to avoid being rate-limited.
            time_to_sleep = 15 + random.randint(0, 9)
            time.sleep(time_to_sleep)

    def visit(self, page_number):
        print('\nVisiting page', page_number)
        url = url_for(page_number)
        print('Requesting', url)
        attempt = 1
        keep_trying = True
        while keep_trying:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            try:
                self.parse(soup, page_number)
                keep_trying = False
            except AccessError:
                # Back off a little longer on each failed attempt.
                print('Trying again...')
                time_to_sleep = 30 + attempt * 2 + random.randint(0, 9)
                time.sleep(time_to_sleep)
                attempt += 1

    def parse(self, soup, page_number):
        # Dates such as "12 jan 2018" appear once per listing; capture the year.
        years_in_page = re.findall(r'\d{1,2} \w{3} (\d{4})', str(soup))
        print('years of page %d = %s' % (page_number, years_in_page))
        # A full page should yield BOOKS_PER_PAGE years and the last page at
        # least one; anything else suggests Amazon served a block page.
        if (page_number != self.last_page and len(years_in_page) != BOOKS_PER_PAGE
                or page_number == self.last_page and len(years_in_page) == 0):
            print("Yikes, it seems Amazon blocked us!")
            raise AccessError('Blocked by Amazon')
        self.years.extend(years_in_page)


def main():
    visitor = Visitor()
    visitor.visit_pages()
    print("\nAll the years:")
    print(visitor.years)


if __name__ == "__main__":
    main()
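
Running the script against all 15 result pages prints the publication years collected from the listings: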
All the years:
['2018', '2013', '2012', '2016', '2018', '2018', '2013', '2017', '2014', '2018', '2016', '2016', '2016', '2016', '2018', '2018', '2018', '2015', '2016', '2017', '2018', '2018', '2017', '2015', '2016', '2014', '2018', '2017', '2018', '2017', '2016', '2018', '2018', '2018', '2015', '2015', '2017', '2018', '2018', '2018', '2015', '2018', '2016', '2018', '2014', '2019', '2018', '2018', '2017', '2018', '2018', '2017', '2016', '2017', '2017', '2015', '2018', '2017', '2017', '2017', '2017', '2017', '2016', '2016', '2016', '2015', '2015', '2014', '2012', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2019', '2018', '2019', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2017', '2017', '2017', '2017', '2019', '2017', '2017', '2017', '2018', '2017', '2017', '2017', '2017', '2017', '2017', '2017', '2017', '2016', '2016', '2017', '2017', '2017', '2016', '2016', '2016', '2017', '2017', '2016', '2015', '2016', '2016', '2015', '2015', '2015', '2016', '2015', '2015', '2014', '2015', '2015', '2014', '2012', '2012', '2018', '2017', '2018', '2018', '2016', '2012', '2018', '2019', '2017', '2017', '2016', '2018', '2018', '2017', '2017', '2016', '2016', '2011', '2018', '2018', '2018', '2018', '2018', '2017', '2018', '2016', '2018', '2018', '2017', '2016', '2016', '2016', '2015', '2015', '2019', '2019', '2018', '2017', '2014', '2012', '2016', '2011', '2018', '2016', '2015', '2017', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2018', '2019', '2019', '2016']
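
Dependencies listed with the snippet (presumably its requirements.txt):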
pandas
matplotlib
beautifulsoup4
requests
ipython
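
pandas and matplotlib appear in the dependencies but not in the scraper itself, presumably for analyzing the collected years afterwards. A minimal sketch of that step, assuming a `years` list like the output above (the list literal here is truncated for illustration; in practice you would pass `visitor.years`):

import pandas as pd
import matplotlib.pyplot as plt

# Illustrative placeholder; use the full list printed by the scraper.
years = ['2018', '2013', '2012', '2016', '2018']

# Count books per publication year and plot them in chronological order.
counts = pd.Series(years).astype(int).value_counts().sort_index()
counts.plot(kind='bar', title='DevOps books on Amazon.com.br by publication year')
plt.xlabel('Year')
plt.ylabel('Number of books')
plt.tight_layout()
plt.show()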