Commit 30d65956 authored by Pedro Vale Ramos

Gets all auctions with all information for every auction

parent a4749cb9
import time
from bs4 import BeautifulSoup
from auction_parsing.connection import get_request
from auction_parsing.controller import parse_auction
URL_INITIAL_SEARCH = f'https://subastas.boe.es/subastas_ava.php?campo%5B0%5D=SUBASTA.ORIGEN&dato%5B0%5D=&campo%5B1%5D=SUBASTA.ESTADO&dato%5B1%5D=&campo%5B2%5D=BIEN.TIPO&dato%5B2%5D=&campo%5B4%5D=BIEN.DIRECCION&dato%5B4%5D=&campo%5B5%5D=BIEN.CODPOSTAL&dato%5B5%5D=&campo%5B6%5D=BIEN.LOCALIDAD&dato%5B6%5D=&campo%5B7%5D=BIEN.COD_PROVINCIA&dato%5B7%5D=&campo%5B8%5D=SUBASTA.POSTURA_MINIMA_MINIMA_LOTES&dato%5B8%5D=&campo%5B9%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_1&dato%5B9%5D=&campo%5B10%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_2&dato%5B10%5D=&campo%5B11%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_3&dato%5B11%5D=&campo%5B12%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_4&dato%5B12%5D=&campo%5B13%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_5&dato%5B13%5D=&campo%5B14%5D=SUBASTA.ID_SUBASTA_BUSCAR&dato%5B14%5D=&campo%5B15%5D=SUBASTA.FECHA_FIN_YMD&dato%5B15%5D%5B0%5D=&dato%5B15%5D%5B1%5D=&campo%5B16%5D=SUBASTA.FECHA_INICIO_YMD&dato%5B16%5D%5B0%5D=&dato%5B16%5D%5B1%5D=&page_hits=40&sort_field%5B0%5D=SUBASTA.FECHA_FIN_YMD&sort_order%5B0%5D=desc&sort_field%5B1%5D=SUBASTA.FECHA_FIN_YMD&sort_order%5B1%5D=asc&sort_field%5B2%5D=SUBASTA.HORA_FIN&sort_order%5B2%5D=asc&accion=Buscar'
def search_all_auctions():
def search_all_auctions(auction_page_callback):
'''
Main method to search for auctions.
:return: list of dictionaries. Each dictionary contains basic auction data
......@@ -19,7 +23,9 @@ def search_all_auctions():
page_soup = get_search_page_soup(url_search=next_page_url)
next_page_url = get_next_page_url(current_page_soup=page_soup)
auctions.extend(get_auctions_in_page(soup=page_soup))
auctions_page = get_auctions_in_page(soup=page_soup)
auction_page_callback(auctions_page)
auctions.extend(auctions_page)
return auctions
......@@ -28,27 +34,28 @@ def get_auctions_in_page(soup: BeautifulSoup):
for auction_html in soup.findAll('li', {'class': 'resultado-busqueda'}):
# print(auction_html)
auction_properties = auction_html.findAll('p', {'class': 'epigrafeDpto'})
auction_data = dict()
auction_data['auction_id'] = auction_properties[0].text.strip().split(' ')[1].strip()
if len(auction_properties) > 1:
auction_data['auctioneer'] = auction_properties[1].text.strip()
if len(auction_properties) > 2:
auction_data['case'] = auction_properties[2].text.strip().split(' ')[1]
if len(auction_properties) > 3:
auction_data['auction_state'] = auction_properties[3].text.strip()
auction_id = auction_properties[0].text.strip().split(' ')[1].strip()
time.sleep(2)
auction_data = parse_auction(auction_id)
print(auction_data)
# if len(auction_properties) > 1:
# auction_data['auctioneer'] = auction_properties[1].text.strip()
# if len(auction_properties) > 2:
# auction_data['case'] = auction_properties[2].text.strip().split(' ')[1]
# if len(auction_properties) > 3:
# auction_data['auction_state'] = auction_properties[3].text.strip()
auction_data['asset_description'] = auction_html.find('p', {'class': 'documento'}).text.strip()
# auction_data['asset_description'] = auction_html.find('p', {'class': 'documento'}).text.strip()
auctions.append(auction_data)
return auctions
def get_search_page_soup(url_search):
    '''
    Download a search-results page and parse it.
    :param url_search: URL of the search page to request
    :return: BeautifulSoup object built with 'html.parser' from the get_request response
    '''
    # Log the URL first so progress is visible even if the request fails.
    print(f'Searching: {url_search}')
    page_html = get_request(url=url_search)
    return BeautifulSoup(page_html, 'html.parser')
def get_total_number_of_pages(soup: BeautifulSoup):
n_max_pages = 1
for page in soup.findAll('span', {'class': 'pagSigxxx'}):
......@@ -68,8 +75,3 @@ def get_next_page_url(current_page_soup: BeautifulSoup):
url_next_page = f'https://subastas.boe.es/{href}'
return url_next_page
return None
......@@ -5,11 +5,17 @@ from auction_querying.search_auctions import search_all_auctions
def main():
auctions = search_all_auctions()
assert(len(auctions) == 97094)
auctions = search_all_auctions(process_auctions)
#assert(len(auctions) == 97094)
auction_data_json = json.dumps(auctions)
print(auction_data_json)
def process_auctions(auctions):
    '''
    Report a batch of parsed auctions and save it to auctions.json.
    :param auctions: JSON-serializable collection of auction records
    :return: None
    '''
    print(f'PARSED {len(auctions)} new auctions')
    # Serialize before opening the file, so a record that cannot be
    # serialized does not leave auctions.json truncated.
    auctions_json = json.dumps(auctions)
    # NOTE(review): mode 'w' replaces the file on every call, so if this is
    # invoked once per batch only the latest batch stays on disk - confirm
    # that this is intended.
    with open('auctions.json', 'w') as json_file:
        # json.dumps() takes no file argument; write the serialized text.
        json_file.write(auctions_json)
if __name__ == '__main__':
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment