Commit b66311d9 authored by Nathan's avatar Nathan 🚴

adds selenium talk

parent 53ab5ea5
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Basic Selenium Usage"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"chromedriver = \"/Users/nathan.cheever/Desktop/chromedriver\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"driver = webdriver.Chrome(chromedriver)\n",
"driver.get(\"http://www.python.org\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome to Python.org\n"
]
}
],
"source": [
"assert \"Python\" in driver.title\n",
"print(driver.title)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"elem = driver.find_element_by_name(\"q\")\n",
"elem.clear()\n",
"elem.send_keys(\"pycon\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"elem.send_keys(Keys.RETURN)\n",
"assert \"No results found.\" not in driver.page_source"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"driver.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### My first attempt\n",
"\n",
"Website we're visiting: https://nces.ed.gov/collegenavigator/"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.support.ui import Select\n",
"chromedriver = \"/Users/nathan.cheever/Desktop/chromedriver\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"driver = webdriver.Chrome(chromedriver)\n",
"driver.get(\"https://nces.ed.gov/collegenavigator/\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After doing some research on each element's unique identifier, I selected some criteria"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Let's just select the schools in New York (because I love New York!)\n",
"state_dropdown = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')\n",
"state_select = Select(state_dropdown)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"state_select.deselect_all()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"state_select.select_by_visible_text(\"New York\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Only selecting schools with undergrad and grad options\n",
"gradschools = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkGrad')\n",
"undergrad_schools = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkBach')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# simulate a click!\n",
"gradschools.click()\n",
"undergrad_schools.click()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Public schools\n",
"public_sch = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic')\n",
"public_sch.click()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Show Results\n",
"show_results = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_btnSearch')\n",
"show_results.click()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Click on \"Export Results\"\n",
"export = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_divExport')\n",
"export.click()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Click the CSV option output\n",
"driver.find_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV').click()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# click the final export button --> this will download the file to our specified directory\n",
"driver.find_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData').click()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"driver.quit()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Now as a \"script\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.support.ui import Select\n",
"chromedriver = \"/Users/nathan.cheever/Desktop/chromedriver\"\n",
"\n",
"driver = webdriver.Chrome(chromedriver)\n",
"driver.get(\"https://nces.ed.gov/collegenavigator/\")\n",
"\n",
"# Let's just select the schools in New York (because I love New York!)\n",
"state_dropdown = driver.find_element_by_id(\n",
" 'ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')\n",
"state_select = Select(state_dropdown)\n",
"\n",
"state_select.deselect_all()\n",
"state_select.select_by_visible_text(\"New York\")\n",
"\n",
"# Only selecting schools with undergrad and grad options\n",
"gradschools = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkGrad')\n",
"undergrad_schools = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkBach')\n",
"# simulate a click!\n",
"gradschools.click()\n",
"undergrad_schools.click()\n",
"\n",
"# Public schools\n",
"public_sch = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic')\n",
"public_sch.click()\n",
"\n",
"# Show Results\n",
"show_results = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_btnSearch')\n",
"show_results.click()\n",
"# Click on \"Export Results\"\n",
"export = driver.find_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_divExport')\n",
"export.click()\n",
"# Click the CSV option output\n",
"driver.find_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV').click()\n",
"# click the final export button --> this will download the file to our specified directory\n",
"driver.find_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData').click()\n",
"driver.quit()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"File download demo:\n",
"https://www.thinkbroadband.com/download"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Waiting for download to begin...\n",
"9 tries left\n",
"Waiting for download to begin...\n",
"8 tries left\n",
"Waiting for download to begin...\n",
"7 tries left\n",
"Waiting for download to begin...\n",
"6 tries left\n",
"Waiting for download to begin...\n",
"5 tries left\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"['/Users/nathan.cheever/Downloads/Unconfirmed 900828.crdownload'] is downloading. Waiting...\n",
"Our file is downloaded!\n"
]
},
{
"data": {
"text/plain": [
"['/Users/nathan.cheever/Downloads/100MB.zip']"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wait_for_download_to_complete(file_name='100MB.zip', directory='/Users/nathan.cheever/Downloads/')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now let's simulate a script with Requestium + file download waiter"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import os\n",
"import glob\n",
"\n",
"def wait_for_download_to_complete(file_name, directory, delay=2, tries_max=10):\n",
" \"\"\"Waits for a file to download before continuing execution.\n",
" \n",
" Args:\n",
" file_name: str, the name of the file to be downloaded including the extension\n",
" directory: str, the full path to the directory wherein the file will land\n",
" delay: int, how many seconds to wait before checking the file again\n",
" tries_max: int, how many attempts at checking a download is happening before quitting\n",
" \n",
" Returns:\n",
" bool: True if the file successfully downloaded, else False\n",
" \"\"\"\n",
" downloading_file = os.path.join(directory, \"Unconfirmed*.crdownload\")\n",
" finished_file = os.path.join(directory, file_name)\n",
" n_tries = 0\n",
" download_started = False\n",
" \n",
" while n_tries < tries_max:\n",
" \n",
" currently_downloading = glob.glob(downloading_file)\n",
" file_is_downloaded = glob.glob(finished_file)\n",
" \n",
" if currently_downloading and not file_is_downloaded:\n",
" download_started = True\n",
" time.sleep(delay)\n",
" print(f'{currently_downloading} is downloading. Waiting...')\n",
" \n",
" elif not currently_downloading and download_started and not file_is_downloaded:\n",
" raise ValueError(f\"File downloaded but was perhaps misnamed. No {finished_file} file found!\")\n",
" \n",
" elif file_is_downloaded:\n",
" print(f'Our file is downloaded!')\n",
" break\n",
" \n",
" else:\n",
" n_tries += 1\n",
" time.sleep(delay)\n",
" print(f'Waiting for download to begin...')\n",
" print(f'{tries_max - n_tries} tries left')\n",
"\n",
" return file_is_downloaded"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Waiting for download to begin...\n",
"9 tries left\n",
"Our file is downloaded!\n"
]
}
],
"source": [
"import requestium\n",
"chromedriver = \"/Users/nathan.cheever/Desktop/chromedriver\"\n",
"# Create the session\n",
"s = requestium.Session(webdriver_path=chromedriver, browser='chrome', default_timeout=15)\n",
"\n",
"# get the website\n",
"s.driver.get(\"https://nces.ed.gov/collegenavigator/\")\n",
"# Let's just select the schools in New York (because I love New York!)\n",
"state_dropdown = s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')\n",
"state_select = requestium.Select(state_dropdown)\n",
"state_select.deselect_all()\n",
"state_select.select_by_visible_text(\"New York\")\n",
"# Only selecting schools with undergrad and grad options\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkGrad').click()\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkBach').click()\n",
"# Public schools\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic').click()\n",
"# Show Results\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_btnSearch').click()\n",
"# Click on \"Export Results\"\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_divExport').click()\n",
"# Click the CSV option output\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV').click()\n",
"# click the final export button --> this will download the file to our specified directory\n",
"s.driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData').click()\n",
"\n",
"\n",
"# Wait for the file to download before closing the browser\n",
"wait_for_download_to_complete(file_name='CollegeNavigator_Search_*.csv', directory='/Users/nathan.cheever/Downloads/')\n",
"s.driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import requestium\n",
"# from pyvirtualdisplay import Display\n",
"\n",
"# chromedriver = \"/path/to/chromedriver\"\n",
"\n",
"# display = Display(visible=0, size=(1000, 1000))\n",
"# display.start()\n",
"\n",
"# session = requestium.Session(\n",
"# webdriver_path=chromedriver, browser='chrome', default_timeout=15, \n",
"# webdriver_options=webdriver_options)\n",
"\n",
"# session.driver.get(link)\n",
"\n",
"# # -- Do all your web interactions ...\n",
"\n",
"# session.driver.quit() # Stops the Chrome session\n",
"# display.sendstop() # Safely closes the virtualdisplay"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"front-matter": {
"date": "2018-02-18",
"slug": "etl_number_2",
"subtitle": "Using Selenium when you have to download stuff as part of your ETL process",
"title": "Selenium in your ETL pipeline"
},
"hugo-jupyter": {
"render-to": "content/posts/"
},
"kernelspec": {
"display_name": "rdemo",
"language": "python",
"name": "rdemo"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# Note: this is designed to run on Python 3.6
# -- Pre reqs:
# 1. Install Python (I like using Miniconda, version 3.6)
# 2. Install Git
# 3. Install a chromedriver
# 4. Install Chrome
# 5. Install Python dependencies
from pyvirtualdisplay import Display # For headless browsing
from selenium import webdriver
import requestium
import time
import glob
import os
import logging
from logging.handlers import RotatingFileHandler
# Rotating file handler: appends to log.txt, rolling over at 1 MiB and
# keeping up to 10 old log files.
file_handler = RotatingFileHandler(
    "log.txt",
    mode='a',
    maxBytes=1 * 1024 * 1024,
    backupCount=10,
)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(
    logging.Formatter('%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]')
)

# Module-level logger; INFO and above are written through the handler above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
class Browser(object):
def __init__(self, link, directory=None):
    """Set up a virtual display and a Chrome session, then open ``link``.

    Class for handling proper setup and teardown of a webdriven client for
    a remote environment.

    Args:
        link: str, URL to the page you want to visit
        directory: str, full path to an existing directory where downloaded
            files will land. When it is not a string naming an existing
            directory, no download preference is passed to the browser and
            ``self.directory`` falls back to ``'.'``.
    """
    # Keep the display on the instance: it is started below via
    # ``self.display`` (a plain local here made that call fail with
    # AttributeError).
    self.display = Display(visible=0, size=(1000, 1000))
    self.chromedriver = "/usr/local/bin/chromedriver"

    if isinstance(directory, str) and os.path.isdir(directory):
        self.webdriver_options = {'prefs': {'download.default_directory': directory}}
        self.directory = directory
    else:
        # NOTE(review): with no download preference the browser presumably
        # saves to its own default folder, which may not be '.' -- confirm
        # before relying on self.directory in this case.
        self.webdriver_options = None
        self.directory = '.'

    self.session = requestium.Session(
        webdriver_path=self.chromedriver,
        browser='chrome',
        default_timeout=15,
        webdriver_options=self.webdriver_options)

    self.display.start()
    self.session.driver.get(link)
    logger.info(f"(setup) Started display, session and visited {link}.")
def select_and_download_data(self, file_name='CollegeNavigator_Search_*.csv'):
    """Example for interacting with the NCES college website.

    Selects New York in the state list, ticks the graduate, bachelor's and
    public-school checkboxes, runs the search, exports the results as CSV
    and waits for the download.

    Args:
        file_name: str, name (or glob pattern) of the exported file to wait
            for inside ``self.directory``. Previously the method referenced
            an undefined ``file_`` variable and raised NameError.

    Returns:
        Whatever ``self.wait_for_download_to_complete`` returns for
        ``file_name``.
    """
    driver = self.session.driver

    # Let's just select the schools in New York (because I love New York!)
    states = driver.ensure_element_by_id('ctl00_cphCollegeNavBody_ucSearchMain_ucMapMain_lstState')
    state_select = requestium.Select(states)
    state_select.deselect_all()
    state_select.select_by_value("NY")

    # Elements to click, in order; each click depends on the previous one
    # having been applied to the page.
    click_sequence = (
        # Only selecting schools with undergrad and grad options
        "ctl00_cphCollegeNavBody_ucSearchMain_chkGrad",
        "ctl00_cphCollegeNavBody_ucSearchMain_chkBach",
        # Public schools
        "ctl00_cphCollegeNavBody_ucSearchMain_chkControlPublic",
        # Show Results
        "ctl00_cphCollegeNavBody_ucSearchMain_btnSearch",
        # Click on "Export Results"
        "ctl00_cphCollegeNavBody_ucFavoritesTop_divExport",
        # Click the CSV option output
        "ctl00_cphCollegeNavBody_ucFavoritesTop_rdbCSV",
        # Final export button --> this will download the file to our specified directory
        "ctl00_cphCollegeNavBody_ucFavoritesTop_aExportData",
    )
    for element_id in click_sequence:
        driver.ensure_element_by_id(element_id).click()

    return self.wait_for_download_to_complete(file_name=file_name)
def wait_for_download_to_complete(self, file_name, delay=2, tries_max=10):
"""Waits for a file to download before continuing execution.
Args:
file_name: str, the name of the file to be downloaded including the extension
delay: int, how many seconds to wait before checking the file again
tries_max: int, how many attempts at checking a download is happening before quitting
Returns:
bool: True if the file successfully downloaded, else False
"""
downloading_file = os.path.join(self.directory, "Unconfirmed*.crdownload")
finished_file = os.path.join(self.directory, file_name)
n_tries = 0
download_started = False
while n_tries < tries_max:
currently_downloading = glob.glob(downloading_file)
file_is_downloaded = glob.glob(finished_file)
# A file is downloading, but our expected file isn't there yet
if currently_downloading and not file_is_downloaded:
download_started = True
time.sleep(delay)
elif not currently_downloading and download_started and not file_is_downloaded:
raise ValueError(f"File downloaded but was perhaps misnamed. No {finished_file} file found!")