Commit 1c7a75d0 authored by Brandon

add whitelist for url checker

parent 9f3331f4
...@@ -11,6 +11,7 @@ import requests ...@@ -11,6 +11,7 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import validators import validators
whitelist = ["https://www.lbl.gov/disclaimers/"]
def get_url(this_page): def get_url(this_page):
"""Print out the URL """Print out the URL
...@@ -42,7 +43,12 @@ def check_url(page): ...@@ -42,7 +43,12 @@ def check_url(page):
if not validators.url(url): if not validators.url(url):
print("ERROR: INVALID URL") print("ERROR: INVALID URL")
try: try:
requests.get(url) if url in whitelist:
print("WHITELIST: {}".format(url))
else:
print(url)
requests.get(url)
except requests.exceptions.ConnectionError: except requests.exceptions.ConnectionError:
print("Bad URL: ", url) print("Bad URL: ", url)
raise raise
...@@ -66,7 +72,7 @@ def main(): ...@@ -66,7 +72,7 @@ def main():
for each_file in filenames: for each_file in filenames:
if each_file.endswith(".html"): if each_file.endswith(".html"):
filepath = root + os.sep + each_file filepath = root + os.sep + each_file
print(" ", filepath, "...", end=' ', flush=True) print(" ", filepath, "...")
filehandle = open(filepath, "r") filehandle = open(filepath, "r")
mypage = filehandle.read() mypage = filehandle.read()
page = str(BeautifulSoup(mypage, "html.parser")) page = str(BeautifulSoup(mypage, "html.parser"))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment