"""URL checking script

This script loops over all static HTML pages generated for the NERSC
documentation website and validates (i.e., resolves) every external URL on
every page. It exits with a nonzero status if any URL fails to resolve.

A cached list of known good URLs is supported via good-urls-cache.txt (see the
--goodurls option).
"""

import argparse
import os
import sys

import requests
import validators
from bs4 import BeautifulSoup

# Known good pages that do not need to be validated. More are appended to this
# list as the script crawls the docs website so that we do not re-validate the
# same pages.
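
# URLs that fail to resolve are collected here and reported in the summary;
# a non-empty badlist makes the script exit with status 1.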
badlist  = []
goodlist = ["https://www.lbl.gov/disclaimers",
            "https://science.energy.gov",
            "https://www.lbl.gov",
            "https://nxcloud01.nersc.gov",
            "http://epsi.pppl.gov/xgc-users/how-to-become-an-xgc-user",
            "https://stash.nersc.gov",
            "https://stash.nersc.gov:8443",
            "https://www.nersc.gov",
            "http://localhost",
            "https://localhost",
            "http://localhost:5000",
            "https://localhost:5000",
            "https://registry.services.nersc.gov",
            "https://rancher.spin.nersc.gov/v2-beta/projects/1a5/services/NotMyStack"]
skiplist = ["https://doi.org/"]

def get_url(this_page):
    """Print out the URL

    Found on StackOverflow: https://stackoverflow.com/a/15517610

    :param this_page: html of web page
    :return: urls in that page
    """

    # Validate only external URLs, not internal ones. (mkdocs can validate
    # internal links itself.) External URLs have the "http" prefix, whereas
    # internal links use relative paths.
    start_link = this_page.find('a href="http')
    if start_link == -1:
        return None, 0
    start_quote = this_page.find('"', start_link)
    end_quote = this_page.find('"', start_quote + 1)
    this_url = this_page[start_quote + 1: end_quote]
    return this_url, end_quote


def check_url(page):
    """Function that checks the validity of a URL."""
    while True:
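        # get_url() returns the first remaining external URL and the index of
        # its closing quote; trimming the page at that index steps the loop
        # through every external link on the page.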
        url_raw, end_quote = get_url(page)
        page = page[end_quote:]
        if url_raw:

            url = url_raw.rstrip("/")
            
            if any(suburl in url for suburl in skiplist):
                print("SKIP: {}".format(url))
                continue

            if not validators.url(url):
                print("INVALID: {}".format(url))
                continue
            
            try:
                if url not in goodlist:
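                    # Only reachability is tested here; an HTTP error status
                    # such as 404 is not treated as a failure.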
                    requests.get(url, timeout=60)
                    goodlist.append(url)
                    print("OK: {}".format(url))
                    
            except requests.exceptions.RequestException as ex:
                # Any request failure (connection error, timeout, and so on)
                # marks the URL as bad instead of aborting the whole run.
                print("BAD: {}".format(url))
                print("INFO:", ex)
                badlist.append(url)
        else:
            break


def main():
    """Loops over all documentation pages and checks validity of URLs."""
    parser = argparse.ArgumentParser(description="Validate some URLs.")

    parser.add_argument("doc_base_dir", metavar="doc_base_dir", type=str,
                        help="Base directory of NERSC documentation site.")

    parser.add_argument("--goodurls", type=str,
                        default="good-urls-cache.txt",
                        help="File with list of good urls (to skip)")

    args = parser.parse_args()

    if os.path.isfile(args.goodurls):
        global goodlist
        with open(args.goodurls) as f:
            goodlist += f.read().splitlines()
            print("read cached good urls from {}".format(args.goodurls))
    for url in goodlist:
        print("GOOD: {}".format(url))
    
    print("Checking pages for valid URLs ...")
    doc_root_dir = args.doc_base_dir
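    # Walk the rendered site tree and check every generated HTML page.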
    for root, dirs, filenames in os.walk(doc_root_dir):
        for each_file in filenames:
            if each_file.endswith(".html"):
                filepath = root + os.sep + each_file
                print("   ", filepath, "...")
                with open(filepath, "r") as filehandle:
                    mypage = filehandle.read()
                page = str(BeautifulSoup(mypage, "html.parser"))
                check_url(page)

    with open(args.goodurls, "w") as f:
        f.write('\n'.join(goodlist))

    print("SUMMARY:")
    if len(badlist) > 0:
        print("Failed urls:")
        for url in badlist:
            print(url)
        return 1
    else:
        print("No bad urls!")
        return 0

if __name__ == "__main__":
    sys.exit(main())