# crawl.jl — crawl a site starting from a base URL and write out a sitemap
# plus lists of broken, redirected, external, and failing URLs.
using HTTP

# The Sitemap module (providing write_sitemap) is assumed to be available,
# e.g. from a sitemap.jl next to this file:
#   include("sitemap.jl"); using .Sitemap

# Output file names. These are not defined in this file, so the defaults
# below are assumptions — adjust to match the surrounding project.
const out_404 = "urls_404.txt"
const out_3xx = "urls_3xx.txt"
const out_external = "urls_external.txt"
const out_fail = "urls_fail.txt"
const out_sitemap = "sitemap.xml"
struct CrawlData
    urls::Set{String}          # every internal URL discovered so far
    visited::Set{String}       # URLs already fetched
    urlsvalid::Set{String}     # URLs that answered HTTP 200
    urls404::Set{String}       # URLs that answered HTTP 404
    urls3xx::Set{String}       # URLs that answered with a redirect
    urlsexternal::Set{String}  # URLs outside the base URL
    urlsfail::Set{String}      # URLs with any other failure

    CrawlData() = new(Set{String}(), Set{String}(), Set{String}(), Set{String}(),
                      Set{String}(), Set{String}(), Set{String}())
end

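"""
    handlehref(data, base, url, href; forcehttps=false)

Classify a single `href` found on page `url`: queue it in `data.urls` when it
resolves under `base`, record it in `data.urlsexternal` otherwise. With
`forcehttps`, http:// and https:// links are compared ignoring the protocol.
"""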
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove the URL fragment (#…)
    href = match(r"^(?<path>[^#]*)", href)[:path]
    protocol_match = match(r"^(?<protocol>[a-zA-Z0-9]+)://", href)
    if protocol_match !== nothing
        href_protocol = protocol_match[:protocol]
        if href_protocol == "http" || href_protocol == "https"
            if forcehttps
                # Compare without the protocol so http:// and https:// links
                # to the same host both count as internal; + 3 skips "://"
                href_noprot = chop(href; head=length(href_protocol) + 3, tail=0)
                ismatch = startswith(href_noprot, basechop(base))
            else
                ismatch = startswith(href, base)
            end
            # Index URLs that start with the base URL
            if ismatch
                push!(data.urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(data.urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored protocol $href_protocol"
        end
    elseif startswith(href, "/")
        # Absolute path: resolve against the base URL (which ends in a slash)
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    else
        # Relative path: resolve against the current page URL
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end

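"""
    visit(data, url, base; forcehttps=false)

Fetch `url` without following redirects and record the outcome: 404s,
redirects (whose Location target is fed back through `handlehref`),
valid pages (scanned for further `href` attributes), or failures.
"""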
function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(data.urls3xx, url)

        for (name, value) in res.headers
            if name == "Location"
                @debug "Identified redirect $url to $value"
                handlehref(data, base, url, value; forcehttps=forcehttps)
                break
            end
        end
    elseif res.status == 200
        push!(data.urlsvalid, url)

        # Scan for new URLs on this page
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(data, base, url, href; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(data.urlsfail, url)
    end
    push!(data.visited, url)
end

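"""
    write_urls(filename, urls)

Write one URL per line to `filename`, overwriting any existing file.
"""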
function write_urls(filename, urls)
    open(filename, "w") do f
        for url in urls
            println(f, url)
        end
    end
end

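"""
    basechop(base)

Strip the leading http:// or https:// from `base`, e.g. turning
"https://example.com/" into "example.com/".
"""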
function basechop(base)
    if startswith(base, "http://")
        return chop(base; head=length("http://"), tail=0)
    elseif startswith(base, "https://")
        return chop(base; head=length("https://"), tail=0)
    else
        @error "Unexpected URL prefix in $base"
    end
end

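"""
    crawl_and_generate(base; forcehttps=false)

Crawl all pages reachable from `base` (which must end in a trailing slash),
then write the 404/3xx/external/fail URL lists and the sitemap.
"""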
function crawl_and_generate(base; forcehttps=false)
    m = match(r"^(?<protocol>https?)://[^/]+(?<port>:[0-9]+)?(?<trailslash>/)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end

    data = CrawlData()

    if forcehttps && startswith(base, "http://")
        # Rewrite the base to https:// so internal URLs are crawled over TLS
        base = "https://" * chop(base; head=length("http://"), tail=0)
    end

    push!(data.urls, base)

    while length(data.visited) != length(data.urls)
        @info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
        # Iterate over a snapshot: visit() adds to data.urls while we loop
        for url in setdiff(data.urls, data.visited)
            visit(data, url, base; forcehttps=forcehttps)
        end
    end

    @info "Identified $(length(data.urls)) urls, $(length(data.urls404)) 404 urls, $(length(data.urlsexternal)) external urls, $(length(data.urls3xx)) 3xx urls, $(length(data.urlsfail)) fail."

    @info "Clearing files …"
    rm(out_404; force=true)
    rm(out_3xx; force=true)
    rm(out_external; force=true)
    rm(out_fail; force=true)
    rm(out_sitemap; force=true)

    @info "Writing url files …"
    write_urls(out_404, data.urls404)
    write_urls(out_3xx, data.urls3xx)
    write_urls(out_external, data.urlsexternal)
    write_urls(out_fail, data.urlsfail)

    @info "Writing sitemap $out_sitemap …"
    Sitemap.write_sitemap(out_sitemap, data.urlsvalid)
end
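
# Hypothetical entry point — an assumption, since the original file defines no
# driver; the project may include() this file from a main script instead. The
# base URL, with trailing slash, is taken from the first command-line argument,
# and the --forcehttps flag name is invented for this sketch.
if abspath(PROGRAM_FILE) == @__FILE__
    crawl_and_generate(ARGS[1]; forcehttps=("--forcehttps" in ARGS))
end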