crawl.jl 4.73 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
"""
    CrawlData()

Mutable crawl state: sets of discovered and categorized URLs.
All sets hold URL strings; concrete `Set{String}` fields keep the
struct type-stable (untyped fields default to `Any`).
"""
struct CrawlData
    urls::Set{String}          # every in-scope URL discovered so far
    visited::Set{String}       # URLs already fetched (success or not)
    urlsvalid::Set{String}     # URLs that answered HTTP 200
    urls404::Set{String}       # URLs that answered HTTP 404
    urls3xx::Set{String}       # URLs that answered with a redirect
    urlsexternal::Set{String}  # URLs outside the crawl base
    urlsfail::Set{String}      # URLs with any other/failed status

    CrawlData() = new(Set{String}(), Set{String}(), Set{String}(),
                      Set{String}(), Set{String}(), Set{String}(), Set{String}())
end

Jan Klass's avatar
Jan Klass committed
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
"""
    getprotocol(url)

Return the protocol (scheme) of `url` as a string (e.g. `"https"`),
or `nothing` when `url` has no `protocol://` prefix (e.g. a relative
path). The original indexed the match unconditionally, which threw a
`MethodError` for relative hrefs instead of returning `nothing`.
"""
function getprotocol(url)
    m = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", url)
    return m === nothing ? nothing : m[:protocol]
end

"""
    handlehref_abs(data, base, url, href, href_protocol; forcehttps=false)

Record an absolute-URL `href`: in-scope links (those starting with
`base`) go to `data.urls`, other http(s) links to `data.urlsexternal`,
and non-http(s) protocols are ignored. With `forcehttps=true`, http
links are rewritten to https before classification.
"""
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
    if href_protocol == "http" && forcehttps
        # Chop the ORIGINAL protocol length ("http", 4 chars) before
        # re-prefixing. The previous code reassigned href_protocol to
        # "https" first and then chopped 5 chars, eating the colon and
        # producing malformed "https//…" URLs.
        href = "https" * chop(href; head=length(href_protocol), tail=0)
        href_protocol = "https"
    end

    if href_protocol == "http" || href_protocol == "https"
        # Index urls that start with the base URL
        if startswith(href, base)
            push!(data.urls, href)
            @debug "Match (absolute URL): $href"
        else
            @debug "External URL: $href"
            push!(data.urlsexternal, href)
        end
    else
        @debug "Ignoring URL $href with ignored protocol $href_protocol"
    end
end

38 39 40
"""
    handlehref(data, base, url, href; forcehttps=false)

Normalize one extracted `href` (found on page `url`) and record it in
`data`. Fragments are stripped; absolute URLs are delegated to
`handlehref_abs`, absolute paths are joined with `base`, and relative
paths are joined with `url`.
"""
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove URL fragment (#)
    href = match(r"^(?<path>[^\#]*)", href)[:path]

    href_protocol = getprotocol(href)

    if href_protocol !== nothing
        # Forward forcehttps — it was previously dropped here, so
        # absolute links were never upgraded to https.
        handlehref_abs(data, base, url, href, href_protocol; forcehttps=forcehttps)
    elseif startswith(href, "/")
        # Absolute path: base ends with '/', so drop href's leading '/'.
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    else
        # Relative path: resolve against the current page URL.
        # NOTE(review): assumes `url` ends with '/' — no directory
        # normalization is performed; verify against callers.
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end

"""
    visit(data, url, base; forcehttps=false)

Fetch `url` (no automatic redirect following, 2 s read timeout) and
classify the response: 404s, redirects (first `Location` header is fed
back through `handlehref`), 200s (body is scanned for new hrefs), and
everything else as failures. The URL is always marked visited.
"""
function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    response = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    status = response.status
    if status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(response)
        push!(data.urls3xx, url)
        # Only the first Location header is honored.
        for (header, target) in response.headers
            header == "Location" || continue
            @debug "Identified redirect $url to $target"
            handlehref(data, base, url, target; forcehttps=forcehttps)
            break
        end
    elseif status == 200
        push!(data.urlsvalid, url)
        # Scan the page body for new candidate links.
        page = String(response.body)
        for hit in eachmatch(r"href=\"(?<url>[^\"]+)\"", page)
            handlehref(data, base, url, hit[:url]; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $status"
        push!(data.urlsfail, url)
    end
    push!(data.visited, url)
end

"""
    write_urls(filename, urls)

Write each URL in `urls` to `filename`, one per line, replacing any
existing file contents.
"""
function write_urls(filename, urls)
    open(filename, "w") do io
        foreach(u -> println(io, u), urls)
    end
end

"""
    basechop(base)

Strip the `http://` or `https://` prefix from `base` and return the
remainder. Logs an error and returns `nothing` when `base` carries
neither prefix (same result as the original implicit `@error` return,
but now explicit, and the message names the offending value).
"""
function basechop(base)
    for prefix in ("http://", "https://")
        if startswith(base, prefix)
            return chop(base; head=length(prefix), tail=0)
        end
    end
    @error "Unexpected prefix in $base"
    return nothing
end

"""
    crawl_and_generate(base; forcehttps=false)

Crawl all pages reachable from `base` (which must include a protocol
and a trailing slash), then write the 404/3xx/external/fail URL lists
and the sitemap to the `out_*` file paths defined at module level.

Fixes over the previous version:
- named-group regex typo `(<?protocol>` → `(?<protocol>` (the old
  pattern never matched, so every run aborted with a parse error);
- `chop(base; head=length("http://", tail=0))` passed `tail` to
  `length` (MethodError) — parenthesis misplacement;
- `in(data.visited, url)` had its arguments swapped (item first,
  collection second);
- the visit loop iterated `data.urls` while `visit` pushes into it —
  mutating a `Set` during iteration is undefined; iterate a snapshot.
"""
function crawl_and_generate(base; forcehttps=false)
    m = match(r"(?<protocol>(http)|(https))\:\/\/[^\/]+(?<port>\:[0-9]+)?(?<trailslash>\/)?(?<path>)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end

    data = CrawlData()

    if forcehttps && startswith(base, "http://")
        base = "https://" * chop(base; head=length("http://"), tail=0)
    end

    push!(data.urls, base)
    while length(data.visited) != length(data.urls)
        @info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
        # Snapshot: visit() discovers new URLs and pushes into data.urls,
        # so we must not iterate the live set.
        for url in collect(data.urls)
            if !(url in data.visited)
                visit(data, url, base; forcehttps=forcehttps)
            end
        end
    end

    @info "Identified $(length(data.urls)) urls, $(length(data.urls404)) 404 urls, $(length(data.urlsexternal)) external urls, $(length(data.urls3xx)) 3xx urls, $(length(data.urlsfail)) fail."

    @info "Clearing files …"
    rm(out_404; force=true)
    rm(out_3xx; force=true)
    rm(out_external; force=true)
    rm(out_fail; force=true)
    rm(out_sitemap; force=true)

    @info "Writing url files …"
    write_urls(out_404, data.urls404)
    write_urls(out_3xx, data.urls3xx)
    write_urls(out_external, data.urlsexternal)
    write_urls(out_fail, data.urlsfail)

    @info "Writing sitemap $out_sitemap …"
    Sitemap.write_sitemap(out_sitemap, data.urlsvalid)
end