# crawl.jl
#
# `HTTP` is required for fetching. The local `Sitemap` module and the `out_*`
# output paths are expected to be provided by the including script.

using HTTP
# Accumulates crawl state: discovered URLs and their classification.
struct CrawlData
    urls::Set{String}
    visited::Set{String}
    urlsvalid::Set{String}
    urls404::Set{String}
    urls3xx::Set{String}
    urlsexternal::Set{String}
    urlsfail::Set{String}

    CrawlData() = new(Set{String}(), Set{String}(), Set{String}(), Set{String}(),
                      Set{String}(), Set{String}(), Set{String}())
end

function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove URL fragment (#…)
    href = match(r"^(?<path>[^#]*)", href)[:path]
    href_protocol = match(r"^(?:(?<protocol>[a-zA-Z0-9]+)://)?", href)[:protocol]
    if href_protocol !== nothing
        if href_protocol == "http" || href_protocol == "https"
            if forcehttps
                # Compare without the protocol so http:// and https:// match alike
                href_noprot = chop(href; head=length(href_protocol) + 3, tail=0)
                base_noprot = basechop(base)
                ismatch = startswith(href_noprot, base_noprot)
            else
                ismatch = startswith(href, base)
            end
            # Index URLs that start with the base URL
            if ismatch
                push!(data.urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(data.urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored protocol $href_protocol"
        end
    elseif startswith(href, "/")
        # Absolute path: resolve against the base URL
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    else
        # Relative path: resolve against the current page URL
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end
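
# A minimal sketch of how `handlehref` classifies links; the URLs are
# hypothetical, not taken from any crawl target:
#
#   data = CrawlData()
#   # Relative path: resolved against the current page URL
#   handlehref(data, "https://example.com/", "https://example.com/docs/", "intro.html")
#   # -> "https://example.com/docs/intro.html" is pushed onto data.urls
#   # Absolute URL on another host: classified as external
#   handlehref(data, "https://example.com/", "https://example.com/docs/", "https://other.org/")
#   # -> "https://other.org/" is pushed onto data.urlsexternal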

function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(data.urls3xx, url)

        for (name, value) in res.headers
            if name == "Location"
                @debug "Identified redirect $url to $value"
                handlehref(data, base, url, value; forcehttps=forcehttps)
                break
            end
        end
    elseif res.status == 200
        push!(data.urlsvalid, url)

        # Scan for new URLs on this page
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(data, base, url, href; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(data.urlsfail, url)
    end
    push!(data.visited, url)
end
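
# `visit` calls HTTP.get with redirect=false so 3xx responses surface here and
# their Location target can be fed back through `handlehref`, while
# status_exception=false keeps 4xx/5xx responses from throwing. A sketch of
# inspecting one response by hand (hypothetical URL):
#
#   res = HTTP.get("https://example.com/old"; redirect=false, status_exception=false)
#   HTTP.isredirect(res)          # true for 3xx responses
#   HTTP.header(res, "Location")  # redirect target, "" if the header is absent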

function write_urls(filename, urls)
    open(filename, "w") do f
        for url in urls
            println(f, url)
        end
    end
end

function basechop(base)
    if startswith(base, "http://")
        return chop(base; head=length("http://"), tail=0)
    elseif startswith(base, "https://")
        return chop(base; head=length("https://"), tail=0)
    else
        @error "Unexpected prefix "
    end
end
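
# For example (hypothetical input):
#
#   basechop("https://example.com/")  # -> "example.com/"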

function crawl_and_generate(base; forcehttps=false)
    m = match(r"^(?<protocol>https?)://[^/:]+(?<port>:[0-9]+)?(?<trailslash>/)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end

    data = CrawlData()

    if forcehttps && startswith(base, "http://")
        base = "https://" * chop(base; head=length("http://", tail=0))
    end

    push!(data.urls, base)
    while length(data.visited) != length(data.urls)
        @info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
        # Iterate over a snapshot: `visit` adds new URLs to `data.urls`,
        # and a Set must not be mutated while it is being iterated.
        for url in setdiff(data.urls, data.visited)
            visit(data, url, base; forcehttps=forcehttps)
        end
    end

    @info "Identified $(length(data.urls)) urls, $(length(data.urls404)) 404 urls, $(length(data.urlsexternal)) external urls, $(length(data.urls3xx)) 3xx urls, $(length(data.urlsfail)) fail."

    @info "Clearing files …"
    rm(out_404; force=true)
    rm(out_3xx; force=true)
    rm(out_external; force=true)
    rm(out_fail; force=true)
    rm(out_sitemap; force=true)

    @info "Writing url files …"
    write_urls(out_404, data.urls404)
    write_urls(out_3xx, data.urls3xx)
    write_urls(out_external, data.urlsexternal)
    write_urls(out_fail, data.urlsfail)

    @info "Writing sitemap $out_sitemap …"
    Sitemap.write_sitemap(out_sitemap, data.urlsvalid)
end
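
# A minimal usage sketch. The `out_*` paths below and the `Sitemap` module are
# assumptions about the surrounding project, not definitions from this file:
#
#   const out_404 = "urls_404.txt"
#   const out_3xx = "urls_3xx.txt"
#   const out_external = "urls_external.txt"
#   const out_fail = "urls_fail.txt"
#   const out_sitemap = "sitemap.xml"
#
#   crawl_and_generate("https://example.com/"; forcehttps=true)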