# crawl-sitemap.jl: crawl a site from a base URL, log broken/redirected/external links, and write a sitemap.
using LightXML
using HTTP

# Crawl root. Use a trailing slash for base domains: site-absolute links are
# resolved by appending the path (minus its leading "/") directly to this string.
const base = "https://kcode.de/"

# Report files; each one receives one URL per line.
const out_404 = "404.log"
const out_3xx = "3xx.log"
const out_external = "external.log"
const out_fail = "fail.log"

# Sitemap generated from the URLs that answered with HTTP 200.
const out_sitemap = "sitemap.xml"

"""
    handlehref(url, href)

Classify the link `href` found on the page `url` and record it.

- `http`/`https` URLs that start with the global `base` are added to the global
  `urls` set; other `http`/`https` URLs are added to `urlsexternal`.
- URLs with any other scheme (e.g. `mailto:`) are ignored.
- Site-absolute paths (`/foo`) are resolved against `base`.
- Everything else is treated as a relative reference and resolved against the
  directory of `url`.

The URL fragment (`#...`) is stripped first; a reference that is empty after
stripping points at the current document and is skipped. Returns `nothing`.
"""
function handlehref(url, href)
    # Remove URL fragment (#)
    href = String(first(split(href, '#'; limit=2)))
    isempty(href) && return nothing

    # Capture only the scheme name (without ":" or "//") so that it can be
    # compared against "http"/"https" below.
    scheme = match(r"^(?<protocol>[a-zA-Z][a-zA-Z0-9+.\-]*):", href)
    if scheme !== nothing
        href_protocol = lowercase(scheme[:protocol])
        if href_protocol == "http" || href_protocol == "https"
            # Index urls that start with the base URL
            # TODO: Allow both http and https interchangably
            if startswith(href, base)
                push!(urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored procotol $href_protocol"
        end
    elseif startswith(href, "/")
        # NOTE(review): scheme-relative references ("//host/path") also land
        # here and are treated as site-absolute paths.
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(urls, abshref)
    else
        # Relative reference: resolve against the current page without its
        # query string.
        page = String(first(split(url, '?'; limit=2)))
        if startswith(href, "?")
            # Query-only reference keeps the current path.
            abshref = page * href
        else
            # Replace the last path segment of the current page.
            lastslash = findlast(isequal('/'), page)
            directory = lastslash === nothing ? page * "/" : page[1:lastslash]
            abshref = directory * href
        end
        @debug "Match (relative path): $abshref"
        push!(urls, abshref)
    end
    return nothing
end

"""
    visit(url, base)

Fetch `url` once (redirects are not followed) and file it into one of the
global result sets according to the response:

- 404: `urls404`
- redirect: `urls3xx`, and the `Location` target is passed to `handlehref`
- 200: `urlsvalid`, and every `href="..."` in the body is passed to `handlehref`
- any other status, or a failed request: `urlsfail`

In every case `url` is added to `visited`. The `base` argument is accepted for
the caller's convenience but is not read here; `handlehref` uses the global
`base`.
"""
function visit(url, base)
    @info "Visiting $url …"
    res = try
        HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    catch err
        # Transport-level failures (DNS, connect, timeout) must not abort the
        # whole crawl; record the URL as failed and carry on.
        err isa InterruptException && rethrow()
        @warn "Request for $url failed" exception=err
        push!(urlsfail, url)
        return push!(visited, url)
    end

    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(urls3xx, url)
        # Header names are case-insensitive; HTTP.header handles that and
        # returns the default ("") when the header is absent.
        location = HTTP.header(res, "Location", "")
        if !isempty(location)
            @debug "Identified redirect $url to $location"
            handlehref(url, location)
        end
    elseif res.status == 200
        push!(urlsvalid, url)
        body = String(res.body)
        # Only double-quoted href attributes are recognised.
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            handlehref(url, m[:url])
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(urlsfail, url)
    end
    return push!(visited, url)
end

"""
    write_urls(filename, urls)

Write every element of `urls` to `filename`, one per line, replacing any
previous content of the file.
"""
function write_urls(filename, urls)
    open(filename, "w") do io
        foreach(entry -> print(io, entry, '\n'), urls)
    end
end

"""
    write_sitemap(filename, urls)

Write a sitemap XML document to `filename` containing one `<url><loc>` entry
per element of `urls`, under a `<urlset>` root in the sitemaps.org 0.9
namespace.
"""
function write_sitemap(filename, urls)
    xdoc = XMLDocument()
    try
        xurlset = create_root(xdoc, "urlset")
        set_attribute(xurlset, "xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
        for url in urls
            xurl = new_child(xurlset, "url")
            xloc = new_child(xurl, "loc")
            add_text(xloc, url)
        end
        save_file(xdoc, filename)
    finally
        # LightXML documents wrap libxml2 memory and must be released
        # explicitly, including when building or saving fails.
        free(xdoc)
    end
end

# Crawl state shared with `handlehref` and `visit`.
urls = Set{String}()          # every in-site URL discovered so far
visited = Set{String}()       # URLs that have been requested
urlsvalid = Set{String}()     # answered with HTTP 200
urls404 = Set{String}()       # answered with HTTP 404
urls3xx = Set{String}()       # answered with a redirect
urlsexternal = Set{String}()  # http(s) links outside of `base`
urlsfail = Set{String}()      # any other outcome

push!(urls, base)

@info "Checking url from base $base…"

# Crawl until no discovered URL is left unvisited. Each pass works on a
# snapshot of the pending URLs because `visit` adds to `urls` while running.
while true
    pending = setdiff(urls, visited)
    isempty(pending) && break
    @info "Intermediate count: #urls: $(length(urls)), #visited: $(length(visited)), #valid $(length(urlsvalid)), #3xx $(length(urls3xx))…"
    for url in pending
        visit(url, base)
    end
end

@info "Identified $(length(urls)) urls, $(length(urls404)) 404 urls, $(length(urlsexternal)) external urls, $(length(urls3xx)) 3xx urls, $(length(urlsfail)) fail."

@info "Clearing files …"
rm(out_404; force=true)
rm(out_3xx; force=true)
rm(out_external; force=true)
rm(out_fail; force=true)
rm(out_sitemap; force=true)

# Sets have no defined iteration order; sort so that repeated runs produce
# identical files.
@info "Writing url files …"
write_urls(out_404, sort!(collect(urls404)))
write_urls(out_3xx, sort!(collect(urls3xx)))
write_urls(out_external, sort!(collect(urlsexternal)))
write_urls(out_fail, sort!(collect(urlsfail)))

@info "Writing sitemap $out_sitemap …"
write_sitemap(out_sitemap, sort!(collect(urlsvalid)))