# crawl-sitemap.jl
#
# Crawl a site starting from `base`, logging 404 / 3xx / external links and
# generating sitemap.xml from the pages that return HTTP 200.
using LightXML
using HTTP

# Use a trailing slash for base domains
# (`handlehref` joins root-relative paths as `base * path-without-leading-slash`).
const base = "https://kcode.de/"

# Output file names, overwritten on every run.
const out_404 = "404.log"            # URLs that answered HTTP 404
const out_3xx = "3xx.log"            # URLs that answered a 3xx redirect
const out_external = "external.log"  # links pointing to other hosts
const out_sitemap = "sitemap.xml"    # generated sitemaps.org XML file

# Classify one href found on page `url` and record it in the crawl sets.
#
# - The URL fragment (`#…`) is stripped first.
# - Absolute http/https links under `base` are queued in `urls`; absolute
#   links to other hosts go to `urlsexternal`; other protocols are ignored.
# - Root-relative hrefs (`/…`) are resolved against `base`.
# - Anything else (including schemes without `://`, e.g. `mailto:` — same as
#   the original behaviour) is resolved relative to the directory of `url`.
#
# The keyword arguments default to the script-level globals of the same name,
# so the existing call `handlehref(url, href)` is unchanged; tests can pass
# their own sets.
function handlehref(url, href; base=base, urls=urls, urlsexternal=urlsexternal)
    # Remove URL fragment (#)
    href = match(r"^(?<path>[^\#]*)", href)[:path]
    # Capture the protocol *without* the trailing "://" so the http/https
    # comparison below can succeed. (The previous pattern captured
    # "https://", which never equalled "http"/"https", so every absolute
    # link was dropped as an "ignored protocol".)
    protomatch = match(r"^(?<protocol>[a-zA-Z0-9]+)://", href)
    if protomatch !== nothing
        href_protocol = protomatch[:protocol]
        if href_protocol == "http" || href_protocol == "https"
            # Index urls that start with the base URL
            # TODO: Allow both http and https interchangably
            if startswith(href, base)
                push!(urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored protocol $href_protocol"
        end
    elseif startswith(href, "/")
        # Root-relative path: `base` carries the trailing slash, so drop the
        # leading one from `href` before joining.
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(urls, abshref)
    else
        # Relative path: resolve against the *directory* of the current page,
        # not the full page URL (".../page" + "a.html" must not become
        # ".../pagea.html"). The "://" in `url` guarantees a '/' exists.
        abshref = url[1:findlast('/', url)] * href
        @debug "Match (relative path): $abshref"
        push!(urls, abshref)
    end
end

# Fetch `url` once (redirects are NOT followed; 2 s read timeout) and sort it
# into the global result sets by HTTP status:
#   404        -> urls404
#   300..399   -> urls3xx
#   200        -> urlsvalid (and urls); the body is scanned for href="…"
#                 attributes and each one is handed to `handlehref`, which
#                 may grow `urls` / `urlsexternal`
#   other      -> only logged
# The URL is always added to `visited` so the crawl loop will not fetch it
# again. NOTE(review): the `base` parameter is accepted but unused here —
# `handlehref` reads the global `base` instead; confirm before removing.
function visit(url, base)
    @info "Visiting $url …"
    # status_exception=false: non-2xx statuses are returned, not thrown.
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(urls404, url)
    elseif res.status >= 300 && res.status < 400
        # 300 redirect
        @warn "Ignoring HTTP 3xx status code url $url"
        push!(urls3xx, url)
    elseif res.status == 200
        push!(urlsvalid, url)
        push!(urls, url)
        body = String(res.body)
        # Naive href extraction: any double-quoted href attribute in the body.
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(url, href)
        end
    else
        @warn "For $url response status is $(res.status)"
    end
    push!(visited, url)
end

# Write one URL per line to `filename`, truncating any existing file.
function write_urls(filename, urls)
    open(filename, "w") do io
        foreach(u -> println(io, u), urls)
    end
end

# Serialise `urls` as a sitemaps.org <urlset> document and save it to
# `filename` (one <url><loc>…</loc></url> entry per URL).
function write_sitemap(filename, urls)
    doc = XMLDocument()
    urlset = create_root(doc, "urlset")
    set_attribute(urlset, "xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
    for u in urls
        entry = new_child(urlset, "url")
        add_text(new_child(entry, "loc"), u)
    end
    save_file(doc, filename)
end

# Crawl state — all sets are mutated by `visit` / `handlehref`.
# Typed as Set{String} (instead of Set{Any}) since everything pushed into
# them is a URL string.
urls = Set{String}()          # every URL discovered so far (frontier + done)
visited = Set{String}()       # URLs already fetched
urlsvalid = Set{String}()     # URLs that answered HTTP 200
urls404 = Set{String}()       # URLs that answered HTTP 404
urls3xx = Set{String}()       # URLs that answered a 3xx redirect
urlsexternal = Set{String}()  # off-site links (recorded, never crawled)

# Seed the crawl with the base URL.
push!(urls, base)

# Crawl until every discovered URL has been visited. `visit` appends newly
# found links to `urls`, so each pass iterates over a snapshot (`setdiff`)
# rather than the live set: mutating a Set while iterating it is undefined
# behaviour in Julia. (The original `in(visited, url)` also had its
# arguments reversed — it was always false, so every URL was re-fetched on
# every pass.)
while length(visited) != length(urls)
    @info "Intermediate count: #urls: $(length(urls)), #visited: $(length(visited)), #valid $(length(urlsvalid)), #3xx $(length(urls3xx))…"
    for url in setdiff(urls, visited)
        visit(url, base)
    end
end

@info "Identified $(length(urls)) urls and $(length(urls404)) 404 urls."

# Remove stale output first; `write_urls`/`write_sitemap` truncate anyway,
# but this also clears leftovers if a later write step fails.
@info "Clearing files …"
rm(out_404; force=true)
rm(out_3xx; force=true)
rm(out_external; force=true)
rm(out_sitemap; force=true)

@info "Writing url files …"
write_urls(out_404, urls404)
write_urls(out_3xx, urls3xx)
write_urls(out_external, urlsexternal)

# Only HTTP-200 pages go into the sitemap.
@info "Writing sitemap $out_sitemap …"
write_sitemap(out_sitemap, urlsvalid)