# check.jl
"""
    checkurl(url)

Issue a single HTTP GET for `url` and log whether the URL is valid.

Only a direct `200` response counts as valid. Redirects are not followed and are
reported as invalid, as is every other status code. A request that fails before any
response is received (connection refused, DNS failure, read timeout, …) is also
reported as invalid instead of propagating and aborting the caller.

The request uses a read timeout of 2 seconds and does not verify TLS certificates.

Returns `nothing`.
"""
function checkurl(url)
    res = try
        HTTP.get(
            url;
            readtimeout=2,
            redirect=false,
            status_exception=false,
            require_ssl_verification=false,
        )
    catch err
        # `status_exception=false` only suppresses errors for HTTP status codes;
        # transport-level failures still throw. Keep Ctrl-C working, though.
        err isa InterruptException && rethrow()
        @info "Invalid: Request failed ($(sprint(showerror, err))) at $url"
        return nothing
    end

    if res.status == 200
        @info "Valid URL $url"
    elseif HTTP.isredirect(res)
        @info "Invalid: Redirect at $url"
    else
        @info "Invalid: HTTP status code $(res.status) at $url"
    end
    return nothing
end

"""
    rebase(url, base)

Replace the `protocol://host/` prefix of the absolute URL `url` with `base`.

The path of `url` (everything after the first `/` following the host) is appended
to `base` verbatim, so `base` should end with `/`. A URL without any path, such as
`https://example.com`, is rebased to `base` itself.

# Throws
- `ArgumentError` if `url` is not of the form `protocol://host[/path]`.

# Examples
```julia
rebase("https://example.com/a/b.html", "http://localhost:8000/")
# "http://localhost:8000/a/b.html"
```
"""
function rebase(url, base)
    # The path group is optional so that host-only URLs are accepted as well.
    m = match(r"^(?<protocol>[a-zA-Z0-9]+)://(?<host>[^/]+)(?:/(?<path>.*))?$", url)
    if m === nothing
        throw(ArgumentError("cannot rebase \"$url\": expected an absolute URL of the form protocol://host/path"))
    end
    # An unmatched optional group is reported as `nothing`; treat it as an empty path.
    return base * something(m[:path], "")
end

"""
    check(base)

Read the sitemap at `out_sitemap`, collect the content of every `<loc>` element found
inside a `<url>` element, and run `checkurl` on each collected URL.

If `base` is non-empty, every URL is first passed through `rebase(url, base)` so that
the check runs against `base` instead of the host named in the sitemap. If `base` is
empty, the URLs are checked exactly as they appear in the sitemap.

The process exits with status 1 when the root element of the sitemap is not
`<urlset>`. Children of `<urlset>` that are not `<url>` elements are skipped. A
`<url>` element without a `<loc>` child is reported with a warning and skipped; it
does not stop the remaining entries from being processed.

Returns `nothing`.
"""
function check(base)
    @info "Reading sitemap $out_sitemap"
    urls = String[]

    xdoc = LightXML.parse_file(out_sitemap)
    try
        xurlset = LightXML.root(xdoc)
        if LightXML.name(xurlset) != "urlset"
            @error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
            exit(1)
        end
        for xurl in LightXML.child_nodes(xurlset)
            # The child list also contains text/whitespace nodes that are not
            # visible as elements in the XML; only element nodes are of interest.
            LightXML.is_elementnode(xurl) || continue
            # Ignore any element other than <url>.
            LightXML.name(xurl) == "url" || continue
            if !LightXML.has_children(xurl)
                @warn "Invalid sitemap. <url> must have a <loc> child."
                # Skip only this entry; later entries are still worth checking.
                continue
            end
            found = false
            for xloc in LightXML.child_nodes(xurl)
                LightXML.is_elementnode(xloc) || continue
                # <url> may also carry <lastmod>, <changefreq>, <priority>, …;
                # only <loc> holds the address to check.
                LightXML.name(xloc) == "loc" || continue
                push!(urls, LightXML.content(xloc))
                found = true
            end
            if !found
                @warn "Sitemap contains invalid url element with missing required loc element."
            end
        end
    finally
        # Release the libxml2 document even if traversal throws.
        LightXML.free(xdoc)
    end

    @info "Found $(length(urls)) URLs"
    @info "Checking URLs …"
    for url in urls
        @debug "Checking URL $url …"
        target = isempty(base) ? url : rebase(url, base)
        checkurl(target)
    end
    return nothing
end