Commit abfafa05 authored by Jan Klass's avatar Jan Klass

Implement arguments, implement checking sitemap URLs for validity

parent 8cc4722c
using LightXML
using HTTP
# Use a trailing slash for base domains
base = "https://kcode.de/"
# File names of the URL lists that are cleared and rewritten after a crawl:
# one file each for the collected 404 URLs, 3xx URLs and external URLs.
out_404 = "404.log"
out_3xx = "3xx.log"
out_external = "external.log"
......@@ -88,42 +85,127 @@ function write_sitemap(filename, urls)
save_file(xdoc, filename)
end
urls = Set()
visited = Set()
urlsvalid = Set()
urls404 = Set()
urls3xx = Set()
urlsexternal = Set()
urlsfail = Set()
push!(urls, base)
while length(visited) != length(urls)
@info "Intermediate count: #urls: $(length(urls)), #visited: $(length(visited)), #valid $(length(urlsvalid)), #3xx $(length(urls3xx))…"
for url in urls
if !in(visited, url)
visit(url, base)
"""
    crawl_and_generate(base)

Crawl starting at `base` by calling `visit(url, base)` for every collected URL that
has not been visited yet, then clear and rewrite the URL list files (`out_404`,
`out_3xx`, `out_external`, `out_fail`) and the sitemap file `out_sitemap`.

Returns whatever `write_sitemap` returns.
"""
function crawl_and_generate(base)
    urls = Set()
    visited = Set()
    urlsvalid = Set()
    urls404 = Set()
    urls3xx = Set()
    urlsexternal = Set()
    urlsfail = Set()
    push!(urls, base)

    # NOTE(review): `visit` is defined outside this block and only receives `url` and
    # `base`. Confirm that it can actually reach the sets created above; otherwise this
    # loop cannot make progress.
    while length(visited) != length(urls)
        @info "Intermediate count: #urls: $(length(urls)), #visited: $(length(visited)), #valid $(length(urlsvalid)), #3xx $(length(urls3xx))…"
        # Iterate over a snapshot so the set may grow during the crawl without
        # invalidating the iteration; newly found URLs are handled in the next pass.
        for url in collect(urls)
            # `in` takes the item first and the collection second.
            url in visited && continue
            visit(url, base)
        end
    end

    @info "Checking url from base $base…"
    @info "Identified $(length(urls)) urls, $(length(urls404)) 404 urls, $(length(urlsexternal)) external urls, $(length(urls3xx)) 3xx urls, $(length(urlsfail)) fail."

    @info "Clearing files …"
    # force=true: a missing file is not an error.
    for file in (out_404, out_3xx, out_external, out_fail, out_sitemap)
        rm(file; force=true)
    end

    @info "Writing url files …"
    write_urls(out_404, urls404)
    write_urls(out_3xx, urls3xx)
    write_urls(out_external, urlsexternal)
    write_urls(out_fail, urlsfail)

    @info "Writing sitemap $out_sitemap …"
    return write_sitemap(out_sitemap, urlsvalid)
end
@info "Checking url from base $base…"
url = base
"""
    checkurl(url)

Request `url` once with `HTTP.get` and log the outcome.

The request is made with `redirect=false` and `status_exception=false`, so the
status code of the first response is inspected directly:

- status 200 is logged as valid,
- a redirect response is logged as invalid,
- any other status is logged as invalid together with its code.

If the request itself fails with an exception (for example the connection cannot be
established or the read times out), the failure is logged as a warning instead of
aborting the caller. Returns `nothing`.
"""
function checkurl(url)
    res = try
        HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    catch err
        # Never swallow a user interrupt (Ctrl-C).
        err isa InterruptException && rethrow()
        # One unreachable URL must not stop the remaining URLs from being checked.
        @warn "Invalid: Request failed at $url" exception = err
        return nothing
    end

    if res.status == 200
        @info "Valid URL $url"
    elseif HTTP.isredirect(res)
        @info "Invalid: Redirect at $url"
    else
        @info "Invalid: HTTP status code $(res.status) at $url"
    end
    return nothing
end
@info "Identified $(length(urls)) urls, $(length(urls404)) 404 urls, $(length(urlsexternal)) external urls, $(length(urls3xx)) 3xx urls, $(length(urlsfail)) fail."
"""
    check(base)

Read the sitemap file `out_sitemap`, collect the content of every `<loc>` element
found inside a `<url>` element of the `<urlset>` root, and pass each collected URL to
`checkurl`.

The process exits with status 1 if the root element is not `<urlset>`. Child elements
of the root that are not `<url>` are skipped. A `<url>` element without a `<loc>`
child is reported with a warning and skipped; the remaining entries are still
processed. The sitemap file itself is only read, never modified.

`base` currently only selects which debug message is logged per URL.
"""
function check(base)
    @info "Reading sitemap $out_sitemap"
    urls = String[]
    xdoc = LightXML.parse_file(out_sitemap)
    try
        xurlset = LightXML.root(xdoc)
        if LightXML.name(xurlset) != "urlset"
            @error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
            exit(1)
        end

        for xurl in LightXML.child_nodes(xurlset)
            # The child list also contains non-element nodes (e.g. whitespace text);
            # only element nodes are of interest.
            LightXML.is_elementnode(xurl) || continue
            # Elements other than <url> are ignored.
            LightXML.name(xurl) == "url" || continue
            if !LightXML.has_children(xurl)
                @warn "Invalid sitemap. <url> must have a <loc> child."
                # Skip only this entry; the rest of the sitemap is still valid input.
                continue
            end

            foundloc = false
            for xloc in LightXML.child_nodes(xurl)
                LightXML.is_elementnode(xloc) || continue
                # A <url> may carry further elements such as <lastmod> or <priority>;
                # only <loc> holds the URL.
                LightXML.name(xloc) == "loc" || continue
                # Pretty-printed sitemaps may wrap the URL in whitespace.
                push!(urls, String(strip(LightXML.content(xloc))))
                foundloc = true
            end
            if !foundloc
                @warn "Sitemap contains invalid url element with missing required loc element."
            end
        end
    finally
        # Release the parsed document even if reading it failed part-way.
        LightXML.free(xdoc)
    end

    @info "Found $(length(urls)) URLs"
    @info "Checking URLs …"
    for url in urls
        if length(base) > 0
            # TODO: make use of `base` when checking.
            @debug "Checking URL $url (TODO) …"
        else
            @debug "Checking URL $url …"
        end
        checkurl(url)
    end
    return nothing
end
@info "Writing url files …"
write_urls(out_404, urls404)
write_urls(out_3xx, urls3xx)
write_urls(out_external, urlsexternal)
write_urls(out_fail, urlsfail)
# The script expects exactly two command line arguments: the action to run and the
# parameter handed to that action.
if length(ARGS) != 2
    @error "Parameters <action> and <param> are required"
    exit(1)
end
action, param = ARGS
@info "Writing sitemap $out_sitemap …"
write_sitemap(out_sitemap, urlsvalid)
# Run the requested action with the given parameter.
if action == "crawl"
    # The crawl base is expected to end in a slash; append one if it was left off.
    crawl_and_generate(endswith(param, "/") ? param : param * "/")
elseif action == "check"
    check(param)
else
    @error "Unknown action $action"
    exit(1)
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment