Commit 8cc4722c authored by Jan Klass

Handle failures, handle redirects, improve logging

parent ef5ee13e
......@@ -7,6 +7,7 @@ base = "https://kcode.de/"
out_404 = "404.log"
out_3xx = "3xx.log"
out_external = "external.log"
out_fail = "fail.log"
out_sitemap = "sitemap.xml"
function handlehref(url, href)
......@@ -43,20 +44,26 @@ function visit(url, base)
if res.status == 404
@info "Ignoring HTTP 404 status code url $url"
push!(urls404, url)
elseif res.status >= 300 && res.status < 400
# 300 redirect
@warn "Ignoring HTTP 3xx status code url $url"
elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
push!(urls3xx, url)
for (name, value) in res.headers
if name == "Location"
@debug "Identified redirect $url to $value"
handlehref(url, value)
break
end
end
elseif res.status == 200
push!(urlsvalid, url)
push!(urls, url)
body = String(res.body)
for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
href = m[:url]
handlehref(url, href)
end
else
@warn "For $url response status is $(res.status)"
@debug "For $url response status is $(res.status)"
push!(urlsfail, url)
end
push!(visited, url)
end
......@@ -87,6 +94,7 @@ urlsvalid = Set()
urls404 = Set()
urls3xx = Set()
urlsexternal = Set()
urlsfail = Set()
push!(urls, base)
......@@ -102,18 +110,20 @@ end
@info "Checking url from base $base…"
url = base
@info "Identified $(length(urls)) urls and $(length(urls404)) 404 urls."
@info "Identified $(length(urls)) urls, $(length(urls404)) 404 urls, $(length(urlsexternal)) external urls, $(length(urls3xx)) 3xx urls, $(length(urlsfail)) fail."
@info "Clearing files …"
rm(out_404; force=true)
rm(out_3xx; force=true)
rm(out_external; force=true)
rm(out_fail; force=true)
rm(out_sitemap; force=true)
@info "Writing url files …"
write_urls(out_404, urls404)
write_urls(out_3xx, urls3xx)
write_urls(out_external, urlsexternal)
write_urls(out_fail, urlsfail)
@info "Writing sitemap $out_sitemap …"
write_sitemap(out_sitemap, urlsvalid)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment