Commit 8cc4722c authored by Jan Klass

Handle failures, handle redirects, improve logging

parent ef5ee13e
...@@ -7,6 +7,7 @@ base = "https://kcode.de/" ...@@ -7,6 +7,7 @@ base = "https://kcode.de/"
out_404 = "404.log" out_404 = "404.log"
out_3xx = "3xx.log" out_3xx = "3xx.log"
out_external = "external.log" out_external = "external.log"
out_fail = "fail.log"
out_sitemap = "sitemap.xml" out_sitemap = "sitemap.xml"
function handlehref(url, href) function handlehref(url, href)
...@@ -43,20 +44,26 @@ function visit(url, base) ...@@ -43,20 +44,26 @@ function visit(url, base)
if res.status == 404 if res.status == 404
@info "Ignoring HTTP 404 status code url $url" @info "Ignoring HTTP 404 status code url $url"
push!(urls404, url) push!(urls404, url)
elseif res.status >= 300 && res.status < 400 elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
# 300 redirect
@warn "Ignoring HTTP 3xx status code url $url"
push!(urls3xx, url) push!(urls3xx, url)
for (name, value) in res.headers
if name == "Location"
@debug "Identified redirect $url to $value"
handlehref(url, value)
break
end
end
elseif res.status == 200 elseif res.status == 200
push!(urlsvalid, url) push!(urlsvalid, url)
push!(urls, url)
body = String(res.body) body = String(res.body)
for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body) for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
href = m[:url] href = m[:url]
handlehref(url, href) handlehref(url, href)
end end
else else
@warn "For $url response status is $(res.status)" @debug "For $url response status is $(res.status)"
push!(urlsfail, url)
end end
push!(visited, url) push!(visited, url)
end end
...@@ -87,6 +94,7 @@ urlsvalid = Set() ...@@ -87,6 +94,7 @@ urlsvalid = Set()
urls404 = Set() urls404 = Set()
urls3xx = Set() urls3xx = Set()
urlsexternal = Set() urlsexternal = Set()
urlsfail = Set()
push!(urls, base) push!(urls, base)
...@@ -102,18 +110,20 @@ end ...@@ -102,18 +110,20 @@ end
@info "Checking url from base $base…" @info "Checking url from base $base…"
url = base url = base
@info "Identified $(length(urls)) urls and $(length(urls404)) 404 urls." @info "Identified $(length(urls)) urls, $(length(urls404)) 404 urls, $(length(urlsexternal)) external urls, $(length(urls3xx)) 3xx urls, $(length(urlsfail)) fail."
@info "Clearing files …" @info "Clearing files …"
rm(out_404; force=true) rm(out_404; force=true)
rm(out_3xx; force=true) rm(out_3xx; force=true)
rm(out_external; force=true) rm(out_external; force=true)
rm(out_fail; force=true)
rm(out_sitemap; force=true) rm(out_sitemap; force=true)
@info "Writing url files …" @info "Writing url files …"
write_urls(out_404, urls404) write_urls(out_404, urls404)
write_urls(out_3xx, urls3xx) write_urls(out_3xx, urls3xx)
write_urls(out_external, urlsexternal) write_urls(out_external, urlsexternal)
write_urls(out_fail, urlsfail)
@info "Writing sitemap $out_sitemap …" @info "Writing sitemap $out_sitemap …"
write_sitemap(out_sitemap, urlsvalid) write_sitemap(out_sitemap, urlsvalid)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment