Commit ef5ee13e authored by Jan Klass

Initial crawler and sitemap generator

[deps]
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
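The two [deps] entries above declare HTTP.jl and LightXML.jl for the project environment. A minimal setup sketch, assuming it is run from the repository root containing this Project.toml:

using Pkg
Pkg.activate(".")     # activate the project environment defined by Project.toml
Pkg.instantiate()     # install HTTP and LightXML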
using LightXML
using HTTP
# Use a trailing slash for base domains
base = "https://kcode.de/"
out_404 = "404.log"
out_3xx = "3xx.log"
out_external = "external.log"
out_sitemap = "sitemap.xml"
function handlehref(url, href)
    # Remove the URL fragment (everything from '#' on)
    href = match(r"^(?<path>[^#]*)", href)[:path]
    # Extract the scheme for absolute URLs (e.g. "http" from "http://…")
    protocol_match = match(r"^(?<protocol>[a-zA-Z0-9]+)://", href)
    if protocol_match !== nothing
        href_protocol = protocol_match[:protocol]
        if href_protocol == "http" || href_protocol == "https"
            # Index URLs that start with the base URL
            # TODO: Allow both http and https interchangeably
            if startswith(href, base)
                push!(urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored protocol $href_protocol"
        end
    elseif startswith(href, "/")
        # Absolute path: join with the base URL (which already ends in a slash)
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(urls, abshref)
    else
        # Relative path: append to the current URL
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(urls, abshref)
    end
end
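# A few example classifications (a sketch; the paths are hypothetical, base is as above):
#   handlehref("https://kcode.de/blog/", "https://kcode.de/about")  # internal -> urls
#   handlehref("https://kcode.de/blog/", "https://example.com/")    # external -> urlsexternal
#   handlehref("https://kcode.de/blog/", "ftp://example.com/file")  # ignored protocol
#   handlehref("https://kcode.de/blog/", "/imprint")                # -> "https://kcode.de/imprint"
#   handlehref("https://kcode.de/blog/", "page2#top")               # fragment stripped -> "https://kcode.de/blog/page2"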
function visit(url, base)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(urls404, url)
    elseif res.status >= 300 && res.status < 400
        # 3xx redirect (not followed, since redirect=false)
        @warn "Ignoring HTTP 3xx status code url $url"
        push!(urls3xx, url)
    elseif res.status == 200
        push!(urlsvalid, url)
        push!(urls, url)
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(url, href)
        end
    else
        @warn "For $url response status is $(res.status)"
    end
    push!(visited, url)
end
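# Note: the href regex above only picks up double-quoted href attributes. For a body like
#   '<a href="/blog/">Blog</a> <a href="https://example.com/">Ext</a>'
# it yields "/blog/" and "https://example.com/", which handlehref then classifies as
# internal and external respectively.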
function write_urls(filename, urls)
    open(filename, "w") do f
        for url in urls
            println(f, url)
        end
    end
end
function write_sitemap(filename, urls)
    xdoc = XMLDocument()
    xurlset = create_root(xdoc, "urlset")
    set_attribute(xurlset, "xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
    for url in urls
        xurl = new_child(xurlset, "url")
        xloc = new_child(xurl, "loc")
        add_text(xloc, url)
    end
    save_file(xdoc, filename)
end
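# The generated file roughly looks like this (a sketch with two example URLs):
#   <?xml version="1.0" encoding="utf-8"?>
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>https://kcode.de/</loc></url>
#     <url><loc>https://kcode.de/blog/</loc></url>
#   </urlset>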
urls = Set()
visited = Set()
urlsvalid = Set()
urls404 = Set()
urls3xx = Set()
urlsexternal = Set()
push!(urls, base)
# Crawl until every discovered URL has been visited
while length(visited) != length(urls)
    @info "Intermediate count: #urls: $(length(urls)), #visited: $(length(visited)), #valid $(length(urlsvalid)), #3xx $(length(urls3xx))…"
    # Iterate over a snapshot, since visit() may add new URLs to the set
    for url in copy(urls)
        if !(url in visited)
            visit(url, base)
        end
    end
end
@info "Checking url from base $base…"
url = base
@info "Identified $(length(urls)) urls and $(length(urls404)) 404 urls."
@info "Clearing files …"
rm(out_404; force=true)
rm(out_3xx; force=true)
rm(out_external; force=true)
rm(out_sitemap; force=true)
@info "Writing url files …"
write_urls(out_404, urls404)
write_urls(out_3xx, urls3xx)
write_urls(out_external, urlsexternal)
@info "Writing sitemap $out_sitemap …"
write_sitemap(out_sitemap, urlsvalid)