Commit 94408e42 authored by Jan Klass's avatar Jan Klass

Split code into files. Implement crawl forcehttps parameter.

Argument help
Argument checks
CrawlData struct type
Pass data as parameters to crawl functions
parent abfafa05
# Probe `url` with a single GET (redirects disabled, SSL verification off)
# and log whether it is valid (HTTP 200), a redirect, or another status.
function checkurl(url)
    response = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false, require_ssl_verification=false)
    if response.status == 200
        @info "Valid URL $url"
    elseif HTTP.isredirect(response)
        @info "Invalid: Redirect at $url"
    else
        @info "Invalid: HTTP status code $(response.status) at $url"
    end
end
# Replace the protocol-and-host prefix of `url` with `base`.
#
# `base` is expected to end with a trailing slash; the path portion of `url`
# (everything after "protocol://host/") is appended to it.
# Throws an `ArgumentError` when `url` is not an absolute URL with a path
# (previously this failed with an opaque `MethodError` on `nothing`).
function rebase(url, base)
    m = match(r"^(?<protocol>[a-zA-Z0-9]+)\:\/\/(?<host>[^\/]+)\/(?<path>.*)$", url)
    m === nothing && throw(ArgumentError("Cannot rebase non-absolute URL $url"))
    return base * m[:path]
end
# Validate every URL recorded in the sitemap file `out_sitemap`.
#
# When `base` is non-empty, each URL is first rebased onto `base`
# (useful for checking a staging host against a production sitemap).
function check(base)
    @info "Reading sitemap $out_sitemap"
    urls = String[]
    xdoc = LightXML.parse_file(out_sitemap)
    xurlset = LightXML.root(xdoc)
    if LightXML.name(xurlset) != "urlset"
        @error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
        exit(1)
    end
    for xurl in LightXML.child_nodes(xurlset)
        # The iterator also yields text/whitespace nodes; only elements matter.
        if !is_elementnode(xurl)
            continue
        end
        if LightXML.name(xurl) != "url"
            # BUGFIX: this message was dead code behind an unconditional
            # `continue`; log the unexpected element, then keep scanning.
            @error "Invalid sitemap. Expected <url>, found <$(LightXML.name(xurl))>."
            continue
        end
        if !has_children(xurl)
            @warn "Invalid sitemap. <url> must have a <loc> child."
            break
        end
        url = nothing
        for xloc in LightXML.child_nodes(xurl)
            # Skip non-element children here as well.
            if !is_elementnode(xloc)
                continue
            end
            url = LightXML.content(xloc)
            push!(urls, url)
        end
        if url === nothing
            @warn "Sitemap contains invalid url element with missing required loc element."
        end
    end
    LightXML.free(xdoc)
    @info "Found $(length(urls)) URLs"
    @info "Checking URLs …"
    for url in urls
        if !isempty(base)
            @debug "Checking URL $url …"
            checkurl(rebase(url, base))
        else
            @debug "Checking URL $url …"
            checkurl(url)
        end
    end
end
using LightXML
using HTTP
include("Sitemap.jl")
include("crawl.jl")
include("check.jl")
# Output file names (created in the current working directory).
# `const` makes these globals type-stable when read from the functions below.
const out_404 = "404.log"            # URLs that answered HTTP 404
const out_3xx = "3xx.log"            # URLs that answered with a redirect
const out_external = "external.log"  # URLs pointing outside the crawled base
const out_fail = "fail.log"          # URLs with any other failure status
const out_sitemap = "sitemap.xml"    # generated sitemap of valid URLs
# Classify a single href found on page `url` and record it in the crawl sets.
# NOTE(review): pushes to `urls`/`urlsexternal` and reads `base`, none of which
# are parameters — presumably globals in this pre-refactor script; confirm.
function handlehref(url, href)
    # Remove URL fragment (#…)
    href = match(r"^(?<path>[^\#]*)", href)[:path]
    # The capture includes the "://" separator, e.g. "https://".
    href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+\:\/\/)?", href)[:protocol]
    if href_protocol !== nothing
        # BUGFIX: the capture is "http://"/"https://", so the previous
        # comparison against "http"/"https" never matched and every absolute
        # URL was ignored.
        if href_protocol == "http://" || href_protocol == "https://"
            # Index urls that start with the base URL
            # TODO: Allow both http and https interchangably
            if startswith(href, base)
                push!(urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored procotol $href_protocol"
        end
    elseif startswith(href, "/")
        # Absolute path: resolve against the base URL (drop the leading "/").
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(urls, abshref)
    else
        # Relative path: resolve against the current page URL.
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(urls, abshref)
    end
end
# Fetch `url` once (no redirects) and record the outcome in the crawl sets.
# NOTE(review): pushes to `urls404`, `urls3xx`, `urlsvalid`, `urlsfail` and
# `visited`, which are not parameters — presumably globals supplied by the
# surrounding (pre-refactor) script; confirm before reuse.
function visit(url, base)
@info "Visiting $url …"
res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
if res.status == 404
@info "Ignoring HTTP 404 status code url $url"
push!(urls404, url)
elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
push!(urls3xx, url)
# A redirect's target is taken from the Location header and indexed
# like any other discovered href.
for (name, value) in res.headers
if name == "Location"
@debug "Identified redirect $url to $value"
handlehref(url, value)
break
end
end
elseif res.status == 200
push!(urlsvalid, url)
# Scan the response body for href="…" attributes to discover new URLs.
body = String(res.body)
for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
href = m[:url]
handlehref(url, href)
end
else
# Any other status (5xx, 403, …) counts as a failure.
@debug "For $url response status is $(res.status)"
push!(urlsfail, url)
end
push!(visited, url)
end
# Write one URL per line to `filename`, overwriting any existing file.
function write_urls(filename, urls)
    open(filename, "w") do io
        foreach(entry -> println(io, entry), urls)
    end
end
# Serialize `urls` as a sitemaps.org <urlset> document into `filename`.
function write_sitemap(filename, urls)
    doc = XMLDocument()
    urlset = create_root(doc, "urlset")
    set_attribute(urlset, "xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
    for entry in urls
        # Each URL becomes <url><loc>…</loc></url>.
        locnode = new_child(new_child(urlset, "url"), "loc")
        add_text(locnode, entry)
    end
    save_file(doc, filename)
end
# Crawl all pages reachable from `base`, then write the URL log files and
# the sitemap of valid (HTTP 200) URLs.
# NOTE(review): the sets below are locals, but the companion `visit` /
# `handlehref` in this pre-refactor version push to same-named globals —
# confirm the wiring against the original script layout.
function crawl_and_generate(base)
    urls = Set()
    visited = Set()
    urlsvalid = Set()
    urls404 = Set()
    urls3xx = Set()
    urlsexternal = Set()
    urlsfail = Set()
    push!(urls, base)
    while length(visited) != length(urls)
        @info "Intermediate count: #urls: $(length(urls)), #visited: $(length(visited)), #valid $(length(urlsvalid)), #3xx $(length(urls3xx))…"
        # Iterate over a snapshot: `visit` may add new URLs to `urls`, and
        # mutating a Set while iterating it is undefined behavior.
        for url in collect(urls)
            # BUGFIX: arguments were reversed (`in(visited, url)`), which asked
            # whether the set occurs inside the string.
            if !(url in visited)
                visit(url, base)
            end
        end
    end
    @info "Checking url from base $base…"
    @info "Identified $(length(urls)) urls, $(length(urls404)) 404 urls, $(length(urlsexternal)) external urls, $(length(urls3xx)) 3xx urls, $(length(urlsfail)) fail."
    @info "Clearing files …"
    rm(out_404; force=true)
    rm(out_3xx; force=true)
    rm(out_external; force=true)
    rm(out_fail; force=true)
    rm(out_sitemap; force=true)
    @info "Writing url files …"
    write_urls(out_404, urls404)
    write_urls(out_3xx, urls3xx)
    write_urls(out_external, urlsexternal)
    write_urls(out_fail, urlsfail)
    @info "Writing sitemap $out_sitemap …"
    write_sitemap(out_sitemap, urlsvalid)
end
# Issue a single GET for `url` (redirects disabled) and log its validity.
function checkurl(url)
    response = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    status = response.status
    if status == 200
        @info "Valid URL $url"
    elseif HTTP.isredirect(response)
        @info "Invalid: Redirect at $url"
    else
        @info "Invalid: HTTP status code $(status) at $url"
    end
end
# Validate every URL recorded in the sitemap file `out_sitemap`.
#
# `base` is currently unused beyond selecting a debug message — rebasing onto
# `base` is still a TODO in this version.
function check(base)
    @info "Reading sitemap $out_sitemap"
    urls = String[]
    xdoc = LightXML.parse_file(out_sitemap)
    xurlset = LightXML.root(xdoc)
    if LightXML.name(xurlset) != "urlset"
        @error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
        exit(1)
    end
    for xurl in LightXML.child_nodes(xurlset)
        # The iterator also yields text/whitespace nodes; only elements matter.
        if !is_elementnode(xurl)
            continue
        end
        if LightXML.name(xurl) != "url"
            # BUGFIX: this message was dead code behind an unconditional
            # `continue`; log the unexpected element, then keep scanning.
            @error "Invalid sitemap. Expected <url>, found <$(LightXML.name(xurl))>."
            continue
        end
        if !has_children(xurl)
            @warn "Invalid sitemap. <url> must have a <loc> child."
            break
        end
        url = nothing
        for xloc in LightXML.child_nodes(xurl)
            # Skip non-element children here as well.
            if !is_elementnode(xloc)
                continue
            end
            url = LightXML.content(xloc)
            push!(urls, url)
        end
        if url === nothing
            @warn "Sitemap contains invalid url element with missing required loc element."
        end
    end
    LightXML.free(xdoc)
    @info "Found $(length(urls)) URLs"
    @info "Checking URLs …"
    for url in urls
        if !isempty(base)
            @debug "Checking URL $url (TODO) …"
            checkurl(url)
        else
            @debug "Checking URL $url …"
            checkurl(url)
        end
    end
end
# Entry point: exactly two CLI arguments (<action> and <param>) are required.
if length(ARGS) != 2
@error "Parameters <action> and <param> are required"
exit(1)
end
action = ARGS[1]
param = ARGS[2]
# NOTE(review): the block below is the pre-refactor dispatch; its closing
# `end` appears to have been lost in the diff view — confirm against history.
if action == "crawl"
# TODO: Make sure to use trailing slash
crawl_and_generate(param)
elseif action == "check"
check(param)
else
@error "Unknown action $action"
exit(1)
# Dispatch the requested action. An interrupt (Ctrl-C) is logged before the
# exception propagates; all exceptions are re-raised for a non-zero exit.
try
    if action == "crawl"
        # TODO: Make sure to use trailing slash
        crawl_and_generate(param; forcehttps=true)
    elseif action == "check"
        check(param)
    else
        @error "Unknown action $action"
        println("Usage: <action> <param>")
        println("<action>:")
        println(" crawl <base URL>")
        println(" check <base URL>")
        exit(1)
    end
catch ex
    if isa(ex, InterruptException)
        @info "Cancelled via interrupt"
    end
    # BUGFIX: rethrow() preserves the original backtrace; `throw(ex)` replaced
    # it with this frame's.
    rethrow()
end
# Accumulator for one crawl run. All fields are sets of URL strings.
# Fields were previously untyped (implicitly ::Any); concrete Set{String}
# fields make the struct type-stable without changing how callers use it.
struct CrawlData
    urls::Set{String}          # every URL discovered so far (incl. pending)
    visited::Set{String}       # URLs already fetched
    urlsvalid::Set{String}     # answered HTTP 200
    urls404::Set{String}       # answered HTTP 404
    urls3xx::Set{String}       # answered with a redirect
    urlsexternal::Set{String}  # outside the crawled base
    urlsfail::Set{String}      # any other failure status
    CrawlData() = new(Set{String}(), Set{String}(), Set{String}(), Set{String}(), Set{String}(), Set{String}(), Set{String}())
end
# Classify one href found on page `url` and record it in `data`.
#
# With `forcehttps=true`, http/https URLs are compared against `base` with
# both protocols stripped, so http:// and https:// links match the same site.
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove URL fragment (#…)
    href = match(r"^(?<path>[^\#]*)", href)[:path]
    # The capture includes the "://" separator, e.g. "https://".
    href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+\:\/\/)?", href)[:protocol]
    if href_protocol !== nothing
        # BUGFIX: compare against the full "http://"/"https://" capture; the
        # old comparison against "http"/"https" could never match.
        if href_protocol == "http://" || href_protocol == "https://"
            if forcehttps
                # Strip the protocol from both sides before comparing.
                href_noprot = chop(href; head=length(href_protocol), tail=0)
                # BUGFIX: `base_noprot` was an undefined variable.
                base_noprot = basechop(base)
                ismatch = startswith(href_noprot, base_noprot)
            else
                ismatch = startswith(href, base)
            end
            # Index urls that start with the base URL
            # BUGFIX: the computed `ismatch` flag was previously ignored and
            # `startswith(href, base)` re-tested, defeating forcehttps.
            if ismatch
                push!(data.urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(data.urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored procotol $href_protocol"
        end
    elseif startswith(href, "/")
        # Absolute path: resolve against the base URL (drop the leading "/").
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    else
        # Relative path: resolve against the current page URL.
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end
# Fetch `url` (no redirects) and update `data` according to the response.
# `forcehttps` is forwarded to `handlehref` for protocol-agnostic matching.
function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(data.urls3xx, url)
        # The redirect target (Location header) is indexed like any href.
        for (name, value) in res.headers
            if name == "Location"
                @debug "Identified redirect $url to $value"
                # BUGFIX: `forcehttps` was hard-coded to false here.
                handlehref(data, base, url, value; forcehttps=forcehttps)
                break
            end
        end
    elseif res.status == 200
        push!(data.urlsvalid, url)
        # Scan for new URLs on this page
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            # BUGFIX: `forcehttps` was hard-coded to false here as well.
            handlehref(data, base, url, href; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(data.urlsfail, url)
    end
    push!(data.visited, url)
end
# Persist `urls` to `filename`, one per line; existing content is replaced.
function write_urls(filename, urls)
    open(f -> foreach(u -> println(f, u), urls), filename, "w")
end
# Strip the leading "http://" or "https://" from `base` and return the rest.
#
# Throws an `ArgumentError` for any other prefix. (Previously the error branch
# logged a truncated message and implicitly returned `nothing`, which then
# failed far from the cause.)
function basechop(base)
    if startswith(base, "http://")
        return chop(base; head=length("http://"), tail=0)
    elseif startswith(base, "https://")
        return chop(base; head=length("https://"), tail=0)
    else
        throw(ArgumentError("Unexpected prefix in base URL $base; expected http:// or https://"))
    end
end
# Crawl all pages reachable from `base`, then write the URL log files and
# the sitemap of valid (HTTP 200) URLs.
#
# `base` must be an absolute http(s) URL with a trailing slash. With
# `forcehttps=true`, an http:// base is upgraded to https:// before crawling.
function crawl_and_generate(base; forcehttps=false)
    # BUGFIX: the named group was written `(<?protocol>…)`, which is not
    # named-group syntax, so the pattern never matched and every run exited
    # with "Failed to parse passed URL".
    m = match(r"(?<protocol>(http)|(https))\:\/\/[^\/]+(?<port>\:[0-9]+)?(?<trailslash>\/)?(?<path>)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end
    data = CrawlData()
    if forcehttps && startswith(base, "http://")
        # BUGFIX: a misplaced parenthesis passed `tail` to `length`, and the
        # protocol was dropped entirely instead of being replaced by https://.
        base = "https://" * chop(base; head=length("http://"), tail=0)
    end
    push!(data.urls, base)
    while length(data.visited) != length(data.urls)
        @info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
        # Iterate over a snapshot: `visit` adds new URLs to `data.urls`, and
        # mutating a Set while iterating it is undefined behavior.
        for url in collect(data.urls)
            # BUGFIX: arguments were reversed (`in(data.visited, url)`).
            if !(url in data.visited)
                visit(data, url, base; forcehttps=forcehttps)
            end
        end
    end
    @info "Identified $(length(data.urls)) urls, $(length(data.urls404)) 404 urls, $(length(data.urlsexternal)) external urls, $(length(data.urls3xx)) 3xx urls, $(length(data.urlsfail)) fail."
    @info "Clearing files …"
    rm(out_404; force=true)
    rm(out_3xx; force=true)
    rm(out_external; force=true)
    rm(out_fail; force=true)
    rm(out_sitemap; force=true)
    @info "Writing url files …"
    write_urls(out_404, data.urls404)
    write_urls(out_3xx, data.urls3xx)
    write_urls(out_external, data.urlsexternal)
    write_urls(out_fail, data.urlsfail)
    @info "Writing sitemap $out_sitemap …"
    Sitemap.write_sitemap(out_sitemap, data.urlsvalid)
end
# Sitemap serialization, kept in its own namespace.
module Sitemap
using LightXML

# Write `urls` as a sitemaps.org-conformant <urlset> XML document to `filename`.
function write_sitemap(filename, urls)
    doc = XMLDocument()
    urlset = create_root(doc, "urlset")
    set_attribute(urlset, "xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
    for entry in urls
        # Each URL becomes <url><loc>…</loc></url>.
        locnode = new_child(new_child(urlset, "url"), "loc")
        add_text(locnode, entry)
    end
    save_file(doc, filename)
end

end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment