Commit 828a6295 authored by Jan Klass

BROKEN: More changes

Last changes I made for checking my website
parent 4182da01
function checkurl(url)
function checkurl(data::CrawlData, url)
push!(data.urlsvisited, url)
# Don't follow redirects and don't throw on non-2xx status, so the branches below can classify the response.
res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false, require_ssl_verification=false)
if res.status == 200
@info "Valid URL $url"
push!(data.urlsvalid, url)
elseif HTTP.isredirect(res)
@info "Invalid: Redirect at $url"
push!(data.urls3xx, url)
target = nothing
for (key, value) in res.headers
if lowercase(key) == "location"
target = value
break
end
end
if target === nothing
@warn "No redirect target on redirect at $url"
push!(data.urlsfail, url)
else
@info "Checking redirect target $target …"
checkurl(data, target)
end
else
@info "Invalid: HTTP status code $(res.status) at $url"
push!(data.urlsfail, url)
end
end
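For context: the reworked checkurl expects a CrawlData container with one URL set per outcome. That type is not part of this hunk, so the sketch below only illustrates what it presumably looks like; the field names are inferred from the calls in this commit, and the Set{String} element type is an assumption.

# Hypothetical sketch; the real CrawlData definition is not shown in this commit.
struct CrawlData
    urls::Set{String}         # URLs queued for checking
    urlsvisited::Set{String}  # URLs already requested
    urlsvalid::Set{String}    # answered with HTTP 200
    urls404::Set{String}      # answered with HTTP 404
    urls3xx::Set{String}      # redirect responses
    urlsexternal::Set{String} # URLs outside the crawled base
    urlsfail::Set{String}     # any other failure
    urlsignored::Set{String}  # explicitly skipped URLs
end
# Convenience constructor that starts with empty sets.
CrawlData() = CrawlData((Set{String}() for _ in 1:8)...)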
@@ -15,19 +32,51 @@ function rebase(url, base)
end
function check(base)
urls = Sitemap.read_sitemap(out_sitemap)
data = CrawlData()
fnin::Filenames = Filenames()
fnout = Filenames(
"new.sitemap.urls.xml",
"new.sitemap.visited.xml",
"new.sitemap.valid.xml",
"new.sitemap.404.xml",
"new.sitemap.3xx.xml",
"new.sitemap.external.xml",
"new.sitemap.fail.xml",
"new.sitemap.ignored.xml",
)
#read(data.urls, fnin.urls)
read(data.urls, fnin.urlsvalid)
@info "Found $(length(urls)) URLs"
@info "Checking URLs …"
for url in urls
if length(base) > 0
@debug "Checking URL $url …"
url = rebase(url, base)
checkurl(url)
else
@debug "Checking URL $url …"
checkurl(url)
read(data.urlsvisited, fnout.urlsvisited)
read(data.urlsvalid, fnout.urlsvalid)
read(data.urls404, fnout.urls404)
read(data.urls3xx, fnout.urls3xx)
read(data.urlsfail, fnout.urlsfail)
left = length(data.urls)
limit = 1000
i = 0
@info "Checking $(left) URLs, $(length(data.urlsvisited)) already checked …"
for url in data.urls
if i > limit
break
end
if left % 100 == 0
@info "$left left"
end
left = left - 1
if !(url in data.urlsvisited)
i = i + 1
if length(base) > 0
@debug "Checking URL $url …"
url = rebase(url, base)
checkurl(data, url)
else
@debug "Checking URL $url …"
checkurl(data, url)
end
end
end
write_crawldata(data, fnout)
end
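check finishes with write_crawldata(data, fnout), which this diff does not include. Presumably it writes each URL set to its matching output file; a rough sketch under that assumption, reusing Sitemap.write_sitemap:

# Hypothetical sketch; the actual write_crawldata is not part of this diff.
function write_crawldata(data::CrawlData, fn::Filenames)
    Sitemap.write_sitemap(fn.urls, data.urls)
    Sitemap.write_sitemap(fn.urlsvisited, data.urlsvisited)
    Sitemap.write_sitemap(fn.urlsvalid, data.urlsvalid)
    Sitemap.write_sitemap(fn.urls404, data.urls404)
    Sitemap.write_sitemap(fn.urls3xx, data.urls3xx)
    Sitemap.write_sitemap(fn.urlsexternal, data.urlsexternal)
    Sitemap.write_sitemap(fn.urlsfail, data.urlsfail)
    Sitemap.write_sitemap(fn.urlsignored, data.urlsignored)
end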
@@ -87,7 +87,7 @@ function basechop(base)
end
end
function crawl_and_generate(base; forcehttps=false, limitnew=10)
function crawl_and_generate(base; forcehttps=false, limitnew=1000)
m = match(r"(<?protocol>(http)|(https))\:\/\/[^\/]+(?<port>\:[0-9]+)?(?<trailslash>\/)?(?<path>)?", base)
if m == nothing
@error "Failed to parse passed URL"
......
@@ -18,4 +18,23 @@ struct Filenames
"sitemap.fail.xml",
"sitemap.ignored.xml",
)
Filenames(
urls
, urlsvisited
, urlsvalid
, urls404
, urls3xx
, urlsexternal
, urlsfail
, urlsignored
) = new(
urls
, urlsvisited
, urlsvalid
, urls404
, urls3xx
, urlsexternal
, urlsfail
, urlsignored
)
end
module Sitemap
using LightXML
export write_sitemap, read_sitemap
function read_sitemap(filename)
@info "Reading sitemap file $filename …"
urls = []
xdoc = LightXML.parse_file(filename)
xurlset = LightXML.root(xdoc)
if LightXML.name(xurlset) != "urlset"
@error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
exit(1)
end
for xurl in LightXML.child_nodes(xurlset)
# Apparently we iterate more things than we can see in the XML.
# Skip anything that is not an element node.
if !is_elementnode(xurl)
continue
end
if LightXML.name(xurl) != "url"
continue
@error "Invalid sitemap. Expected <url>, found <$(LightXML.name(xurl))>."
exit(1)
end
if !has_children(xurl)
@warn "Invalid sitemap. <url> must have a <loc> child."
break
end
# xurl = XMLElement(xurl)
url = nothing
for xloc in LightXML.child_nodes(xurl)
# Apparently we iterate more things than we can see in the XML.
# Skip anything that is not an element node.
if !is_elementnode(xloc)
continue
end
# Only the <loc> child carries the URL; skip siblings such as <lastmod>.
if LightXML.name(xloc) != "loc"
continue
end
url = LightXML.content(xloc)
push!(urls, url)
end
if url === nothing
@warn "Sitemap contains invalid url element with missing required loc element."
end
end
LightXML.free(xdoc)
return urls
end
function write_sitemap(filename, urls)
xdoc = XMLDocument()
xurlset = create_root(xdoc, "urlset")
set_attribute(xurlset, "xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
for url in urls
xurl = new_child(xurlset, "url")
xloc = new_child(xurl, "loc")
add_text(xloc, url)
end
save_file(xdoc, filename)
end
# Collect a set into a sorted vector so sitemap output is deterministic.
function sort(set::Set)
vec = Vector()
for val in set
push!(vec, val)
end
sort!(vec)
return vec
end
function write_sitemap(filename, urls::Set)
write_sitemap(filename, sort(urls))
end
end
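The Sitemap module can be exercised on its own; a small usage example (the file names here are illustrative, not taken from the repository):

# Read an existing sitemap, report the count, and write a sorted copy.
urls = Sitemap.read_sitemap("sitemap.xml")
@info "Read $(length(urls)) URLs"
# Passing a Set dispatches to the method that sorts before writing.
Sitemap.write_sitemap("sitemap.sorted.xml", Set(urls))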