Commit e689ceeb authored by Jan Klass's avatar Jan Klass

Fixes and improvements

Limit the number of new requests per run

Store and read partial data
parent 04bbdca7
@@ -15,46 +15,7 @@ function rebase(url, base)
end
function check(base)
@info "Reading sitemap $out_sitemap"
urls = []
xdoc = LightXML.parse_file(out_sitemap)
xurlset = LightXML.root(xdoc)
if LightXML.name(xurlset) != "urlset"
@error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
exit(1)
end
for xurl in LightXML.child_nodes(xurlset)
# child_nodes also yields non-element nodes (text, whitespace),
# so skip anything that is not an element node.
if !is_elementnode(xurl)
continue
end
if LightXML.name(xurl) != "url"
# Skip non-<url> elements rather than treating them as fatal.
continue
end
if !has_children(xurl)
@warn "Invalid sitemap. <url> must have a <loc> child."
continue
end
# xurl = XMLElement(xurl)
url = nothing
for xloc in LightXML.child_nodes(xurl)
# child_nodes also yields non-element nodes (text, whitespace),
# so skip anything that is not an element node.
if !is_elementnode(xloc)
continue
end
url = LightXML.content(xloc)
push!(urls, url)
end
if url === nothing
@warn "Sitemap contains an invalid <url> element missing its required <loc> child."
end
end
LightXML.free(xdoc)
urls = Sitemap.read_sitemap(out_sitemap)
@info "Found $(length(urls)) URLs"
@info "Checking URLs …"
......
using HTTP
include("Sitemap.jl")
include("filenames.jl")
include("crawldata.jl")
include("crawl.jl")
include("check.jl")
out_404 = "404.log"
out_3xx = "3xx.log"
out_external = "external.log"
out_fail = "fail.log"
out_sitemap = "sitemap.xml"
filenames = Filenames()
if length(ARGS) != 2
@error "Parameters <action> and <param> are required"
......
struct CrawlData
urls
visited
urlsvalid
urls404
urls3xx
urlsexternal
urlsfail
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
end
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
if href_protocol == "http" && forcehttps
href_noprot = chop(href; head=length(href_protocol), tail=0)
@@ -78,7 +66,7 @@ function visit(data::CrawlData, url, base; forcehttps=false)
@debug "For $url response status is $(res.status)"
push!(data.urlsfail, url)
end
push!(data.visited, url)
push!(data.urlsvisited, url)
end
function write_urls(filename, urls)
@@ -99,7 +87,7 @@ function basechop(base)
end
end
function crawl_and_generate(base; forcehttps=false)
function crawl_and_generate(base; forcehttps=false, limitnew=10)
m = match(r"(<?protocol>(http)|(https))\:\/\/[^\/]+(?<port>\:[0-9]+)?(?<trailslash>\/)?(?<path>)?", base)
if m == nothing
@error "Failed to parse passed URL"
@@ -109,37 +97,35 @@ function crawl_and_generate(base; forcehttps=false)
exit(1)
end
data = CrawlData()
data = read_crawldata()
@info stringlengths(data)
if forcehttps && startswith(base, "http://")
base = "https://" * chop(base; head=length("http://", tail=0))
end
push!(data.urls, base)
while length(data.visited) != length(data.urls)
@info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
visited = 0
while length(data.urlsvisited) != length(data.urls)
if visited >= limitnew
break
end
@info "Intermediate count: $(stringlengths(data)) …"
for url in collect(data.urls) # iterate a snapshot; visit() may grow data.urls
if in(data.visited, url) == false
if visited >= limitnew
break
end
if !(url in data.urlsvisited)
visited += 1
visit(data, url, base; forcehttps=forcehttps)
end
end
end
@info "Identified $(length(data.urls)) urls, $(length(data.urls404)) 404 urls, $(length(data.urlsexternal)) external urls, $(length(data.urls3xx)) 3xx urls, $(length(data.urlsfail)) fail."
@info "Clearing files …"
rm(out_404; force=true)
rm(out_3xx; force=true)
rm(out_external; force=true)
rm(out_fail; force=true)
rm(out_sitemap; force=true)
@info "Writing url files …"
write_urls(out_404, data.urls404)
write_urls(out_3xx, data.urls3xx)
write_urls(out_external, data.urlsexternal)
write_urls(out_fail, data.urlsfail)
@info "Identified $(stringlengths(data))"
@info "Writing sitemap $out_sitemap …"
Sitemap.write_sitemap(out_sitemap, data.urlsvalid)
write_crawldata(data)
end
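With the new keyword in place, each invocation visits at most a bounded number of new URLs and persists its state via write_crawldata, so a long crawl can be split across runs. A minimal usage sketch, assuming the functions above (the base URL is hypothetical):

# First run: seeds the stored crawl state and visits up to limitnew new URLs.
crawl_and_generate("https://example.com/"; forcehttps=true, limitnew=10)
# A later run restores the stored state via read_crawldata(), so only
# URLs that are still unvisited count against the limit.
crawl_and_generate("https://example.com/"; forcehttps=true, limitnew=50)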
struct CrawlData
urls
urlsvisited
urlsvalid
urls404
urls3xx
urlsexternal
urlsfail
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
end
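The struct keeps one Set per URL category, and the inner constructor fills all seven fields positionally with empty Sets. A quick sketch of how the crawler treats it (the URL is hypothetical):

data = CrawlData()
push!(data.urls, "https://example.com/")        # discovered
push!(data.urlsvisited, "https://example.com/") # fetched
# Unvisited URLs are those in data.urls that are not yet in data.urlsvisited.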
function readurls(filename)
if isfile(filename)
return Sitemap.read_sitemap(filename)
else
return nothing
end
end
# Note: this local `read` shadows Base.read within this file.
function read(set::Set, filename)
urls = readurls(filename)
if urls == nothing
return
end
for url in urls
push!(set, url)
end
end
function read_crawldata(data::CrawlData=CrawlData(), filenames::Filenames=Filenames())::CrawlData
@info "Reading sitemap files …"
read(data.urls, filenames.urls)
read(data.urlsvisited, filenames.urlsvisited)
read(data.urlsvalid, filenames.urlsvalid)
read(data.urls404, filenames.urls404)
read(data.urls3xx, filenames.urls3xx)
read(data.urlsexternal, filenames.urlsexternal)
read(data.urlsfail, filenames.urlsfail)
return data
end
function write_crawldata(data::CrawlData, filenames::Filenames=Filenames())
@info "Writing sitemap files …"
Sitemap.write_sitemap(filenames.urls, data.urls)
Sitemap.write_sitemap(filenames.urlsvisited, data.urlsvisited)
Sitemap.write_sitemap(filenames.urlsvalid, data.urlsvalid)
Sitemap.write_sitemap(filenames.urls404, data.urls404)
Sitemap.write_sitemap(filenames.urls3xx, data.urls3xx)
Sitemap.write_sitemap(filenames.urlsexternal, data.urlsexternal)
Sitemap.write_sitemap(filenames.urlsfail, data.urlsfail)
end
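Together, read_crawldata and write_crawldata round-trip the crawl state through the per-category sitemap files named by Filenames(). A small sketch, assuming the defaults defined below (the URL is hypothetical):

data = CrawlData()
push!(data.urls, "https://example.com/")
push!(data.urlsvisited, "https://example.com/")
write_crawldata(data)       # writes sitemap.urls.xml, sitemap.visited.xml, …
restored = read_crawldata() # categories whose files do not exist yet are skipped
@assert "https://example.com/" in restored.urlsvisited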
function stringlengths(data::CrawlData)
return """#urls: $(length(data.urls)),
#visited: $(length(data.urlsvisited)),
#valid: $(length(data.urlsvalid)),
#404: $(length(data.urls404)),
#3xx: $(length(data.urls3xx)),
#external: $(length(data.urlsexternal)),
#fail: $(length(data.urlsfail))
"""
end
struct Filenames
urls
urlsvisited
urlsvalid
urls404
urls3xx
urlsexternal
urlsfail
Filenames() = new(
"sitemap.urls.xml",
"sitemap.visited.xml",
"sitemap.valid.xml",
"sitemap.404.xml",
"sitemap.3xx.xml",
"sitemap.external.xml",
"sitemap.fail.xml",
)
end
module Sitemap
using LightXML
export write_sitemap
export write_sitemap, read_sitemap
function read_sitemap(filename)
@info "Reading sitemap file $filename …"
urls = []
xdoc = LightXML.parse_file(filename)
xurlset = LightXML.root(xdoc)
if LightXML.name(xurlset) != "urlset"
@error "Invalid sitemap. Expected <urlset>, found <$(LightXML.name(xurlset))>."
exit(1)
end
for xurl in LightXML.child_nodes(xurlset)
# child_nodes also yields non-element nodes (text, whitespace),
# so skip anything that is not an element node.
if !is_elementnode(xurl)
continue
end
if LightXML.name(xurl) != "url"
# Skip non-<url> elements rather than treating them as fatal.
continue
end
if !has_children(xurl)
@warn "Invalid sitemap. <url> must have a <loc> child."
continue
end
# xurl = XMLElement(xurl)
url = nothing
for xloc in LightXML.child_nodes(xurl)
# Apparently we iterate more things than we can see in the XML.
# Skip anything that is not an element node.
if !is_elementnode(xloc)
continue
end
url = LightXML.content(xloc)
push!(urls, url)
end
if url === nothing
@warn "Sitemap contains an invalid <url> element missing its required <loc> child."
end
end
LightXML.free(xdoc)
return urls
end
function write_sitemap(filename, urls)
xdoc = XMLDocument()
......
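read_sitemap is the inverse of write_sitemap and returns the <loc> contents as a flat list. A round-trip sketch through a temporary file, assuming the module above is included (the URLs are hypothetical):

tmp = tempname() * ".xml"
Sitemap.write_sitemap(tmp, ["https://example.com/", "https://example.com/about"])
urls = Sitemap.read_sitemap(tmp)
@assert length(urls) == 2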