Commit 7202c156 authored by Jan Klass

ff

parent 94408e42
......@@ -55,7 +55,7 @@ function visit(data::CrawlData, url, base; forcehttps=false)
for (name, value) in res.headers
if name == "Location"
@debug "Identified redirect $url to $value"
handlehref(data, base, url, value; forcehttps=false)
handlehref(data, base, url, value; forcehttps=forcehttps)
break
end
end
......@@ -66,7 +66,7 @@ function visit(data::CrawlData, url, base; forcehttps=false)
body = String(res.body)
for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
href = m[:url]
handlehref(data, base, url, href; forcehttps=false)
handlehref(data, base, url, href; forcehttps=forcehttps)
end
else
@debug "For $url response status is $(res.status)"
......@@ -106,11 +106,10 @@ function crawl_and_generate(base; forcehttps=false)
data = CrawlData()
if forcehttps && startswith(base, "http://")
base = chop(base; head=length("http://", tail=0))
base = "https://" * chop(base; head=length("http://", tail=0))
end
push!(data.urls, base)
while length(data.visited) != length(data.urls)
@info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
for url in data.urls
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment