Commit 18721b8a authored by Jan Klass

Fix

parent 39e68569
......@@ -10,21 +10,20 @@ struct CrawlData
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
end
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path]
"""
    getprotocol(url)

Return the scheme that precedes `://` at the start of `url` (for example
`"http"` for `"http://example.com"`), or `nothing` when `url` does not start
with a `scheme://` prefix (relative paths, `mailto:` links, ...) or when the
scheme part before `://` is empty.
"""
function getprotocol(url)
    m = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", url)
    # `match` returns `nothing` when the pattern does not apply; indexing that
    # result would throw a MethodError, so report "no protocol" instead.
    return m === nothing ? nothing : m[:protocol]
end
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", href)[:protocol]
if href_protocol != nothing
if href_protocol == "http" || href_protocol == "https"
if forcehttps
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
if href_protocol == "http" && forcehttps
href_protocol = "https"
href_noprot = chop(href; head=length(href_protocol), tail=0)
ismatch = startswith(href_noprot, base_noprot)
else
ismatch = startswith(href, base)
href = "https" * href_noprot
end
if href_protocol == "http" || href_protocol == "https"
# Index urls that start with the base URL
if ismatch
if startswith(href, base)
push!(data.urls, href)
@debug "Match (absolute URL): $href"
else
......@@ -34,6 +33,15 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false)
else
@debug "Ignoring URL $href with ignored procotol $href_protocol"
end
end
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path]
href_protocol = getprotocol(href)
if href_protocol != nothing
handlehref_abs(data, base, url, href, href_protocol)
elseif startswith(href, "/")
abshref = base * chop(href; head=1, tail=0)
@debug "Match (absolute path): $abshref"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment