Commit 18721b8a authored by Jan Klass

Fix

parent 39e68569
...@@ -10,21 +10,20 @@ struct CrawlData ...@@ -10,21 +10,20 @@ struct CrawlData
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set()) CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
end end
function handlehref(data::CrawlData, base, url, href; forcehttps=false) function getprotocol(url)
# Remove URL fragment (#) match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", url)[:protocol]
href = match(r"^(?<path>[^\#]*)", href)[:path] end
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", href)[:protocol] function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
if href_protocol != nothing if href_protocol == "http" && forcehttps
if href_protocol == "http" || href_protocol == "https" href_protocol = "https"
if forcehttps
href_noprot = chop(href; head=length(href_protocol), tail=0) href_noprot = chop(href; head=length(href_protocol), tail=0)
ismatch = startswith(href_noprot, base_noprot) href = "https" * href_noprot
else
ismatch = startswith(href, base)
end end
if href_protocol == "http" || href_protocol == "https"
# Index urls that start with the base URL # Index urls that start with the base URL
if ismatch if startswith(href, base)
push!(data.urls, href) push!(data.urls, href)
@debug "Match (absolute URL): $href" @debug "Match (absolute URL): $href"
else else
...@@ -34,6 +33,15 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false) ...@@ -34,6 +33,15 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false)
else else
@debug "Ignoring URL $href with ignored procotol $href_protocol" @debug "Ignoring URL $href with ignored procotol $href_protocol"
end end
end
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path]
href_protocol = getprotocol(href)
if href_protocol != nothing
handlehref_abs(data, base, url, href, href_protocol)
elseif startswith(href, "/") elseif startswith(href, "/")
abshref = base * chop(href; head=1, tail=0) abshref = base * chop(href; head=1, tail=0)
@debug "Match (absolute path): $abshref" @debug "Match (absolute path): $abshref"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment