Commit 18721b8a authored by Jan Klass

Fix

parent 39e68569
...@@ -10,30 +10,38 @@ struct CrawlData ...@@ -10,30 +10,38 @@ struct CrawlData
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set()) CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
end end
"""
    getprotocol(url)

Return the scheme that precedes `://` at the very start of `url`
(for example `"http"`), or `nothing` when `url` does not start with a
`scheme://` prefix or when the scheme part before `://` is empty.
"""
function getprotocol(url)
    m = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", url)
    # `match` yields `nothing` for hrefs without a leading `://` prefix
    # (relative paths, `mailto:` links, ...). Indexing that result would throw,
    # so report "no protocol" explicitly instead.
    m === nothing && return nothing
    # The capture group is optional, so this is `nothing` for a bare "://...".
    return m[:protocol]
end
"""
    handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)

Classify an absolute `href` whose scheme is `href_protocol`.

`http`/`https` hrefs that start with `base` are pushed to `data.urls`; other
`http`/`https` hrefs are pushed to `data.urlsexternal`. Hrefs with any other
scheme are only logged at debug level.

# Arguments
- `data`: crawl state that receives the href.
- `base`: prefix an href must start with to be recorded in `data.urls`.
- `url`: not used by this function.
- `href`: the absolute link to classify.
- `href_protocol`: scheme of `href` (the part before `://`).

# Keywords
- `forcehttps`: when `true`, an `http` href is rewritten to `https` before it
  is compared with `base` and stored.
"""
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
    if href_protocol == "http" && forcehttps
        # Strip the ORIGINAL scheme before switching to "https". Measuring the
        # length after the reassignment would drop one character too many (the
        # colon) and produce "https//host/..." instead of "https://host/...".
        href_noprot = chop(href; head=length(href_protocol), tail=0)
        href_protocol = "https"
        href = href_protocol * href_noprot
    end
    if href_protocol == "http" || href_protocol == "https"
        # Index urls that start with the base URL
        if startswith(href, base)
            push!(data.urls, href)
            @debug "Match (absolute URL): $href"
        else
            @debug "External URL: $href"
            push!(data.urlsexternal, href)
        end
    else
        @debug "Ignoring URL $href with ignored procotol $href_protocol"
    end
end
function handlehref(data::CrawlData, base, url, href; forcehttps=false) function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#) # Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path] href = match(r"^(?<path>[^\#]*)", href)[:path]
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", href)[:protocol] href_protocol = getprotocol(href)
if href_protocol != nothing if href_protocol != nothing
if href_protocol == "http" || href_protocol == "https" handlehref_abs(data, base, url, href, href_protocol)
if forcehttps
href_noprot = chop(href; head=length(href_protocol), tail=0)
ismatch = startswith(href_noprot, base_noprot)
else
ismatch = startswith(href, base)
end
# Index urls that start with the base URL
if ismatch
push!(data.urls, href)
@debug "Match (absolute URL): $href"
else
@debug "External URL: $href"
push!(data.urlsexternal, href)
end
else
@debug "Ignoring URL $href with ignored procotol $href_protocol"
end
elseif startswith(href, "/") elseif startswith(href, "/")
abshref = base * chop(href; head=1, tail=0) abshref = base * chop(href; head=1, tail=0)
@debug "Match (absolute path): $abshref" @debug "Match (absolute path): $abshref"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment