Commit 04bbdca7 authored by Jan Klass


parent 18721b8a
......@@ -10,14 +10,10 @@ struct CrawlData
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
function getprotocol(url)
match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", url)[:protocol]
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
if href_protocol == "http" && forcehttps
href_protocol = "https"
href_noprot = chop(href; head=length(href_protocol), tail=0)
href_protocol = "https"
href = "https" * href_noprot
......@@ -39,9 +35,9 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path]
href_protocol = getprotocol(href)
href_protocol = match(r"^((?<protocol>[a-zA-Z0-9]+)?\:\/\/)?", href)[:protocol]
if href_protocol != nothing
handlehref_abs(data, base, url, href, href_protocol)
handlehref_abs(data, base, url, href, href_protocol; forcehttps=forcehttps)
elseif startswith(href, "/")
abshref = base * chop(href; head=1, tail=0)
@debug "Match (absolute path): $abshref"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment