Commit 39e68569 authored by Jan Klass

Fix

parent c7140e2a
......@@ -13,7 +13,8 @@ end
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path]
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+\:\/\/)?", href)[:protocol]
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", href)[:protocol]
if href_protocol != nothing
if href_protocol == "http" || href_protocol == "https"
if forcehttps
......@@ -23,10 +24,11 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false)
ismatch = startswith(href, base)
end
# Index urls that start with the base URL
if startswith(href, base)
if ismatch
push!(data.urls, href)
@debug "Match (absolute URL): $href"
else
@debug "External URL: $href"
push!(data.urlsexternal, href)
end
else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment