Commit 39e68569 authored by Jan Klass

Fix

parent c7140e2a
...@@ -13,7 +13,8 @@ end ...@@ -13,7 +13,8 @@ end
function handlehref(data::CrawlData, base, url, href; forcehttps=false) function handlehref(data::CrawlData, base, url, href; forcehttps=false)
# Remove URL fragment (#) # Remove URL fragment (#)
href = match(r"^(?<path>[^\#]*)", href)[:path] href = match(r"^(?<path>[^\#]*)", href)[:path]
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+\:\/\/)?", href)[:protocol]
href_protocol = match(r"^(?<protocol>[a-zA-Z0-9]+)?\:\/\/", href)[:protocol]
if href_protocol != nothing if href_protocol != nothing
if href_protocol == "http" || href_protocol == "https" if href_protocol == "http" || href_protocol == "https"
if forcehttps if forcehttps
...@@ -23,10 +24,11 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false) ...@@ -23,10 +24,11 @@ function handlehref(data::CrawlData, base, url, href; forcehttps=false)
ismatch = startswith(href, base) ismatch = startswith(href, base)
end end
# Index urls that start with the base URL # Index urls that start with the base URL
if startswith(href, base) if ismatch
push!(data.urls, href) push!(data.urls, href)
@debug "Match (absolute URL): $href" @debug "Match (absolute URL): $href"
else else
@debug "External URL: $href"
push!(data.urlsexternal, href) push!(data.urlsexternal, href)
end end
else else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment