Commit 4182da01 authored by Jan Klass

Add ignored prefixes

parent e689ceeb
......@@ -97,6 +97,11 @@ function crawl_and_generate(base; forcehttps=false, limitnew=10)
exit(1)
end
# URL prefixes to skip: a URL starting with any of these is recorded in
# `data.urlsignored` (and marked visited) instead of being crawled.
urlsignoredprefixes = [
"https://kcode.de/wordpress/wp-json/",
"https://kcode.de/wordpress/tag/",
]
data = read_crawldata()
@info stringlengths(data)
......@@ -118,6 +123,13 @@ function crawl_and_generate(base; forcehttps=false, limitnew=10)
if visited > limitnew
break
end
# A URL under an ignored prefix is recorded as both visited and ignored.
# Marking it visited is what makes the `urlsvisited` membership check that
# follows skip the crawl for this URL.
# NOTE(review): the previous prefix loop ended with a `continue` that only
# advanced the prefix loop, never the surrounding URL loop, so it had no
# effect. A single insertion per URL is equivalent to that loop, assuming
# both collections are the `Set`s created by `CrawlData()`.
if any(pref -> startswith(url, pref), urlsignoredprefixes)
    push!(data.urlsvisited, url)
    push!(data.urlsignored, url)
end
if in(url, data.urlsvisited) == false
visited = visited + 1
visit(data, url, base; forcehttps=forcehttps)
......
......@@ -6,8 +6,9 @@ struct CrawlData
urls3xx
urlsexternal
urlsfail
urlsignored
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set(), Set())
end
function readurls(filename)
......@@ -38,6 +39,7 @@ function read_crawldata(data::CrawlData=CrawlData(), filenames::Filenames=Filena
read(data.urls3xx, filenames.urls3xx)
read(data.urlsexternal, filenames.urlsexternal)
read(data.urlsfail, filenames.urlsfail)
read(data.urlsignored, filenames.urlsignored)
return data
end
......@@ -51,6 +53,7 @@ function write_crawldata(data::CrawlData, filenames::Filenames=Filenames())
Sitemap.write_sitemap(filenames.urls3xx, data.urls3xx)
Sitemap.write_sitemap(filenames.urlsexternal, data.urlsexternal)
Sitemap.write_sitemap(filenames.urlsfail, data.urlsfail)
Sitemap.write_sitemap(filenames.urlsignored, data.urlsignored)
end
function stringlengths(data::CrawlData)
......@@ -61,5 +64,6 @@ function stringlengths(data::CrawlData)
#3xx $(length(data.urls3xx)),
#external $(length(data.urlsexternal)),
#fail $(length(data.urlsfail)),
#ignored $(length(data.urlsignored)),
"""
end
......@@ -6,6 +6,7 @@ struct Filenames
urls3xx
urlsexternal
urlsfail
urlsignored
Filenames() = new(
"sitemap.urls.xml",
......@@ -15,5 +16,6 @@ struct Filenames
"sitemap.3xx.xml",
"sitemap.external.xml",
"sitemap.fail.xml",
"sitemap.ignored.xml",
)
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment