# crawldata.jl (committed by Jan Klass)
"""
    CrawlData()

Container holding seven independent, initially empty sets of URLs.

The fields are `urls`, `urlsvisited`, `urlsvalid`, `urls404`, `urls3xx`,
`urlsexternal` and `urlsfail`. Judging by their names they partition crawl results by
outcome (NOTE(review): the exact meaning of each set is defined by the crawler, not
here — confirm against the caller).

The struct itself is immutable, but the sets it holds are mutable and are filled in
place. The element type is deliberately left as `Any` because the element type returned
by the sitemap reader is not visible from this file; the fields are nevertheless
declared with the concrete type `Set{Any}` so that field access is type-stable.
"""
struct CrawlData
    urls::Set{Any}
    urlsvisited::Set{Any}
    urlsvalid::Set{Any}
    urls404::Set{Any}
    urls3xx::Set{Any}
    urlsexternal::Set{Any}
    urlsfail::Set{Any}

    # Only a zero-argument constructor is provided: every set starts out empty.
    function CrawlData()
        return new(
            Set{Any}(),
            Set{Any}(),
            Set{Any}(),
            Set{Any}(),
            Set{Any}(),
            Set{Any}(),
            Set{Any}(),
        )
    end
end

"""
    readurls(filename)

Return the result of `Sitemap.read_sitemap(filename)` if `filename` refers to an
existing file, and `nothing` otherwise.
"""
function readurls(filename)
    # Guard clause: a missing file is not an error, it simply yields no URLs.
    isfile(filename) || return nothing
    return Sitemap.read_sitemap(filename)
end

"""
    read(set::Set, filename)

Add every URL obtained from `readurls(filename)` to `set`, modifying `set` in place.

When `readurls` returns `nothing` (the file does not exist), `set` is left untouched.
Always returns `nothing`.
"""
function read(set::Set, filename)
    urls = readurls(filename)
    # Compare by identity: `==` is an overloadable generic function and is not
    # guaranteed to yield a `Bool` for arbitrary return values, whereas `===` is.
    urls === nothing && return nothing
    union!(set, urls)
    return nothing
end

"""
    read_crawldata(data::CrawlData=CrawlData(), filenames::Filenames=Filenames())

Fill the URL sets of `data` from the files named by the same-named fields of
`filenames`, and return `data`.

Each set is populated through `read`, which adds to the existing content of the set.
Logs an info message before reading.

# Arguments
- `data`: the container to fill in place; defaults to a fresh `CrawlData()`.
- `filenames`: object providing one file name per URL set; defaults to `Filenames()`.

# Returns
The same `data` object that was passed in (or the newly created one).
"""
function read_crawldata(
    data::CrawlData=CrawlData(), filenames::Filenames=Filenames()
)::CrawlData
    @info "Reading sitemap files …"

    # `data` and `filenames` use identical property names for corresponding entries.
    fields = (:urls, :urlsvisited, :urlsvalid, :urls404, :urls3xx, :urlsexternal, :urlsfail)
    for field in fields
        read(getproperty(data, field), getproperty(filenames, field))
    end

    return data
end

"""
    write_crawldata(data::CrawlData, filenames::Filenames=Filenames())

Write each URL set of `data` to the file named by the same-named field of `filenames`
using `Sitemap.write_sitemap`, in field order from `urls` to `urlsfail`.

Logs an info message before writing. Returns whatever the final
`Sitemap.write_sitemap` call (for `urlsfail`) returns.
"""
function write_crawldata(data::CrawlData, filenames::Filenames=Filenames())
    @info "Writing sitemap files …"

    fields = (:urls, :urlsvisited, :urlsvalid, :urls404, :urls3xx, :urlsexternal, :urlsfail)
    # Track the most recent result so the value of the last write is what gets returned.
    result = nothing
    for field in fields
        target = getproperty(filenames, field)
        urlset = getproperty(data, field)
        result = Sitemap.write_sitemap(target, urlset)
    end
    return result
end

"""
    stringlengths(data::CrawlData)

Return a multi-line `String` listing the number of elements in each URL set of `data`,
one set per line, each line ending in a comma and a newline.
"""
function stringlengths(data::CrawlData)
    len(field) = length(getproperty(data, field))

    # One entry per output line. The first line keeps a trailing space after its comma.
    lines = (
        "#urls: $(len(:urls)), ",
        "#visited: $(len(:urlsvisited)),",
        "#valid $(len(:urlsvalid)),",
        "#404 $(len(:urls404)),",
        "#3xx $(len(:urls3xx)),",
        "#external $(len(:urlsexternal)),",
        "#fail $(len(:urlsfail)),",
    )

    # Every line, including the last one, is terminated by a newline.
    return join(lines, "\n") * "\n"
end