# crawl.jl
#
# Crawl a site starting from a base URL: follow internal links, record
# valid pages, redirects, 404s, external links, and failures, then write
# the resulting URL lists and a sitemap to disk.

using HTTP

# `Sitemap` (providing write_sitemap) is assumed to be supplied elsewhere
# in this project, e.g. via include("sitemap.jl").

# Output file names used by crawl_and_generate(). These are assumed
# defaults; adjust to the project's actual paths as needed.
const out_404 = "urls_404.txt"
const out_3xx = "urls_3xx.txt"
const out_external = "urls_external.txt"
const out_fail = "urls_fail.txt"
const out_sitemap = "sitemap.xml"
struct CrawlData
    urls          # all discovered internal URLs
    visited       # URLs that have been fetched
    urlsvalid     # URLs that returned HTTP 200
    urls404       # URLs that returned HTTP 404
    urls3xx       # URLs that returned a 3xx redirect
    urlsexternal  # URLs pointing outside the base URL
    urlsfail      # URLs with any other failure status

    CrawlData() = new(Set(), Set(), Set(), Set(), Set(), Set(), Set())
end

function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove URL fragment (#)
    href = match(r"^(?<path>[^#]*)", href)[:path]
    # Capture the protocol name without the "://" separator
    protocolmatch = match(r"^(?<protocol>[a-zA-Z0-9]+)://", href)
    if protocolmatch !== nothing
        href_protocol = protocolmatch[:protocol]
        if href_protocol == "http" || href_protocol == "https"
            if forcehttps
                # Compare without protocols so http:// links to the
                # https:// base still count as internal, and normalize
                # matches to https
                ismatch = startswith(basechop(href), basechop(base))
                if ismatch
                    href = "https://" * basechop(href)
                end
            else
                ismatch = startswith(href, base)
            end
            # Index URLs that start with the base URL
            if ismatch
                push!(data.urls, href)
                @debug "Match (absolute URL): $href"
            else
                push!(data.urlsexternal, href)
            end
        else
            @debug "Ignoring URL $href with ignored protocol $href_protocol"
        end
    elseif startswith(href, "/")
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    else
        # Naive join: assumes the current page URL ends with "/"
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end
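
# Illustration (not part of the crawler): how handlehref classifies links,
# assuming base = "https://example.com/" and url = "https://example.com/blog/".
# All URLs here are hypothetical.
#
#   handlehref(data, base, url, "https://example.com/about")
#       -> pushed to data.urls (absolute URL under the base)
#   handlehref(data, base, url, "https://other.org/page")
#       -> pushed to data.urlsexternal
#   handlehref(data, base, url, "/contact")
#       -> resolved to "https://example.com/contact", pushed to data.urls
#   handlehref(data, base, url, "post#comments")
#       -> fragment stripped, resolved to "https://example.com/blog/post"
#   handlehref(data, base, url, "ftp://example.com/file")
#       -> ignored (protocol other than http/https)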

function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(data.urls3xx, url)

        for (name, value) in res.headers
            if name == "Location"
                @debug "Identified redirect $url to $value"
                handlehref(data, base, url, value; forcehttps=forcehttps)
                break
            end
        end
    elseif res.status == 200
        push!(data.urlsvalid, url)

        # Scan for new URLs on this page
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(data, base, url, href; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(data.urlsfail, url)
    end
    push!(data.visited, url)
end

function write_urls(filename, urls)
    open(filename, "w") do f
        for url in urls
            println(f, url)
        end
    end
end

function basechop(base)
    if startswith(base, "http://")
        return chop(base; head=length("http://"), tail=0)
    elseif startswith(base, "https://")
        return chop(base; head=length("https://"), tail=0)
    else
        @error "Unexpected prefix "
    end
end
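
# For example (hypothetical URL): basechop("https://example.com/")
# returns "example.com/".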

function crawl_and_generate(base; forcehttps=false)
    # Validate the base URL: protocol, host, optional port, trailing slash
    m = match(r"^(?<protocol>https?)://[^/:]+(?<port>:[0-9]+)?(?<trailslash>/)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end

    data = CrawlData()

    if forcehttps && startswith(base, "http://")
        base = "https://" * chop(base; head=length("http://", tail=0))
    end
 
    push!(data.urls, base)
    while length(data.visited) != length(data.urls)
        @info "Intermediate count: #urls: $(length(data.urls)), #visited: $(length(data.visited)), #valid $(length(data.urlsvalid)), #3xx $(length(data.urls3xx))…"
        # Iterate over a snapshot: visit() may add new URLs to data.urls,
        # and mutating a Set while iterating over it is undefined behavior
        for url in setdiff(data.urls, data.visited)
            visit(data, url, base; forcehttps=forcehttps)
        end
    end

    @info "Identified $(length(data.urls)) urls, $(length(data.urls404)) 404 urls, $(length(data.urlsexternal)) external urls, $(length(data.urls3xx)) 3xx urls, $(length(data.urlsfail)) fail."

    @info "Clearing files …"
    rm(out_404; force=true)
    rm(out_3xx; force=true)
    rm(out_external; force=true)
    rm(out_fail; force=true)
    rm(out_sitemap; force=true)

    @info "Writing url files …"
    write_urls(out_404, data.urls404)
    write_urls(out_3xx, data.urls3xx)
    write_urls(out_external, data.urlsexternal)
    write_urls(out_fail, data.urlsfail)

    @info "Writing sitemap $out_sitemap …"
    Sitemap.write_sitemap(out_sitemap, data.urlsvalid)
end
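
# Example invocation (a minimal sketch; the URL is hypothetical and must
# include the trailing slash that crawl_and_generate() enforces):
#
#   crawl_and_generate("https://example.com/"; forcehttps=true)
#
# This crawls the site starting at the base URL and writes the URL lists
# and the sitemap to the output files defined at the top of this file.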