crawl.jl 4.44 KB
Newer Older
Jan Klass's avatar
Jan Klass committed
1 2 3
"""
    handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)

Classify the absolute URL `href`, whose leading scheme is `href_protocol`.

The scheme is compared case-insensitively and written back in lower case. With
`forcehttps=true` an `http` scheme is rewritten to `https` before classification.

- `http`/`https` URLs that start with `base` are pushed to `data.urls`.
- Other `http`/`https` URLs are pushed to `data.urlsexternal`.
- URLs with any other scheme are only reported through a debug log message.

`href` must start with `href_protocol`. `url` is accepted for signature symmetry with
`handlehref` and is not used here.
"""
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
    # Everything after the scheme, e.g. "://host/path".
    href_noprot = chop(href; head=length(href_protocol), tail=0)

    # URL schemes are case-insensitive, so normalise before comparing.
    scheme = lowercase(href_protocol)
    if scheme == "http" && forcehttps
        scheme = "https"
    end

    if scheme == "http" || scheme == "https"
        href = scheme * href_noprot
        # Index urls that start with the base URL
        if startswith(href, base)
            push!(data.urls, href)
            @debug "Match (absolute URL): $href"
        else
            @debug "External URL: $href"
            push!(data.urlsexternal, href)
        end
    else
        @debug "Ignoring URL $href with ignored protocol $href_protocol"
    end
end

22 23 24
"""
    handlehref(data::CrawlData, base, url, href; forcehttps=false)

Register the link target `href`, found while processing the page `url`, in `data`.

The fragment (everything from the first `#`) is removed first. Afterwards:

- An href of the form `scheme://...` is delegated to `handlehref_abs`.
- An href that carries a scheme but no `//` (for example `mailto:` or `javascript:`)
  is ignored with a debug log message.
- A protocol-relative href (`//host/path`) is completed with the scheme of `base` and
  delegated to `handlehref_abs`.
- An href starting with `/` is appended to `base` without its leading slash and pushed
  to `data.urls`.
- Anything else is appended to `url` and pushed to `data.urls`.
"""
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove URL fragment (#)
    href = match(r"^(?<path>[^\#]*)", href)[:path]

    href_protocol = match(r"^((?<protocol>[a-zA-Z0-9]+)?\:\/\/)?", href)[:protocol]
    if href_protocol !== nothing
        handlehref_abs(data, base, url, href, href_protocol; forcehttps=forcehttps)
    elseif occursin(r"^[a-zA-Z][a-zA-Z0-9+.\-]*:", href)
        # A scheme without "//" (mailto:, tel:, javascript:, data:, ...) is not a
        # crawlable page and must not be glued onto the current URL.
        @debug "Ignoring non-hierarchical URL $href"
    elseif startswith(href, "//")
        # Protocol-relative reference: it names a host, so it inherits the scheme of
        # the base URL instead of being treated as a path below it.
        scheme = startswith(base, "https://") ? "https" : "http"
        handlehref_abs(data, base, url, scheme * ":" * href, scheme; forcehttps=forcehttps)
    elseif startswith(href, "/")
        # Drop the leading slash of href; the concatenation relies on base ending in "/".
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    else
        # NOTE(review): plain concatenation only yields a valid URL when `url` ends in
        # "/"; "../" and "./" segments are not resolved — confirm this is acceptable.
        abshref = url * href
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end

"""
    visit(data::CrawlData, url, base; forcehttps=false)

Request `url` once (redirects are not followed, HTTP error statuses do not throw) and
record the outcome in `data`.

- Status 404: `url` is pushed to `data.urls404`.
- Redirect status: `url` is pushed to `data.urls3xx` and the `Location` header value,
  if present, is passed to `handlehref`.
- Status 200: `url` is pushed to `data.urlsvalid` and every `href="..."` attribute
  value in the response body is passed to `handlehref`.
- Any other status: `url` is pushed to `data.urlsfail`.

In all cases `url` is finally pushed to `data.urlsvisited`.
"""
function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(data.urls3xx, url)

        # Header names are case-insensitive (servers may send "location"), so use the
        # HTTP.jl lookup instead of comparing the raw name with "Location".
        location = HTTP.header(res, "Location", "")
        if !isempty(location)
            @debug "Identified redirect $url to $location"
            handlehref(data, base, url, location; forcehttps=forcehttps)
        end
    elseif res.status == 200
        push!(data.urlsvalid, url)

        # Scan for new URLs on this page
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(data, base, url, href; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(data.urlsfail, url)
    end
    push!(data.urlsvisited, url)
end

"""
    write_urls(filename, urls)

Open `filename` for writing (truncating existing content) and print each element of
`urls` on its own line.
"""
function write_urls(filename, urls)
    open(filename, "w") do io
        foreach(entry -> println(io, entry), urls)
    end
end

"""
    basechop(base)

Return `base` without its leading `http://` or `https://`.

If `base` starts with neither prefix, an error is logged that names the offending
value and `nothing` is returned.
"""
function basechop(base)
    for prefix in ("http://", "https://")
        if startswith(base, prefix)
            return chop(base; head=length(prefix), tail=0)
        end
    end
    # Include the value so the log entry is actionable.
    @error "Unexpected prefix in base URL $base"
    return nothing
end

Jan Klass's avatar
Jan Klass committed
90
"""
    crawl_and_generate(base; forcehttps=false, limitnew=1000)

Crawl the site rooted at `base` and persist the collected URL sets.

`base` must be an `http://` or `https://` URL with a slash after the host part;
otherwise an error is logged and the process exits with status 1. With
`forcehttps=true` an `http://` base is rewritten to `https://`, and the flag is passed
on to `visit`.

Crawl state is loaded with `read_crawldata()` and stored with `write_crawldata(data)`.
URLs starting with one of the hard-coded ignored prefixes are marked as visited and
ignored without being requested. At most `limitnew` URLs are requested per call.
"""
function crawl_and_generate(base; forcehttps=false, limitnew=1000)
    # `(?<protocol>...)` is the named-group syntax; both http and https must match.
    m = match(r"^(?<protocol>https?)\:\/\/[^\/]+(?<trailslash>\/)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end

    urlsignoredprefixes = [
        "https://kcode.de/wordpress/wp-json/",
        "https://kcode.de/wordpress/tag/",
    ]

    data = read_crawldata()

    @info stringlengths(data)

    if forcehttps && startswith(base, "http://")
        base = "https://" * chop(base; head=length("http://"), tail=0)
    end

    push!(data.urls, base)
    visited = 0
    while length(data.urlsvisited) != length(data.urls)
        # Stop once `limitnew` pages have been requested.
        if visited >= limitnew
            break
        end

        @info "Intermediate count: $(stringlengths(data)) …"

        # Iterate over a snapshot: `visit` pushes newly found URLs into `data.urls`,
        # and those are picked up by the next pass of the enclosing while loop.
        for url in collect(data.urls)
            if visited >= limitnew
                break
            end
            # Ignored URLs count as visited but are never requested.
            if any(pref -> startswith(url, pref), urlsignoredprefixes)
                push!(data.urlsvisited, url)
                push!(data.urlsignored, url)
                continue
            end
            if !(url in data.urlsvisited)
                visited = visited + 1
                visit(data, url, base; forcehttps=forcehttps)
            end
        end
    end

    @info "Identified $(stringlengths(data))"

    write_crawldata(data)
end