crawl.jl 4.06 KB
Newer Older
Jan Klass's avatar
Fix  
Jan Klass committed
1 2 3
"""
    handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)

Classify the absolute link `href`, whose scheme is `href_protocol`.

- With `forcehttps=true`, an `"http"` link is rewritten to `"https"` first.
- `http`/`https` links that start with `base` are pushed to `data.urls`.
- Other `http`/`https` links are pushed to `data.urlsexternal`.
- Links with any other scheme are only logged at debug level.

`url` is accepted but not used by this function.
"""
function handlehref_abs(data::CrawlData, base, url, href, href_protocol; forcehttps=false)
    # Optionally upgrade plain-HTTP links before classifying them.
    if href_protocol == "http" && forcehttps
        rest = chop(href; head=length(href_protocol), tail=0)
        href_protocol = "https"
        href = "https" * rest
    end

    # Guard: anything that is not HTTP(S) is not collected at all.
    if !(href_protocol == "http" || href_protocol == "https")
        @debug "Ignoring URL $href with ignored procotol $href_protocol"
        return nothing
    end

    # Guard: links outside of the base URL are tracked separately.
    if !startswith(href, base)
        @debug "External URL: $href"
        return push!(data.urlsexternal, href)
    end

    # Index urls that start with the base URL
    push!(data.urls, href)
    @debug "Match (absolute URL): $href"
end

22 23 24
"""
    handlehref(data::CrawlData, base, url, href; forcehttps=false)

Record the link `href`, found on the page `url`, in `data`.

The fragment (`#...`) is stripped first. Then:

- links with a `scheme://` prefix are delegated to `handlehref_abs`;
- links starting with `/` are appended to `base` (which is expected to end in `/`)
  and pushed to `data.urls`;
- scheme-only URIs without `//` (e.g. `mailto:`, `javascript:`, `tel:`) are ignored,
  because they cannot be crawled;
- everything else is treated as a relative reference, resolved against the location
  of `url`, and pushed to `data.urls`.

Dot segments (`./`, `../`) in relative references are not normalized.
"""
function handlehref(data::CrawlData, base, url, href; forcehttps=false)
    # Remove URL fragment (#)
    href = match(r"^(?<path>[^\#]*)", href)[:path]

    # The whole pattern is optional, so `match` always succeeds; the capture is
    # `nothing` when the link has no `scheme://` prefix.
    href_protocol = match(r"^((?<protocol>[a-zA-Z0-9]+)?\:\/\/)?", href)[:protocol]
    if href_protocol !== nothing
        handlehref_abs(data, base, url, href, href_protocol; forcehttps=forcehttps)
    elseif startswith(href, "/")
        abshref = base * chop(href; head=1, tail=0)
        @debug "Match (absolute path): $abshref"
        push!(data.urls, abshref)
    elseif occursin(r"^[a-zA-Z][a-zA-Z0-9+.\-]*:", href)
        # A scheme without "//" (mailto:, javascript:, tel:, ...) is not a path
        # on this site; appending it to the page URL would yield a bogus URL.
        @debug "Ignoring URL $href without network location"
    else
        abshref = _resolve_relative(url, href)
        @debug "Match (relative path): $abshref"
        push!(data.urls, abshref)
    end
end

# Resolve the relative reference `href` (no scheme, no leading "/") against the
# page URL `url`.
function _resolve_relative(url, href)
    # An empty reference (e.g. a bare "#fragment" link) points at the page itself.
    isempty(href) && return url

    # The query string of the page never takes part in path resolution.
    page = first(split(url, '?'; limit=2))

    # A query-only reference replaces the query of the current page.
    startswith(href, "?") && return page * href

    # Otherwise the reference is relative to the directory part of the page path.
    # Slashes belonging to "://" must not be mistaken for a path separator.
    schemesep = findfirst("://", page)
    pathstart = schemesep === nothing ? 1 : last(schemesep) + 1
    slash = findlast(isequal('/'), page)
    if slash === nothing || slash < pathstart
        # No path component at all, e.g. "https://example.com".
        return page * "/" * href
    end
    return page[1:slash] * href
end

"""
    visit(data::CrawlData, url, base; forcehttps=false)

Request `url` once and record the outcome in `data`.

The request is made with `readtimeout=2`, does not follow redirects and does not
throw on error status codes. Depending on the response status:

- `404`: `url` is pushed to `data.urls404`.
- redirect (per `HTTP.isredirect`): `url` is pushed to `data.urls3xx` and the first
  `Location` header value is passed to `handlehref`.
- `200`: `url` is pushed to `data.urlsvalid` and every `href="..."` attribute value
  found in the body is passed to `handlehref`.
- anything else: `url` is pushed to `data.urlsfail`.

In all cases `url` is finally pushed to `data.urlsvisited`.
"""
function visit(data::CrawlData, url, base; forcehttps=false)
    @info "Visiting $url …"
    res = HTTP.get(url; readtimeout=2, redirect=false, status_exception=false)
    if res.status == 404
        @info "Ignoring HTTP 404 status code url $url"
        push!(data.urls404, url)
    elseif HTTP.isredirect(res) # res.status >= 300 && res.status < 400
        push!(data.urls3xx, url)

        for (name, value) in res.headers
            # Header field names are case-insensitive, so a server may send
            # "location" or "LOCATION" just as well as "Location".
            if lowercase(name) == "location"
                @debug "Identified redirect $url to $value"
                handlehref(data, base, url, value; forcehttps=forcehttps)
                # Only the first Location header is considered.
                break
            end
        end
    elseif res.status == 200
        push!(data.urlsvalid, url)

        # Scan for new URLs on this page (double-quoted href attributes only)
        body = String(res.body)
        for m in eachmatch(r"href=\"(?<url>[^\"]+)\"", body)
            href = m[:url]
            handlehref(data, base, url, href; forcehttps=forcehttps)
        end
    else
        @debug "For $url response status is $(res.status)"
        push!(data.urlsfail, url)
    end
    # Mark as visited regardless of the outcome so the URL is not requested again.
    push!(data.urlsvisited, url)
end

"""
    write_urls(filename, urls)

Write every element of `urls` to the file `filename`, one per line. The file is
opened in `"w"` mode, so existing content is replaced.
"""
function write_urls(filename, urls)
    open(filename, "w") do io
        foreach(entry -> println(io, entry), urls)
    end
end

"""
    basechop(base)

Return `base` without its leading `http://` or `https://`.

If `base` starts with neither prefix, an error is logged and `nothing` is returned.
"""
function basechop(base)
    for scheme in ("http://", "https://")
        if startswith(base, scheme)
            return chop(base; head=length(scheme), tail=0)
        end
    end
    @error "Unexpected prefix "
    return nothing
end

Jan Klass's avatar
Jan Klass committed
90
"""
    crawl_and_generate(base; forcehttps=false, limitnew=10)

Crawl the site rooted at `base` and hand the collected data to `write_crawldata`.

`base` must be an `http://` or `https://` URL with a `/` after the host part;
otherwise an error is logged and the process exits with status 1.

Crawl state is obtained from `read_crawldata()`. `base` is added to `data.urls`,
and URLs from `data.urls` that are not yet in `data.urlsvisited` are passed to
`visit` until none remain or `limitnew` URLs have been visited in this call.

# Keywords
- `forcehttps=false`: rewrite an `http://` base to `https://` and forward the flag
  to `visit`.
- `limitnew=10`: maximum number of URLs visited by this call.
"""
function crawl_and_generate(base; forcehttps=false, limitnew=10)
    # `[^/]+` also covers an optional ":port"; only the scheme and the slash
    # following the host part are validated here.
    m = match(r"^(?<protocol>https?)://[^/]+(?<trailslash>/)?", base)
    if m === nothing
        @error "Failed to parse passed URL"
        exit(1)
    elseif m[:trailslash] === nothing
        @error "Missing trailing slash"
        exit(1)
    end

    data = read_crawldata()

    @info stringlengths(data)

    if forcehttps && startswith(base, "http://")
        base = "https://" * chop(base; head=length("http://"), tail=0)
    end

    push!(data.urls, base)
    visited = 0
    while visited < limitnew
        # Work on a snapshot: `visit` adds to `data.urls`, and mutating a
        # collection while iterating over it is not safe. Newly found URLs are
        # picked up by the next pass.
        pending = [url for url in data.urls if !(url in data.urlsvisited)]
        # Stop when nothing is left to visit; this also guarantees termination.
        isempty(pending) && break

        @info "Intermediate count: $(stringlengths(data)) …"

        for url in pending
            visited < limitnew || break
            # The snapshot may contain the same URL more than once.
            url in data.urlsvisited && continue
            visited += 1
            visit(data, url, base; forcehttps=forcehttps)
        end
    end

    @info "Identified $(stringlengths(data))"

    write_crawldata(data)
end