Commit 1ef34b75 authored by Giantzoc
Browse files

updated crawler to visit all pages in the current domain instead of using the...

updated crawler to visit all pages in the current domain instead of using the menu to navigate the site
parent 5f857570
......@@ -4,8 +4,8 @@ const jsdom = require('jsdom');
const { JSDOM } = jsdom;
var fs = require('fs');
var START_URL = "http://legacy.aonprd.com";
var MAX_PAGES_TO_VISIT = 1000;
var START_URL = "http://legacy.aonprd.com/";
var MAX_PAGES_TO_VISIT = 10000;
var pagesVisited = {};
var numPagesVisited = 0;
......@@ -13,43 +13,59 @@ var pagesToVisit = [];
// Parse the start URL so its components can be reused below.
var url = new URL(START_URL);
// Protocol plus hostname of the start URL (no port, no trailing slash).
var baseUrl = url.protocol + "//" + url.hostname;
var mkdirp = require('mkdirp');
// Delay in milliseconds that crawl() waits (via sleep) before visiting each new page.
var timer = 1000;
// Make the request
console.log("First page " + url);
request(START_URL, function (error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if (response.statusCode !== 200) {
console.log("Unable to load first page");
return;
}
var dom = new JSDOM(body);
// Only parse links on the first page
const navMenu = dom.window.document.getElementById('nav-menu');
dom = new JSDOM(navMenu.innerHTML);
const links = dom.window.document.querySelectorAll('a');
links.forEach(link => {
const url = link.attributes.getNamedItem('href').textContent;
pagesToVisit.push(baseUrl + '/' + url);
});
crawl();
});
// Seed the queue with the site's base URL, then start the crawl loop.
pagesToVisit.push(baseUrl);
crawl();
// Make the request
//console.log("First page " + url);
// request(START_URL, function (error, response, body) {
// // Check status code (200 is HTTP OK)
// console.log("Status code: " + response.statusCode);
// if (response.statusCode !== 200) {
// console.log("Unable to load first page");
// return;
// }
// var dom = new JSDOM(body);
// // Only parse links on the first page
// const navMenu = dom.window.document.getElementById('nav-menu');
// dom = new JSDOM(navMenu.innerHTML);
// const links = dom.window.document.querySelectorAll('a');
// links.forEach(link => {
// const url = link.attributes.getNamedItem('href').textContent;
// pagesToVisit.push(baseUrl + '/' + url);
// });
// crawl();
// });
/**
 * Pause for a given amount of time.
 *
 * @param {number} ms - Delay in milliseconds handed to setTimeout.
 * @returns {Promise<undefined>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => setTimeout(done, ms));
}
function crawl() {
async function crawl() {
if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
if(pagesToVisit.length === 0){
console.log("No more pages to visit");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
//slow things down
await sleep(timer);
// New page we haven't visited
visitPage(nextPage, savePage);
}
......@@ -67,23 +83,15 @@ function visitPage(url, callback) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if (response.statusCode !== 200) {
callback();
crawl();
return;
}
var html = body;
callback(url, html, crawl);
savePage(url, html, crawl);
});
}
/**
 * Create a single directory, treating "already exists" as success.
 *
 * @param {string} dirPath - Directory to create (parents are not created).
 * @throws Any error from fs.mkdirSync whose code is not 'EEXIST'.
 */
const mkdirSync = function (dirPath) {
  try {
    fs.mkdirSync(dirPath);
  } catch (failure) {
    // An existing directory is fine; every other failure is re-raised unchanged.
    const alreadyThere = failure.code === 'EEXIST';
    if (!alreadyThere) {
      throw failure;
    }
  }
};
function savePage(url, html, callback){
var relativeUrl = url.replace('http://legacy.aonprd.com/', '');
const baseDir = "/home/steven/Source/node scraper/Pathfinder/";
......@@ -104,5 +112,30 @@ function savePage(url, html, callback){
console.log("The file was saved!");
});
// Parse links from the page and add them to the collection
var dom = new JSDOM(html);
const links = dom.window.document.querySelectorAll('a');
links.forEach(link => {
if (link.attributes.getNamedItem('href') != null){
const href = link.attributes.getNamedItem('href').textContent;
//check for absolute link
if (typeof (href) != "undefined" && href.indexOf('http') === -1 && href.indexOf('#') === -1){
//build complete link from relative link
var linkUrl = new URL(href, url, true);
//don't add duplicates to the list
if(pagesToVisit.indexOf(linkUrl) === -1){
pagesToVisit.push(linkUrl.toString());
}
}
}
});
callback();
}
\ No newline at end of file
}
/**
 * Drop the final '/'-separated segment from a URL or path string.
 *
 * @param {string} url - Text to trim, e.g. "http://host/a/b.html".
 * @returns {string} Everything before the last '/', or "" when there is no '/'.
 */
function RemoveLastDirectoryPartOf(url) {
  const segments = url.split('/');
  // Keep every segment except the trailing one, then stitch them back together.
  return segments.slice(0, -1).join('/');
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment