Commit c2df0fa4 authored by Giantzoc's avatar Giantzoc
Browse files

Updated program to use prisma

parent b712d6bf
var request = require('request');
var URL = require('url-parse');
var Promise = require("bluebird");
const request = Promise.promisifyAll(require('request'));
const URL = require('url-parse');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
var fs = require('fs');
const fs = Promise.promisifyAll(require('fs'));
const mkdirp = require('mkdirp-promise');
const { prisma } = require('./prisma-new/generated/prisma-client')
const async = require('async');
var START_URL = "http://legacy.aonprd.com/";
var MAX_PAGES_TO_VISIT = 10000;
const START_URL = "http://legacy.aonprd.com/";
const MAX_PAGES_TO_VISIT = 10000;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
var mkdirp = require('mkdirp');
var timer = 1000;
pagesToVisit.push(baseUrl);
crawl();
// Make the request
//console.log("First page " + url);
// request(START_URL, function (error, response, body) {
// // Check status code (200 is HTTP OK)
// console.log("Status code: " + response.statusCode);
// if (response.statusCode !== 200) {
// console.log("Unable to load first page");
// return;
// }
// var dom = new JSDOM(body);
// // Only parse links on the first page
// const navMenu = dom.window.document.getElementById('nav-menu');
// dom = new JSDOM(navMenu.innerHTML);
// const links = dom.window.document.querySelectorAll('a');
// links.forEach(link => {
// const url = link.attributes.getNamedItem('href').textContent;
// pagesToVisit.push(baseUrl + '/' + url);
// });
// crawl();
// });
function sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
const url = new URL(START_URL);
const baseUrl = url.protocol + "//" + url.hostname;
const timer = 1000;
const baseDir = "/home/steven/Source/node scraper/Pathfinder/";
main();
//pagesToVisit.push(baseUrl);
/**
 * Entry point: seeds the link table with the start page, then runs the crawl.
 *
 * The seed has to be START_URL (which ends in a slash). createLink() only
 * stores URLs that include START_URL, and baseUrl is assembled from
 * protocol + hostname without a trailing slash, so seeding with baseUrl was
 * silently rejected and an empty database never got a first page to visit.
 *
 * @returns {Promise<void>} Resolves once crawl() has finished.
 */
async function main() {
    await createLink(START_URL);
    await crawl();
}
/**
 * Crawl loop: repeatedly takes one unvisited link from Prisma, fetches it and,
 * when the response status is 200, stores the links found in the body and
 * saves the body to disk.
 *
 * Stops when numPagesVisited reaches MAX_PAGES_TO_VISIT, when Prisma reports
 * no unvisited links, or when any step throws (the error message is logged).
 *
 * @returns {Promise<void>}
 */
async function crawl() {
    try {
        // A loop instead of `await crawl()` recursion: one pending call per
        // visited page would otherwise stay alive until the very last page.
        while (numPagesVisited < MAX_PAGES_TO_VISIT) {
            const linksToVisit = await prisma.links({ where: { visited: false } });
            if (linksToVisit.length === 0) {
                console.log("No more pages to visit");
                return;
            }

            const nextLink = linksToVisit.pop();
            const response = await visitPage(nextLink);
            if (response != null && response.statusCode === 200) {
                await parseLinks(response.body, nextLink.url);
                await savePage(nextLink, response.body);
            }
            // Throttling is currently switched off; add `await sleep(timer);`
            // here to slow the crawl down.
        }
        console.log("Reached max limit of number of pages to visit.");
    }
    catch (e) {
        console.log(e.message);
    }
}
/**
 * Fetches one link and records the outcome on its Prisma record.
 *
 * Increments numPagesVisited, requests link.url, then marks the link as
 * visited with the response status. If anything in that sequence throws, the
 * link is still marked visited (status "400" plus the error message) so it
 * leaves the unvisited queue.
 *
 * @param {{id: *, url: string}} link Link record to fetch.
 * @returns {Promise<?Object>} The response from request.getAsync, or null
 *     when the fetch (or the status update) failed.
 */
async function visitPage(link) {
    try {
        numPagesVisited++;
        console.log("Visiting page " + link.url);
        const response = await request.getAsync(link.url);
        console.log("Status code: " + response.statusCode);
        await prisma.updateLink({
            data: {
                visited: true,
                // The body is not stored on the record (256 KB limit).
                status: String(response.statusCode),
                error: response.error
            },
            where: {
                id: link.id
            }
        });
        return response;
    }
    catch (e) {
        console.log("Visit Page Error: " + e.message);
        // Mark the link as visited anyway so it is not picked up again.
        await prisma.updateLink({
            data: {
                visited: true,
                status: (400).toString(),
                error: e.message
            },
            where: {
                id: link.id
            }
        });
        return null;
    }
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
/**
 * Pauses the caller: the returned promise resolves (with no value) once a
 * timer of `ms` milliseconds has fired.
 *
 * @param {number} ms Delay handed to setTimeout.
 * @returns {Promise<void>}
 */
async function sleep(ms) {
    const timerElapsed = new Promise((done) => setTimeout(done, ms));
    return timerElapsed;
}
numPagesVisited++;
/**
 * Writes a fetched page under baseDir, mirroring the URL path, and then flags
 * the link as saved in Prisma.
 *
 * The steps run strictly in order (create directory, write file, update the
 * record). The earlier chain handed already-started promises to `.then()`,
 * so the write raced the directory creation and the link was flagged as
 * saved even when the write failed.
 *
 * @param {{id: *, url: string}} link Link record the body belongs to.
 * @param {string} body Page content to write.
 * @returns {Promise<void>} Never rejects; failures are logged.
 */
async function savePage(link, body) {
    try {
        const relativeUrl = link.url.replace(START_URL, '');
        let filePath = baseDir + relativeUrl;
        // A URL ending in "/" (including the site root) names a directory;
        // store its content as index.html inside that directory.
        if (filePath.endsWith('/')) {
            filePath += 'index.html';
        }
        // Cut at the last slash; replacing the file name by text match could
        // hit an identical segment earlier in the path.
        const folderPath = filePath.slice(0, filePath.lastIndexOf('/') + 1);

        await mkdirp(folderPath);
        await fs.writeFileAsync(filePath, body);
        await prisma.updateLink({
            data: {
                saved: true
            },
            where: {
                id: link.id
            }
        });
        console.log("The file was saved!");
    }
    catch (e) {
        console.log("Save Page Error: " + e.message);
    }
}
function savePage(url, html, callback){
var relativeUrl = url.replace('http://legacy.aonprd.com/', '');
const baseDir = "/home/steven/Source/node scraper/Pathfinder/";
var folderPath = baseDir + relativeUrl;
const folders = folderPath.split('/');
const filename = folders.pop();
mkdirp(folderPath.replace(filename, ''), function (err) {
// path exists unless there was an error
if (err) console.error(err)
else console.log('pow!')
});
fs.writeFile(folderPath, html, function (err) {
if (err) {
return console.log(err);
}
/**
 * Parses a page and hands every anchor element to saveLink(), one at a time.
 *
 * @param {string} html Page markup to parse with JSDOM.
 * @param {string} url Address the markup was fetched from; passed through to
 *     saveLink() together with each anchor.
 * @returns {Promise<void>} Never rejects; failures are logged.
 */
async function parseLinks(html, url) {
    let dom = null;
    try {
        dom = new JSDOM(html);
        const links = Array.from(dom.window.document.querySelectorAll('a'));
        // Sequential on purpose: each saveLink() awaits its database work
        // before the next anchor is processed.
        for (const link of links) {
            await saveLink(link, url);
        }
    }
    catch (e) {
        console.log("Parse Links Error: " + e.message);
    }
    finally {
        // Release the window so a long crawl does not keep every parsed
        // document alive.
        if (dom !== null) {
            dom.window.close();
        }
    }
}
// Parse links from the page and add them to the collection
var dom = new JSDOM(html);
const links = dom.window.document.querySelectorAll('a');
links.forEach(link => {
if (link.attributes.getNamedItem('href') != null){
/**
 * Resolves one anchor's href against the page it was found on and passes the
 * resulting absolute URL (fragment removed) to createLink().
 *
 * Absolute and relative hrefs are handled the same way: resolving against the
 * page URL leaves an absolute href unchanged, and createLink() is given the
 * result in both cases.
 *
 * @param {Object} link Anchor element; its `href` attribute is read through
 *     `attributes.getNamedItem`.
 * @param {string} url Address of the page containing the anchor.
 * @returns {Promise<void>} Never rejects; failures are logged.
 */
async function saveLink(link, url) {
    try {
        const hrefAttribute = link.attributes.getNamedItem('href');
        if (hrefAttribute == null) {
            return;
        }
        const href = hrefAttribute.textContent;
        if (typeof href !== 'string') {
            return;
        }
        // TODO: better handling of # links
        const linkUrl = new URL(href, url, true);
        linkUrl.hash = ""; // drop the fragment so page#a and page#b are one link
        await createLink(linkUrl.toString());
    }
    catch (e) {
        console.log("Save Link Error: " + e.message);
    }
}
function RemoveLastDirectoryPartOf(url) {
var folders = url.split('/');
folders.pop();
return (folders.join('/'));
/**
 * Tells whether a URL string is absolute, i.e. starts with `scheme://` or is
 * protocol-relative (`//host/...`).
 *
 * Only the start of the string is examined. Looking for "://" or "." anywhere
 * in the string misreads relative links that carry a URL in their query
 * (`search?next=http://...`) and absolute links to dot-less hosts
 * (`http://localhost/`).
 *
 * @param {string} url Candidate URL; surrounding whitespace is ignored.
 * @returns {boolean} true for absolute or protocol-relative URLs, false for
 *     everything else (including non-string input).
 */
function isUrlAbsolute(url) {
    if (typeof url !== 'string') {
        return false;
    }
    // Optional scheme (letter, then letters/digits/+/./-) and ":", then "//".
    return /^(?:[a-z][a-z\d+.-]*:)?\/\//i.test(url.trim());
}
/**
 * Stores a URL as a new link record unless it is off-site or already stored.
 *
 * @param {string} url Absolute URL to store.
 * @returns {Promise<?Object>} The record returned by prisma.createLink, or
 *     null when the URL is off-site, already stored, or an error occurred
 *     (errors are logged).
 */
async function createLink(url) {
    try {
        // Stay on the same site. This must be a prefix test: a substring test
        // also accepts foreign URLs that merely mention START_URL, e.g. in a
        // query string.
        if (!url.startsWith(START_URL)) {
            return null;
        }

        const existing = await prisma.link({ url: url });
        if (existing != null) {
            return null;
        }

        const newLink = await prisma.createLink({
            url: url
        });
        console.log("New Link: " + newLink.id);
        return newLink;
    }
    catch (e) {
        console.log("Create Link Error: " + e.message);
        return null;
    }
}
\ No newline at end of file
This diff is collapsed.
......@@ -6,9 +6,14 @@
"author": "GiantZOC",
"license": "ISC",
"dependencies": {
"async": "^2.6.1",
"bluebird": "^3.5.2",
"cheerio": "^0.22.0",
"graphql-yoga": "^1.16.7",
"jsdom": "^12.0.0",
"mkdirp": "^0.5.1",
"mkdirp-promise": "^5.0.1",
"prisma-client-lib": "^1.19.1",
"request": "^2.65.0",
"url-parse": "^1.0.5"
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment