Commit 5066e6ec authored by Mehmet Kozan

minor bugs fixed.

parent da0ff4e4
Pipeline #15719175 passed with stages in 3 minutes and 32 seconds
......@@ -3,16 +3,6 @@ const psl = require('psl');
const cleanUrl = require('url-clean');
const cheerio = require('cheerio');
const normalize_options = {
removeDirectoryIndex: false,
removeTrailingSlash: false,
stripWWW: false,
stripFragment: false,
normalizeHttps: false,
normalizeProtocol: false,
removeQueryParameters: [/^utm_\w+/i, 'ref']
}
const result_normalize_options = {
removeDirectoryIndex: true,
removeTrailingSlash: true,
......@@ -23,19 +13,14 @@ const result_normalize_options = {
removeQueryParameters: [/^utm_\w+/i, 'ref']
}
function _has_illegal_chars(str) {
if (/[^a-z0-9\:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=\.\-\_\~\%]/i.test(str)) {
return true;
}
return false;
return /[^a-z0-9\:\/\?\#\[\]\@\!\$\&\'\(\)\*\+\,\;\=\.\-\_\~\%]/i.test(str);
}
function parse(currentUrlStr, baseUrlStr) {
let ret = {
url: null,
baseurl: null,
normalized: null,
protocol: null,
host: null,
domain: null,
......@@ -48,40 +33,39 @@ function parse(currentUrlStr, baseUrlStr) {
if (currentUrlStr && _has_illegal_chars(currentUrlStr)) return null;
if (baseUrlStr && _has_illegal_chars(baseUrlStr)) return null;
let currentNormUrlStr = cleanUrl(currentUrlStr, normalize_options);
let normalizedBaseUrl = null;
if (!currentNormUrlStr) {
normalizedBaseUrl = cleanUrl(baseUrlStr, normalize_options);
if (normalizedBaseUrl) {
currentUrlStr = normalizedBaseUrl;
currentNormUrlStr = normalizedBaseUrl;
normalizedBaseUrl = null;
baseUrlStr = null;
} else {
return null;
currentUrlStr = currentUrlStr.replace(/^\/\//, 'http://');
currentUrlStr = currentUrlStr.replace(/#.*$/, '');
if(baseUrlStr) {
baseUrlStr = baseUrlStr.replace(/^\/\//, 'http://');
baseUrlStr = baseUrlStr.replace(/#.*$/, '');
}
else {
if ( ! /^\.*\/|^(?!localhost)\w+:/.test(currentUrlStr)){
currentUrlStr = currentUrlStr.replace(/^(?!(?:\w+:)?\/\/)/, 'http://');
}
}
let parsedUrl = URL.parse(currentNormUrlStr, true, true);
let parsedUrl = URL.parse(currentUrlStr, true, true);
delete parsedUrl.hash ;
if (parsedUrl.protocol && parsedUrl.protocol != 'http:' && parsedUrl.protocol != 'https:') return null;
//current url is relative like "abc", "/abc" or "../abc"
if (parsedUrl.host == null && baseUrlStr) {
normalizedBaseUrl = cleanUrl(baseUrlStr, normalize_options);
ret.baseurl = normalizedBaseUrl;
let parsedBaseUrl = URL.parse(normalizedBaseUrl, normalize_options);
let parsedBaseUrl = URL.parse(baseUrlStr, true,true);
delete parsedUrl.hash;
ret.baseurl = URL.format(parsedBaseUrl);
let absoluteUrl = URL.parse(URL.resolve(parsedBaseUrl, parsedUrl));
currentUrlStr = URL.format(absoluteUrl);
}
ret.url = currentUrlStr;
ret.normalized = cleanUrl(currentUrlStr, result_normalize_options);
parsedUrl = URL.parse(ret.normalized, true, true);
parsedUrl = URL.parse(currentUrlStr, true, true);
delete parsedUrl.hash;
ret.url = URL.format(parsedUrl);
ret.protocol = parsedUrl.protocol;
ret.host = parsedUrl.host;
ret.path = parsedUrl.pathname;
......@@ -94,7 +78,6 @@ function parse(currentUrlStr, baseUrlStr) {
ret.search = parsedUrl.search;
ret.querycount = parsedUrl.search ? parsedUrl.search.split("=").length - 1 : 0;
//ret.type = normalizedBaseUrl ? gettype(ret.normalized,normalizedBaseUrl):"none";
return ret;
}
......@@ -107,7 +90,7 @@ function extract(data, sourceUrl) {
let embedBaseUrlStr = $('base').attr('href');
let embedBaseUrl = parse(embedBaseUrlStr);
baseUrl = embedBaseUrl ? embedBaseUrl : baseUrl;
let baseUrlStr = baseUrl ? baseUrl.normalized : null;
let baseUrlStr = baseUrl ? baseUrl.url : null;
$('a').each(function (i, el) {
let href = $(this).attr('href');
......@@ -115,19 +98,18 @@ function extract(data, sourceUrl) {
//href = href.replace(/;.*$/g,"");
if (typeof href == "undefined" || href.length < 3 || /^(javascript|mailto:|ftp:)/ig.test(href)) return;
//let currentUrl = embedBaseUrl == null ? parse(href,baseUrl.normalized) : parse(href,embedBaseUrl.normalized);
let currentUrl = parse(href, baseUrlStr);
if (currentUrl && currentUrl.normalized) {
if (urlMap.has(currentUrl.normalized)) {
let tmpUrl = urlMap.get(currentUrl.normalized);
if (currentUrl && currentUrl.url) {
if (urlMap.has(currentUrl.url)) {
let tmpUrl = urlMap.get(currentUrl.url);
if (!tmpUrl.text.includes(text)) {
tmpUrl.text = `${tmpUrl.text} ${text}`;
}
} else {
currentUrl.text = text == null ? "" : text;
currentUrl.baseurl = baseUrlStr;
urlMap.set(currentUrl.normalized, currentUrl);
urlMap.set(currentUrl.url, currentUrl);
}
}
});
......@@ -143,7 +125,7 @@ function extract(data, sourceUrl) {
retArr = retArr.map(function (el) {
return {
url: el.normalized,
url: el.url,
text: el.text,
type: el.type
}
......@@ -163,6 +145,8 @@ function gettype(linkurl, pageurl) {
let linkurl_path = linkurl.path ? linkurl.path : "";
let pageurl_path = pageurl.path ? pageurl.path : "";
linkurl_path = linkurl_path.replace(/\/index\.[a-z]+$/,'/').replace(/\/default\.[a-z]+$/,'/');
pageurl_path = pageurl_path.replace(/\/index\.[a-z]+$/,'/').replace(/\/default\.[a-z]+$/,'/');
let linkurl_parts = linkurl_path.split("/").filter(function (elem, index, array) {
return elem.length > 0
......
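For clarity, a small illustrative sketch of what the two added replace() calls in gettype do (the example paths below are hypothetical, not taken from the diff): directory-index file names such as index.html or default.asp are collapsed to their directory before the path is split into parts, so such links are no longer counted as one level deeper than the directory itself.

// Illustrative only: effect of the added directory-index normalization.
let link_path = "/aaa/bbb/index.html";
let page_path = "/aaa/bbb/";
link_path = link_path.replace(/\/index\.[a-z]+$/,'/').replace(/\/default\.[a-z]+$/,'/');
page_path = page_path.replace(/\/index\.[a-z]+$/,'/').replace(/\/default\.[a-z]+$/,'/');
// link_path === "/aaa/bbb/" and page_path === "/aaa/bbb/": both now split into
// the same path parts, whereas before this change the link would have carried
// one extra part ("index.html") and compared as one level deeper.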
{
"name": "crawler-url-parser",
"version": "2.0.2",
"version": "2.0.4",
"description": "An `URL` parser for crawling purpose.",
"main": "crawler-url-parser.js",
"keywords": [
......
......@@ -8,6 +8,6 @@
<a href="ddd">test-link-2</a><br />
<a href="./ddd">test-link-3</a><br />
<a href="../ddd">test-link-3</a><br />
<a href="google.com">test-link-3</a><br />
<a href="google.com">link without protocol</a><br />
</body>
</html>
\ No newline at end of file
This diff is collapsed.
......@@ -2,6 +2,29 @@ const assert = require('assert');
const cup = require("../");
describe('gettype url as samelevel, sublevel, uplevel', function() {
it('should gettype uplevel urls', function() {
let res = cup.gettype("//sub.domain.com/aaa/bbb/","//sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"uplevel");
});
it('should gettype sublevel urls', function() {
let res = cup.gettype("//sub.domain.com/aaa/bbb/ccc/ddd","//sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"sublevel");
});
it('should gettype samelevel urls', function() {
let res = cup.gettype("//sub.domain.com/aaa/bbb/eee","//sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"samelevel");;
});
it('should gettype external urls', function() {
let res = cup.gettype("//sub.domain.com/aaa/bbb/eee","//sub.anotherdomain.com/aaa/bbb/ccc");
assert.equal(res,"external");
});
});
describe('gettype url without protocol as samelevel, sublevel, uplevel', function() {
it('should gettype uplevel urls', function() {
let res = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"uplevel");
......
......@@ -10,7 +10,7 @@ describe('05 static html', function() {
let htmlPath = path.resolve(__dirname,'05_tubitak.html');
let htmlString = fs.readFileSync(htmlPath,'utf-8');
let result = cup.extract(htmlString,"http://journals.tubitak.gov.tr/");
//todo ///
let suplevelArr = result.filter((el, index, arr) => el.type == "sublevel");
let uplevelArr = result.filter((el, index, arr) => el.type == "uplevel");
let samelevelArr = result.filter((el, index, arr) => el.type == "samelevel");
......@@ -18,11 +18,11 @@ describe('05 static html', function() {
let subdomainArr = result.filter((el, index, arr) => el.type == "subdomain");
let updomainArr = result.filter((el, index, arr) => el.type == "updomain");
let externalArr = result.filter((el, index, arr) => el.type == "external");
assert.equal(result.length,33);
assert.equal(suplevelArr.length+uplevelArr.length+samelevelArr.length+internalArr.length+subdomainArr.length+updomainArr.length+externalArr.length,33);
assert.equal(result.length,34);
assert.equal(suplevelArr.length+uplevelArr.length+samelevelArr.length+internalArr.length+subdomainArr.length+updomainArr.length+externalArr.length,34);
assert.equal(suplevelArr.length,12);
assert.equal(uplevelArr.length,0);
assert.equal(samelevelArr.length,0);
assert.equal(samelevelArr.length,1);
assert.equal(internalArr.length,13);
assert.equal(subdomainArr.length,2);
assert.equal(updomainArr.length,1);
......
......@@ -8,7 +8,6 @@ describe('07 readme parse test', function() {
assert.equal(result.baseurl,null);
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"question.stackoverflow.com");
assert.equal(result.normalized,"http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
......@@ -21,13 +20,12 @@ describe('07 readme parse test', function() {
let result = cup.parse("http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.baseurl,null);
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"question.stackoverflow.com");
assert.equal(result.normalized,"http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.host,"www.question.stackoverflow.com");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
assert.equal(result.search,"?q1=query1&q2=query2");
assert.equal(result.subdomain,"question");
assert.equal(result.subdomain,"www.question");
assert.equal(result.url,"http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
});
......@@ -35,13 +33,12 @@ describe('07 readme parse test', function() {
let result = cup.parse("http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2/");
assert.equal(result.baseurl,null);
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"question.stackoverflow.com");
assert.equal(result.normalized,"http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.host,"www.question.stackoverflow.com");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
assert.equal(result.search,"?q1=query1&q2=query2");
assert.equal(result.subdomain,"question");
assert.equal(result.search,"?q1=query1&q2=query2/");
assert.equal(result.subdomain,"www.question");
assert.equal(result.url,"http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2/");
});
......@@ -49,13 +46,12 @@ describe('07 readme parse test', function() {
let result = cup.parse("https://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2/");
assert.equal(result.baseurl,null);
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"question.stackoverflow.com");
assert.equal(result.normalized,"https://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.host,"www.question.stackoverflow.com");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"https:");
assert.equal(result.querycount,2);
assert.equal(result.search,"?q1=query1&q2=query2");
assert.equal(result.subdomain,"question");
assert.equal(result.search,"?q1=query1&q2=query2/");
assert.equal(result.subdomain,"www.question");
assert.equal(result.url,"https://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2/");
});
});
......@@ -68,7 +64,6 @@ describe('07 readme parse with baseURL test', function() {
assert.equal(result.baseurl,"http://question.stackoverflow.com/aaa/bbb/ccc/");
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"question.stackoverflow.com");
assert.equal(result.normalized,"http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
......@@ -81,13 +76,12 @@ describe('07 readme parse with baseURL test', function() {
let result = cup.parse("../ddd?q1=query1&q2=query2","http://www.question.stackoverflow.com/aaa/bbb/ccc/");
assert.equal(result.baseurl,"http://www.question.stackoverflow.com/aaa/bbb/ccc/");
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"question.stackoverflow.com");
assert.equal(result.normalized,"http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.host,"www.question.stackoverflow.com");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
assert.equal(result.search,"?q1=query1&q2=query2");
assert.equal(result.subdomain,"question");
assert.equal(result.subdomain,"www.question");
assert.equal(result.url,"http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
});
......@@ -95,13 +89,12 @@ describe('07 readme parse with baseURL test', function() {
let result = cup.parse("../ddd?q1=query1&q2=query2","http://www.stackoverflow.com/aaa/bbb/ccc/");
assert.equal(result.baseurl,"http://www.stackoverflow.com/aaa/bbb/ccc/");
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"stackoverflow.com");
assert.equal(result.normalized,"http://stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
assert.equal(result.host,"www.stackoverflow.com");
assert.equal(result.path,"/aaa/bbb/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
assert.equal(result.search,"?q1=query1&q2=query2");
assert.equal(result.subdomain,null);
assert.equal(result.subdomain,"www");
assert.equal(result.url,"http://www.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
});
......@@ -109,13 +102,12 @@ describe('07 readme parse with baseURL test', function() {
let result = cup.parse("../ddd?q1=query1&q2=query2","http://www.stackoverflow.com/aaa/bbb/ccc");
assert.equal(result.baseurl,"http://www.stackoverflow.com/aaa/bbb/ccc");
assert.equal(result.domain,"stackoverflow.com");
assert.equal(result.host,"stackoverflow.com");
assert.equal(result.normalized,"http://stackoverflow.com/aaa/ddd?q1=query1&q2=query2");
assert.equal(result.host,"www.stackoverflow.com");
assert.equal(result.path,"/aaa/ddd");
assert.equal(result.protocol,"http:");
assert.equal(result.querycount,2);
assert.equal(result.search,"?q1=query1&q2=query2");
assert.equal(result.subdomain,null);
assert.equal(result.subdomain,"www");
assert.equal(result.url,"http://www.stackoverflow.com/aaa/ddd?q1=query1&q2=query2");
});
......
......@@ -9,7 +9,7 @@ describe('crawler test 01', function () {
it(`should pass for ${url}`, function () {
let res = cup.parse("#start-of-content", url);
assert.equal(res.normalized, "https://github.com/Microsoft");
assert.equal(res.url, "https://github.com/Microsoft");
});
......
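For reference, a minimal usage sketch consistent with the updated test assertions above (assuming the package is installed under its published name, crawler-url-parser): the www prefix is no longer stripped from the host or subdomain, and the cleaned value is returned directly in url now that the separate normalized field has been removed.

const cup = require("crawler-url-parser");

let result = cup.parse("http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
// result.host      === "www.question.stackoverflow.com"
// result.subdomain === "www.question"
// result.domain    === "stackoverflow.com"
// result.url       === "http://www.question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2"
// (there is no result.normalized field anymore)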