Commit 202c5b25 authored by Mehmet Kozan's avatar Mehmet Kozan

clean-url dependency removed.

parent 5066e6ec
Pipeline #15962284 passed with stages
in 3 minutes and 1 second
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Launch Program",
"program": "${workspaceFolder}\\crawler-url-parser.js"
},
{
"type": "node",
"request": "launch",
"name": "Launch QuickStart",
"program": "${workspaceFolder}\\QUICKSTART.js"
},
{
"type": "node",
"request": "launch",
"name": "Launch Test",
"program": "${workspaceFolder}\\node_modules\\mocha\\bin\\_mocha",
"args": ["--recursive"],
"cwd": "${workspaceRoot}"
}
]
}
\ No newline at end of file
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Launch Program",
"program": "${workspaceFolder}\\crawler-url-parser.js"
},
{
"type": "node",
"request": "launch",
"name": "Launch QuickStart",
"program": "${workspaceFolder}\\quickstart.js"
},
{
"type": "node",
"request": "launch",
"name": "Launch Test",
"program": "${workspaceFolder}\\node_modules\\mocha\\bin\\_mocha",
"args": [
"--recursive"
],
"cwd": "${workspaceRoot}"
}
]
}
# 2.0.5
* clean-url dependency removed.
# 1.5.1
* files refactored.
......
# crawler-url-parser
> **A URL parser for crawling purposes**
![logo](https://assets.gitlab-static.net/uploads/-/system/project/avatar/4809017/crawler-url-parser.png)
**A URL parser for crawling purposes**
[![version](https://img.shields.io/npm/v/crawler-url-parser.svg)](https://www.npmjs.org/package/crawler-url-parser)
[![downloads](https://img.shields.io/npm/dt/crawler-url-parser.svg)](https://www.npmjs.org/package/crawler-url-parser)
......@@ -146,8 +145,26 @@ console.log(level); //external
```
## Test
`mocha` or `npm test`
> more than 200 unit test cases.
> check test folder and QUICKSTART.js for extra usage.
* `mocha` or `npm test`
* More than 200 unit test cases.
* Check [test folder](https://gitlab.com/autokent/crawler-url-parser/tree/master/test) and [quickstart.js](https://gitlab.com/autokent/crawler-url-parser/blob/master/quickstart.js) for extra usages.
## Support
I use this package actively myself, so it has my top priority. You can chat on WhatsApp about any questions, ideas, and suggestions.
[![WhatsApp](https://img.shields.io/badge/style-chat-green.svg?style=flat&label=whatsapp)](https://api.whatsapp.com/send?phone=905063042480&text=Hi%2C%0ALet%27s%20talk%20about%20crawler-url-parser)
### Submitting an Issue
If you find a bug or a mistake, you can help by submitting an issue to [GitLab Repository](https://gitlab.com/autokent/crawler-url-parser/issues)
### Creating a Merge Request
GitLab calls it a merge request instead of a pull request.
* [A Guide for First-Timers](https://about.gitlab.com/2016/06/16/fearless-contribution-a-guide-for-first-timers/)
* [How to create a merge request](https://docs.gitlab.com/ee/gitlab-basics/add-merge-request.html)
* Check [Contributing Guide](https://gitlab.com/autokent/crawler-url-parser/blob/master/CONTRIBUTING.md)
## License
[MIT licensed](https://gitlab.com/autokent/crawler-url-parser/blob/master/LICENSE), and all of its dependencies are MIT or BSD licensed.
\ No newline at end of file
const URL = require('url');
const psl = require('psl');
const cleanUrl = require('url-clean');
const cheerio = require('cheerio');
const result_normalize_options = {
......@@ -35,26 +34,25 @@ function parse(currentUrlStr, baseUrlStr) {
currentUrlStr = currentUrlStr.replace(/^\/\//, 'http://');
currentUrlStr = currentUrlStr.replace(/#.*$/, '');
if(baseUrlStr) {
if (baseUrlStr) {
baseUrlStr = baseUrlStr.replace(/^\/\//, 'http://');
baseUrlStr = baseUrlStr.replace(/#.*$/, '');
}
else {
if ( ! /^\.*\/|^(?!localhost)\w+:/.test(currentUrlStr)){
} else {
if (!/^\.*\/|^(?!localhost)\w+:/.test(currentUrlStr)) {
currentUrlStr = currentUrlStr.replace(/^(?!(?:\w+:)?\/\/)/, 'http://');
}
}
let parsedUrl = URL.parse(currentUrlStr, true, true);
delete parsedUrl.hash ;
delete parsedUrl.hash;
if (parsedUrl.protocol && parsedUrl.protocol != 'http:' && parsedUrl.protocol != 'https:') return null;
//current url is relative like "abc", "/abc" or "../abc"
if (parsedUrl.host == null && baseUrlStr) {
let parsedBaseUrl = URL.parse(baseUrlStr, true,true);
let parsedBaseUrl = URL.parse(baseUrlStr, true, true);
delete parsedUrl.hash;
ret.baseurl = URL.format(parsedBaseUrl);
......@@ -145,8 +143,8 @@ function gettype(linkurl, pageurl) {
let linkurl_path = linkurl.path ? linkurl.path : "";
let pageurl_path = pageurl.path ? pageurl.path : "";
linkurl_path = linkurl_path.replace(/\/index\.[a-z]+$/,'/').replace(/\/default\.[a-z]+$/,'/');
pageurl_path = pageurl_path.replace(/\/index\.[a-z]+$/,'/').replace(/\/default\.[a-z]+$/,'/');
linkurl_path = linkurl_path.replace(/\/index\.[a-z]+$/, '/').replace(/\/default\.[a-z]+$/, '/');
pageurl_path = pageurl_path.replace(/\/index\.[a-z]+$/, '/').replace(/\/default\.[a-z]+$/, '/');
let linkurl_parts = linkurl_path.split("/").filter(function (elem, index, array) {
return elem.length > 0
......
{
"name": "crawler-url-parser",
"version": "2.0.4",
"version": "2.0.5",
"description": "An `URL` parser for crawling purpose.",
"main": "crawler-url-parser.js",
"keywords": [
......@@ -36,8 +36,7 @@
"dependencies": {
"cheerio": "^1.0.0-rc.2",
"psl": "^1.1.20",
"url": "^0.11.0",
"url-clean": "1.0.2"
"url": "^0.11.0"
},
"devDependencies": {
"mocha": "^4.0.1",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment