Commit 6dacbb9d authored by Ludan Stoecklé's avatar Ludan Stoecklé

1.12.1

parent a1e713ca
......@@ -19,7 +19,7 @@ stages:
# variables etc.
variables:
ROSAENLG_VERSION: 1.12.0
ROSAENLG_VERSION: 1.12.1
DOCKER_REGISTRY: registry.gitlab.com/rosaenlg-projects/rosaenlg
DOCKER_CLI_ROOT: ${DOCKER_REGISTRY}/cli
DOCKER_SERVER_ROOT: ${DOCKER_REGISTRY}/server
......
{
"name": "browser-ide-demo",
"version": "1.12.0",
"version": "1.12.1",
"description": "Client side demo of NLG templates compilation and rendering using RosaeNLG",
"main": "index.js",
"scripts": {
......@@ -33,7 +33,7 @@
"gulp-inject-string": "^1.1.2",
"gulp-rename": "^1.4.0",
"merge-stream": "^2.0.0",
"rosaenlg": "1.12.0",
"rosaenlg": "1.12.1",
"vue-codemirror": "^4.0.6"
}
}
lib/
.nyc_output
test/
tsconfig.json
example.js
README.md
The MIT License (MIT)
Copyright (c) 2019, RosaeNLG.org, Ludan Stoecklé
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
# Content Deduplicate
**This module is deprecated.**
You can still use it, but `node-simhash` provides a better approach and more value.
See https://medium.com/@jonathankoren/near-duplicate-detection-b6694e807f7a and https://moz.com/devblog/near-duplicate-detection.
This module provides a distance function and various helpers to calculate the distance between strings. It is designed to prevent "duplicate content": avoiding having 2 texts which are too close to each other.
It can be used for SEO purposes or anything.
It is tailored for middle-sized strings, let's say 30 to 300 words.
Provided functions are:
- a specific distance function - it is the main added value of the module
- a helper function that finds out which texts are too close to a given text
- a clustering function, which groups close texts together
All functions are language dependent. Supported languages are French, English, Italian and German.
## Distance calculation
### How it works
The main function calculates the distance between two strings. It is tailored for middle-sized strings - 30 to 300 words. It is less "strict" than the Levenshtein distance, as the idea is to see how similar two strings look.
The algorithm is the following:
- stemming of both strings (which also removes punctuation and numbers)
- remove stop words
- each longest common substring counts for 1 (except the very first common one, to respect `dist(a,a) = 0`)
- each left element counts for 1
For example, `AAA BBB CCC KKK PPP OOO` to `ZZZ AAA BBB CCC PPP OOO` = 3:
- `AAA BBB CCC` is common => +0 (as it is the first match)
- left: `KKK PPP OOO` vs `ZZZ PPP OOO`
- `PPP OOO` is common => +1
- left: `KKK` vs `ZZZ` => +2
Mathematically it is almost a real distance:
- `d(a,b) = d(b,a)` is true
- `d(a,b) = 0 <=> a = b` is not really true as 2 strings can have a 0 distance even when they are different: for instance `he has 5` and `it has 6` have a 0 distance
- `d(a,c) <= d(a,b) + d(b,c)` is true
### Raw distance
Use `getDistanceRaw` to get that distance.
### Relative distance
It is sometimes more useful to get a relative distance: a percentage of how different two strings are. It uses the same algorithm but divides the result by the sum of the number of words of both strings:
- 1 => both strings are 100% different
- 0 => both strings are the same
- 0.3 => both strings are 30% different
Use `getDistancePourcentage` to get that distance.
### Example
Example:
```javascript
const contentDeduplicate = require('./dist/index.js');
// should be 2
console.log(
contentDeduplicate.getDistanceRaw('I eat huge quantities of vegetables', 'he eats huge quantities of meat', 'en_US'),
);
// should be 0.25
console.log(
contentDeduplicate.getDistancePourcentage(
'I eat huge quantities of vegetables',
'he eats huge quantities of meat',
null,
'en_US',
),
);
```
In `getDistancePourcentage`, the 3rd parameter is a threshold. If, while being calculated, the distance becomes greater than this threshold, calculation stops and 1 (100% of difference) is returned. This is used to improve speed - usually we care about close strings, but not about the exact distance of distant strings.
```javascript
console.log(
contentDeduplicate.getDistancePourcentage(
'I eat huge quantities of vegetables and I love wine, beer and pineapples',
'he eats huge quantities of meat and I love wine, coca-cola, and pineapples',
0.1,
'en_US',
),
);
```
will output 1: the real distance is not 1, but it is greater than the 0.1 threshold, so the calculation stops early and 1 is returned.
## Distance report
Often you have a list of strings, and want to check how close they are to each other.
`getDistanceReport` will calculate all distances and produce a JSON report containing, for each text, the closest ones, but also the most distant one.
*Computation time can become quite long: 1 minute for a few hundreds of strings.*
Parameters are the following:
- an array of textual objects; each object `must` have a `text` property containing its string; feel free to put other properties typically an ID
- the maximal acceptable distance: if the distance between two strings is greater than this threshold, then the pair will not be added to the list of the closest ones; use 0.2 for instance to only trigger when texts are 20% different or less
- the maximum number of closest strings to be given in the output (only the closest ones will be given)
- the language of the strings
The output is an array of objects:
- `for`: reference to the textual object
- `closestOnes`: an array with the closest elements; each object points to an element (`with` property) and gives the distance (`difference` property)
- `mostDifferent` is the most distant text (`with` and `difference` properties)
Example:
```javascript
const contentDeduplicate = require('./dist/index.js');
const toCompare = [
{
id: 1,
text: 'I eat huge quantities of vegetables',
},
{
id: 2,
text: 'he eats huge quantities of meat',
},
{
id: 3,
text: 'she is vegan',
},
];
console.log(JSON.stringify(contentDeduplicate.getDistanceReport(toCompare, 0.3, 5, 'en_US'), null, 1));
```
will output:
```json
[
{
"for": {
"id": 1,
"text": "I eat huge quantities of vegetables"
},
"closestOnes": [
{
"difference": 0.25,
"with": {
"id": 2,
"text": "he eats huge quantities of meat"
}
}
],
"mostDifferent": {
"with": {
"id": 3,
"text": "she is vegan"
},
"difference": 1
}
},
{
"for": {
"id": 2,
"text": "he eats huge quantities of meat"
},
"closestOnes": [
{
"difference": 0.25,
"with": {
"id": 1,
"text": "I eat huge quantities of vegetables"
}
}
],
"mostDifferent": {
"with": {
"id": 3,
"text": "she is vegan"
},
"difference": 1
}
},
{
"for": {
"id": 3,
"text": "she is vegan"
},
"closestOnes": [],
"mostDifferent": {
"with": {
"id": 2,
"text": "he eats huge quantities of meat"
},
"difference": 1
}
}
]
```
## Clustering
Use `getClusters` to cluster your texts, thanks to `k-medoids` lib.
Input:
- an array of `Text` objects; each element must have a `text` property, and you can also use an ID or something to know which are the texts
- the number of clusters (it is not discovered automatically by this method)
- a language
Output: array of clusters.
Example:
```javascript
const contentDeduplicate = require('./dist/index.js');
const toCompare = [
{
id: 1,
text: 'I eat huge quantities of vegetables',
},
{
id: 2,
text: 'he eats huge quantities of meat',
},
{
id: 3,
text: 'she is vegan',
},
];
console.log(JSON.stringify(contentDeduplicate.getClusters(toCompare, 2, 'en_US'), null, 1));
```
will output 2 clusters:
```json
[
[
{
"id": 3,
"text": "she is vegan"
}
],
[
{
"id": 1,
"text": "I eat huge quantities of vegetables"
},
{
"id": 2,
"text": "he eats huge quantities of meat"
}
]
]
```
## Performance and cache
When using `getDistanceReport` and `getClusters`, 2 caches are used to avoid:
- preparing (stemming, removing stop words, etc.) the same string multiple times
- recalculating already computed distances
// Demo script: calls each public helper of the built module and prints the results to stdout.
const contentDeduplicate = require('./dist/index.js');

const LANG = 'en_US';
const vegetablesText = 'I eat huge quantities of vegetables';
const meatText = 'he eats huge quantities of meat';

// Raw distance between the two sentences - should be 2
console.log(contentDeduplicate.getDistanceRaw(vegetablesText, meatText, LANG));

// Relative distance without any threshold (null) - should be 0.25
console.log(contentDeduplicate.getDistancePourcentage(vegetablesText, meatText, null, LANG));

// Relative distance with a 0.1 threshold - should be 1
console.log(
  contentDeduplicate.getDistancePourcentage(
    'I eat huge quantities of vegetables and I love wine, beer and pineapples',
    'he eats huge quantities of meat and I love wine, coca-cola, and pineapples',
    0.1,
    LANG,
  ),
);

// The same three texts feed both the distance report and the clustering demo.
const samples = [
  { id: 1, text: vegetablesText },
  { id: 2, text: meatText },
  { id: 3, text: 'she is vegan' },
];

const report = contentDeduplicate.getDistanceReport(samples, 0.3, 5, LANG);
console.log(JSON.stringify(report, null, 1));

const clustered = contentDeduplicate.getClusters(samples, 2, LANG);
console.log(JSON.stringify(clustered, null, 1));
import { Clusterer } from 'k-medoids';
import { getDistanceRaw, Distance, CacheDistMap, EncodedMap } from './distance';
import { Text } from './distanceReport';
import { Languages } from 'synonym-optimizer';
/**
 * Groups texts into a fixed number of clusters using the `k-medoids` clusterer
 * and the raw distance between texts.
 *
 * A progress line is written to the console before clustering starts.
 *
 * @param texts - objects to cluster; the `text` property of each one is compared
 * @param clusters - number of clusters to build (it is not discovered automatically)
 * @param lang - language of the texts, forwarded to `getDistanceRaw`
 * @returns the clustered data as produced by the clusterer
 * @throws Error when `texts` is missing or empty, or when `clusters` is falsy
 */
export function getClusters(texts: Text[], clusters: number, lang: Languages): {}[][] {
  if (!texts || texts.length === 0) {
    // Pass the message to the constructor so that it also appears in the stack trace.
    throw new Error(`you must provide some texts`);
  }
  if (!clusters) {
    throw new Error(`number of clusters is mandatory`);
  }

  console.log(`clustering, number of texts: ${texts.length} in ${clusters} clusters`);

  // Both caches live for this single call and are shared by every distance evaluation.
  const cacheDistMap: CacheDistMap = new Map<string, Map<string, Distance>>();
  const encodedMap: EncodedMap = new Map<string, string[]>();

  // Distance callback handed to the clusterer.
  function distanceFct(t1: Text, t2: Text): number {
    return getDistanceRaw(t1.text, t2.text, lang, cacheDistMap, encodedMap);
  }

  const myClusterer = Clusterer.getInstance(texts, clusters, distanceFct);
  return myClusterer.getClusteredData();
}
import { getStandardStopWords, getStemmedWords, Languages } from 'synonym-optimizer';
/** Result of a longest-common-substring search over two token arrays. */
interface FoundCommon {
// the common tokens, in order (empty when nothing is shared)
found: string[];
// number of tokens in the common run (0 when nothing is shared)
length: number;
// index in the first array where the run starts, or -1 when nothing is shared
indexS1: number;
// index in the second array where the run starts, or -1 when nothing is shared
indexS2: number;
}
/** A computed distance together with the value used to normalise it. */
export interface Distance {
// computed distance; Infinity when the calculation was abandoned because the threshold was exceeded
val: number;
// sum of the token counts of both prepared strings
max: number;
}
/** Two-level cache of computed distances: first string => second string => distance. */
export type CacheDistMap = Map<string, Map<string, Distance>>;
/** Cache of prepared token arrays, keyed by the original string. */
export type EncodedMap = Map<string, string[]>;
/* https://github.com/trekhleb/javascript-algorithms/blob/master/src/algorithms/string/longest-common-substring/longestCommonSubstring.js
is MIT
*/
/**
 * Finds the longest run of consecutive elements that appears in both arrays.
 *
 * Elements are compared with strict equality. When several runs share the
 * maximal length, the first one met while scanning `s2` (outer loop) then
 * `s1` (inner loop) is kept.
 *
 * @param s1 - first token array
 * @param s2 - second token array
 * @returns the common run, its length and its start index in each array;
 *   an empty run with both indexes set to -1 when the arrays share nothing
 */
function longestCommonSubstring(s1: string[], s2: string[]): FoundCommon {
  const rowCount = s2.length + 1;
  const colCount = s1.length + 1;

  // table[r][c] holds the length of the common run ending at s2[r - 1] and s1[c - 1].
  // Row 0 and column 0 stay at zero and act as the initial values.
  const table: number[][] = Array.from({ length: rowCount }, () => new Array<number>(colCount).fill(0));

  let bestLength = 0;
  let bestRow = 0;
  let bestCol = 0;

  for (let r = 1; r < rowCount; r++) {
    for (let c = 1; c < colCount; c++) {
      if (s1[c - 1] !== s2[r - 1]) {
        // Mismatch: the run is broken, the cell keeps its zero.
        continue;
      }
      const runLength = table[r - 1][c - 1] + 1;
      table[r][c] = runLength;
      // Strict comparison so that the earliest maximal run wins ties.
      if (runLength > bestLength) {
        bestLength = runLength;
        bestRow = r;
        bestCol = c;
      }
    }
  }

  if (bestLength === 0) {
    // Nothing in common.
    return { found: [], length: bestLength, indexS1: -1, indexS2: -1 };
  }

  // The best cell marks the END of the run; step back by its length to get the start.
  const startS1 = bestCol - bestLength;
  return {
    found: s1.slice(startS1, bestCol),
    length: bestLength,
    indexS1: startS1,
    indexS2: bestRow - bestLength,
  };
}
/**
 * Computes the distance between two token arrays by repeatedly removing their
 * longest common run.
 *
 * Each removed run costs `addNext` for the first one and 1 for every following
 * one; once nothing is shared any more, every remaining token of both arrays
 * costs 1.
 *
 * Both arrays are modified in place: matched tokens are overwritten with `_`
 * in `s1` and with `%` in `s2`, so that they can neither match again nor be
 * counted as leftovers.
 *
 * @param s1 - first token array (mutated)
 * @param s2 - second token array (mutated)
 * @param addNext - cost of the first common run found
 * @param distMaxAbsoluteLeft - remaining budget; null/undefined disables the check
 * @returns the distance, or Infinity as soon as the budget becomes negative
 */
function distanceHelper(s1: string[], s2: string[], addNext: number, distMaxAbsoluteLeft: number): number {
  let accumulated = 0;
  let runCost = addNext;
  let budgetLeft: number | null = distMaxAbsoluteLeft;

  for (;;) {
    if (budgetLeft != null && budgetLeft < 0) {
      // Already too far: give up, the exact value does not matter to the caller.
      return Infinity;
    }

    const common: FoundCommon = longestCommonSubstring(s1, s2);
    if (common.length === 0) {
      // Nothing shared any more: every token that was never matched counts for 1.
      const leftInS1 = s1.filter(token => token !== '_').length;
      const leftInS2 = s2.filter(token => token !== '%').length;
      return accumulated + (leftInS1 + leftInS2);
    }

    // Blank out the matched run on both sides, using different placeholders
    // so that the two blanked zones never match each other.
    for (let offset = 0; offset < common.length; offset++) {
      s1[common.indexS1 + offset] = `_`;
      s2[common.indexS2 + offset] = '%';
    }

    accumulated += runCost;
    budgetLeft = budgetLeft != null ? budgetLeft - runCost : null;
    runCost = 1; // every run after the first one costs 1
  }
}
// const encodedMap = new Map<string, string[]>();
// const distMap = new Map<string, Map<string, Distance>>();
/**
 * Computes the distance between two raw strings, with optional caching.
 *
 * Each string is first prepared with `getStemmedWords`, using the standard stop
 * words of `lang`; the resulting token arrays are then compared.
 *
 * NOTE: a cached distance is returned whatever `distMaxPc` it was originally
 * computed with, so callers sharing a `cacheDistMap` should keep the same
 * threshold across calls.
 *
 * @param s1 - first string
 * @param s2 - second string
 * @param distMaxPc - relative threshold (multiplied by `max` to get an absolute
 *   budget); a falsy value (null, undefined, 0) disables the threshold
 * @param lang - language of both strings
 * @param cacheDistMap - optional cache of computed distances, filled symmetrically
 * @param encodedMap - optional cache of prepared token arrays
 * @returns the distance value and the total token count of both prepared strings
 * @throws Error when `lang` is falsy
 */
function getDistance(
  s1: string,
  s2: string,
  distMaxPc: number,
  lang: Languages,
  cacheDistMap: CacheDistMap,
  encodedMap: EncodedMap,
): Distance {
  if (!lang) {
    // Pass the message to the constructor so that it also appears in the stack trace.
    throw new Error(`lang is mandatory`);
  }

  // Returns the prepared tokens of a string, going through encodedMap when provided.
  function getPrepared(s: string): string[] {
    const cached = encodedMap ? encodedMap.get(s) : undefined;
    if (cached) {
      return cached;
    }
    const encoded = getStemmedWords(s, getStandardStopWords(lang), lang);
    if (encodedMap) {
      encodedMap.set(s, encoded);
    }
    return encoded;
  }

  // Per-string rows of the distance cache; they stay undefined when no cache is used.
  let rowS1: Map<string, Distance> | undefined;
  let rowS2: Map<string, Distance> | undefined;
  if (cacheDistMap) {
    rowS1 = cacheDistMap.get(s1);
    if (!rowS1) {
      rowS1 = new Map<string, Distance>();
      cacheDistMap.set(s1, rowS1);
    }
    rowS2 = cacheDistMap.get(s2);
    if (!rowS2) {
      rowS2 = new Map<string, Distance>();
      cacheDistMap.set(s2, rowS2);
    }
    const cachedDist = rowS1.get(s2);
    if (cachedDist) {
      return cachedDist;
    }
  }

  // Work on copies: distanceHelper overwrites matched tokens in place and the
  // originals may be shared through encodedMap.
  const prepared1 = [...getPrepared(s1)];
  const prepared2 = [...getPrepared(s2)];

  const max = prepared1.length + prepared2.length;
  const distance: Distance = {
    val: distanceHelper(prepared1, prepared2, 0, distMaxPc ? distMaxPc * max : null),
    max: max,
  };

  if (rowS1 && rowS2) {
    // Store both directions so that a later lookup in either order hits the cache.
    rowS1.set(s2, distance);
    rowS2.set(s1, distance);
  }
  return distance;
}
export function getDistancePourcentage(
s1: string,
s2: string,
distMaxPc: number,
lang: Languages,
cacheDistMap: CacheDistMap,
encodedMap: EncodedMap,