Commit b9ed52f7 authored by Rob Tomsick

Implement word-boundary shingling. (In prep for tiered, word-aware Jaccard scoring.)

parent 97156dbf
......@@ -39,6 +39,7 @@ import static org.jooq.impl.DSL.field;
import static org.jooq.impl.DSL.name;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
......@@ -47,10 +48,12 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.jooq.Condition;
import org.jooq.DSLContext;
......@@ -516,6 +519,40 @@ implements DictionaryService
}
return ngrams;
}
}
/**
 * Generate the list of word-level n-grams (shingles) for the given
 * input string.
 * <p>
 * The input is tokenized with
 * {@link StringUtils#splitByCharacterType(String)} and blank tokens are
 * discarded; every run of {@code n} consecutive remaining tokens is then
 * joined with {@code delim} to form one n-gram. Note that the tokenizer
 * splits wherever the character type changes, not only at whitespace, so
 * (for example) punctuation and digit runs become tokens of their own.
 *
 * @param a input string; {@code null} yields an empty list
 * @param n size of n-grams in tokens, must be at least 1
 * @param delim delimiting sequence, {@code null} for plain concatenation
 * @return list of n-grams each of the specified length, in input order;
 *         empty when the input holds fewer than {@code n} tokens
 * @throws IllegalArgumentException if {@code n} is less than 1
 */
private static final List<String>
wordShingle(String a, int n, String delim)
{
    // A non-positive size previously produced empty-string "n-grams"
    // (n == 0) or an opaque subList failure (n < 0); reject it up front.
    if (n < 1)
    {
        throw new IllegalArgumentException(
            "n-gram size must be at least 1, got " + n);
    }

    // splitByCharacterType(null) returns null, which cannot be streamed.
    if (a == null)
    {
        return new ArrayList<>();
    }

    List<String> tokens =
        Arrays.stream(StringUtils.splitByCharacterType(a))
              .filter(StringUtils :: isNotBlank)
              .collect(Collectors.toList());

    // Number of full windows of n tokens; zero or negative means none fit.
    int windows = tokens.size() - n + 1;
    List<String> ngrams = new ArrayList<>(Math.max(windows, 0));
    String separator = delim == null ? "" : delim;

    for (int i = 0; i < windows; i++)
    {
        ngrams.add(String.join(separator, tokens.subList(i, i + n)));
    }

    return ngrams;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment