Commit d61bbba1 authored by Rob Tomsick's avatar Rob Tomsick

Implement word-aware scoring

parent b9ed52f7
......@@ -482,14 +482,68 @@ implements DictionaryService
}
/**
* <p>
* Scoring algorithm for comparing two strings. Uses a word-aware Jaccard
* distance implementation. Words which appear in one input but not the
* other are ignored. The individual words are then scored, and the
* scores summed to determine the overall score. Each scored word will be
* scored twice: first based on its Jaccard distance to the word in its
* same position in the other input, and second based on the Jaccard
* distance between a phonetic hash of the word and a phonetic hash of its
* complementary word in the other input.
* </p>
*
* <p>If either or both inputs are empty or consist solely of whitespace,
* the resulting score will always be 0.</p>
*
* @param a first input string, not {@code null}
* @param b second input string, not {@code null}
*
* @return score, with 2n indicating a perfect text and phonetic match
* (where n is the number of words scored)
*/
private static final double
score(String a, String b)
{
if (a.length() == 0 || b.length() == 0)
if (StringUtils.isBlank(a) || StringUtils.isBlank(b))
{
return 0.0d;
}
/* use whitespace-aware shingling + scoring */
if (StringUtils.containsWhitespace(a) ||
StringUtils.containsWhitespace(b))
{
List<String> wsa = wordShingle(a, 1, null);
List<String> wsb = wordShingle(b, 1, null);
List<String> l = (wsa.size() < wsb.size()) ? wsa : wsb;
List<String> g = (wsa.size() >= wsb.size()) ? wsa : wsb;
/* compute intersection of l and g, where intersection is defined
* as an n-gram for which the jaccard distance is > 0.0.
*
* This implementation is O(nj) where n = len(l) and j = len(g).
*
* Yes, this can be improved. Quite a lot, actually.
*
* The inputs are also short enough that we don't care for the
* purposes of experimentation.
*/
double score = 0.0d;
for (String gs : g)
{
for (String ls : l)
{
score += score(gs, ls);
}
}
return score;
}
int shingleSize = Math.min(a.length(), b.length());
shingleSize = Math.min(shingleSize, 3);
......@@ -541,7 +595,7 @@ implements DictionaryService
.stream()
.filter(StringUtils :: isNotBlank)
.collect(Collectors.toCollection(ArrayList :: new));
Collector<CharSequence, ?, String> collector =
delim == null ? Collectors.joining() : Collectors.joining(delim);
......@@ -554,5 +608,4 @@ implements DictionaryService
return ngrams;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment