Commit b104a1a0 authored by Rob Tomsick

Experiment further with fuzzy phonetic searching

parent 7e3bcd48
......@@ -52,13 +52,14 @@ import java.util.Set;
import java.util.UUID;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.jooq.Condition;
import org.jooq.DSLContext;
import org.jooq.Field;
import org.jooq.impl.DSL;
import edu.unc.cscc.crxrest.Dictionary;
import edu.unc.cscc.crxrest.DictionaryService;
......@@ -204,51 +205,104 @@ implements DictionaryService
/**
 * Finds products matching a free-text query.
 *
 * Product ids are gathered into an insertion-ordered set from several
 * lookups in turn: a case-insensitive substring match on
 * "proprietary_name" in D_TABLE, a case-insensitive substring match on
 * "name" in N_TABLE, LIKE matches of fuzzed phonetic hashes against
 * "phonetic_hash" in both D_TABLE and N_TABLE, and finally a
 * case-insensitive substring match on "name" in S_TABLE.  The collected
 * ids are loaded, sorted with comparatorFor(query, ...), and the first
 * {@code limit} entries are returned.
 *
 * NOTE(review): this span looks like a diff rendering in which removed and
 * added lines are interleaved (e.g. two signature lines and two
 * declarations of {@code ids}); confirm which lines are live before
 * relying on the description above.
 *
 * @param query text to search for
 * @param limit maximum number of results; values that are not positive
 *              are replaced by {@code this.resultLimit()}
 * @return the matching products
 */
@Override
public List<Product>
findProducts(String query, int limit)
findProducts(final String query, int limit)
{
/* a non-positive limit falls back to the configured result limit */
limit = limit > 0 ? limit : this.resultLimit();
LinkedHashSet<UUID> ids = new LinkedHashSet<>();
/* LinkedHashSet: keeps first-seen order and drops ids found twice */
final LinkedHashSet<UUID> ids = new LinkedHashSet<>();
final Field<String> hashField = field(name("phonetic_hash"), String.class);
/* search by prop name, then NPN, then substance */
ids.addAll(this.ctx.select(field(name("id"), UUID.class))
.from(D_TABLE)
.where(field(name("proprietary_name"), String.class).containsIgnoreCase(query))
.orderBy(field(name("proprietary_name"), String.class).asc())
.limit(limit)
.fetch(field(name("id"), UUID.class)));
/* enough hits from proprietary names alone: load those and stop */
if (ids.size() >= limit)
{
return this.loadProducts(new ArrayList<>(ids).subList(0, limit));
}
final Field<UUID> ide = field(name("id__entries"), UUID.class);
final Field<String> nf = field(name("name"), String.class);
/* substring match on N_TABLE names, capped at the remaining budget */
ids.addAll(
this.ctx.select(ide)
.from(N_TABLE)
.where(nf.containsIgnoreCase(query))
.orderBy(nf.asc())
.limit(limit - ids.size())
.fetch(ide));
if (ids.size() >= limit)
/* fuzzy hash search over proprietary names */
final List<List<String>> hashes =
partitionByLength(hashFuzz(query, HASH_DISTANCE, MIN_FUZZY_HASH_LEN));
// /* first pass for exact matches for non-wildcard hashes */
// for (List<String> hashGroup : hashes)
// {
// if (ids.size() >= limit)
// {
// break;
// }
// Condition cond = DSL.falseCondition();
// for (String hash : hashGroup)
// {
// if (! hash.contains("_"))
// {
// cond = cond.or(hashField.eq(hash));
// }
// }
// ids.addAll(this.ctx
// .select(field(name("id"), UUID.class))
// .from(D_TABLE)
// .where(cond)
// .fetch(field(name("id"), UUID.class)));
// }
/*
 * One pair of queries per hash group; groups are visited in the order
 * returned by partitionByLength and the loop stops once enough ids exist.
 */
for (List<String> hashGroup : hashes)
{
return this.loadProducts(new ArrayList<>(ids).subList(0, limit));
if (ids.size() >= limit)
{
break;
}
/* start from FALSE and OR in one LIKE per hash in the group */
Condition cond = DSL.falseCondition();
for (String hash : hashGroup)
{
cond = cond.or(hashField.like(hash));
}
/*
 * NOTE(review): this fetch (and the N_TABLE one below) has no limit(),
 * so ids can grow past `limit` inside a single iteration.
 */
ids.addAll(this.ctx
.select(field(name("id"), UUID.class))
.from(D_TABLE)
.where(cond)
.fetch(field(name("id"), UUID.class)));
/* search NPNs */
/* same OR-of-LIKE condition, rebuilt and applied to N_TABLE */
cond = DSL.falseCondition();
for (String hash : hashGroup)
{
cond = cond.or(hashField.like(hash));
}
ids.addAll(this.ctx
.select(field(name("id__entries"), UUID.class))
.from(N_TABLE)
.where(cond)
.fetch(field(name("id__entries"), UUID.class)));
}
/*
 * Substring match on S_TABLE names.
 * NOTE(review): if the unbounded fuzzy fetches above pushed ids.size()
 * past `limit`, the value passed to limit() here is negative -- confirm
 * how jOOQ handles that.
 */
ids.addAll(
this.ctx.select(ide)
.from(S_TABLE)
.where(nf.containsIgnoreCase(query))
.orderBy(nf.asc())
.limit(limit - ids.size())
.fetch(ide));
return this.loadProducts(new ArrayList<>(ids));
List<NDCProduct> products = this.loadProducts(new ArrayList<>(ids));
/* rank each product by whichever of its names scores best against the query */
products.sort(comparatorFor(query, p -> {
/* prop name and NPNs - use whatever's closest to query */
List<String> names = new ArrayList<>(p.nonProprietaryNames());
names.add(p.name());
/*
 * b minus a: names with a higher getJaroWinklerDistance value sort first.
 * The difference is scaled by 1000 and truncated to int, so differences
 * below 0.001 compare as equal.
 */
names.sort((a, b) ->
(int) ((StringUtils.getJaroWinklerDistance(query, b)
-
StringUtils.getJaroWinklerDistance(query, a)) * 1000d));
return names.get(0);
}));
/*
 * NOTE(review): subList(0, limit) throws IndexOutOfBoundsException when
 * fewer than `limit` products were loaded; nothing above guarantees
 * products.size() >= limit.
 */
return new ArrayList<Product>(products.subList(0, limit));
}
@Override
......@@ -271,7 +325,7 @@ implements DictionaryService
.limit(limit)
.fetch(field(name("id"), UUID.class));
return this.loadProducts(ids);
return new ArrayList<Product>(this.loadProducts(ids));
}
@Override
......@@ -281,7 +335,7 @@ implements DictionaryService
return Collections.emptyList();
}
private final List<Product>
private final List<NDCProduct>
loadProducts(final List<UUID> ids)
{
if (ids.isEmpty())
......@@ -358,38 +412,77 @@ implements DictionaryService
/**
 * Builds a comparator that orders items by how well the string produced by
 * {@code accessor} scores against {@code reference}.
 *
 * NOTE(review): this span looks like a diff rendering that interleaves a
 * removed implementation (the getFuzzyDistance expression) with an added
 * one (the block lambda below); confirm which lines are live.
 *
 * @param reference string every item is scored against
 * @param accessor  extracts the string to score from an item
 * @return comparator placing higher-scoring items first
 */
private static final <T> Comparator<T>
comparatorFor(String reference, Function<T, String> accessor)
{
/* b minus a: items with the larger getFuzzyDistance score sort first */
return (a, b) ->
StringUtils.getFuzzyDistance(reference, accessor.apply(b), Locale.ENGLISH)
-
StringUtils.getFuzzyDistance(reference, accessor.apply(a), Locale.ENGLISH);
/* phonetic hash of the reference, computed once per comparator */
final String hash = PhoneticHash.hash(reference);
return (a, b) ->
{
/*
 * TODO?
 *
 * We get JW distance, and use that to adjust the weight of
 * the lev distance between the hashes. The idea is that
 * the distance in hashes become more important for strings
 * that are further from reference.
 */
final String ah = PhoneticHash.hash(accessor.apply(a));
final String bh = PhoneticHash.hash(accessor.apply(b));
double jwa = StringUtils.getJaroWinklerDistance(reference, accessor.apply(a));
double jwb = StringUtils.getJaroWinklerDistance(reference, accessor.apply(b));
/*
 * NOTE(review): despite the "lev" names these are also computed with
 * getJaroWinklerDistance, and neither value (nor ah/bh/hash) feeds into
 * the result below -- the weighting described in the TODO is not
 * implemented yet.
 */
double leva = StringUtils.getJaroWinklerDistance(hash, ah);
double levb = StringUtils.getJaroWinklerDistance(hash, bh);
double ascore = jwa;
double bscore = jwb;
/*
 * Each score is scaled by 1000 and truncated to int before subtracting;
 * b minus a puts the higher score first.
 */
return (int) (bscore * 1000d) - (int) (ascore * 1000d);
};
}
/**
 * Produces the phonetic hash of a string together with fuzzed variants of
 * that hash, as strings.
 *
 * NOTE(review): this span looks like a diff rendering that interleaves a
 * removed implementation (parameter {@code str}, helper {@code permute})
 * with an added one (parameter {@code input}, helper {@code hashPermute});
 * confirm which lines are live.
 *
 * @param distance   passed through to the permutation helper
 * @param minHashLen passed through to the permutation helper
 * @return stream of hash strings
 */
private static final Stream<String>
hashFuzz(final String str, final int distance, final int minHashLen)
hashFuzz(final String input, final int distance, final int minHashLen)
{
final char[] fh = PhoneticHash.hash(str).toCharArray();
/* this variant converts to String first, so distinct() compares by content */
return Stream.concat(Stream.of(fh), permute(fh, distance, minHashLen))
.map(String :: valueOf)
.distinct();
/*
 * NOTE(review): here distinct() runs on char[] elements, which compare by
 * identity, so equal-content hashes are not removed; mapping to String
 * before distinct() (as in the variant above) would deduplicate them.
 */
return hashPermute(PhoneticHash.hash(input).toCharArray(), distance, minHashLen)
.distinct()
.map(String :: valueOf);
}
/**
 * Generates variants of a hash: the hash itself, copies with a single
 * position replaced by '_', and (recursively) the variants of every
 * sub-hash obtained by removing one character.
 *
 * NOTE(review): this span looks like a diff rendering that interleaves a
 * removed implementation ({@code permute}, stream-based) with an added one
 * ({@code hashPermute}, builder-based); confirm which lines are live.
 *
 * @param hash       hash characters to permute
 * @param distance   number of further character removals still allowed
 * @param minHashLen hashes shorter than this produce no output and hashes
 *                   of exactly this length are not shortened further
 * @return stream of hash variants; may contain equal-content duplicates
 */
private static final Stream<char[]>
permute(char[] input, int distance, int minLen)
hashPermute(final char[] hash, final int distance, final int minHashLen)
{
if (input.length < distance || input.length < minLen)
/* too short to be a usable hash: contribute nothing */
if (hash.length < minHashLen)
{
return Stream.empty();
}
else if (distance < 1 || input.length == minLen)
Stream.Builder<char[]> builder = Stream.builder();
/* the unmodified hash is always part of the output */
builder.accept(hash);
for (int i = 0; i < hash.length; i++)
{
return Stream.of(input);
/* do wildcard permutations */
/*
 * NOTE(review): emitted for every non-'_' position regardless of
 * `distance`.  '_' presumably acts as the single-character wildcard when
 * the hash is used as a LIKE pattern -- confirm against the caller.
 */
if (hash[i] != '_')
{
char[] w = ArrayUtils.clone(hash);
w[i] = '_';
builder.accept(w);
}
/* do sub-hash permutations if we can */
/*
 * Drop character i and recurse with one less removal allowed; the
 * recursive call also emits the shortened hash itself, so different
 * removal paths can yield equal-content arrays.
 */
if (hash.length > minHashLen && distance > 0)
{
hashPermute(ArrayUtils.remove(hash, i), distance - 1, minHashLen)
.forEach(builder :: accept);
}
}
return IntStream.range(0, input.length)
.mapToObj(i -> ArrayUtils.remove(input, i))
.flatMap(h -> Stream.concat(Stream.of(input),
permute(h, distance - 1, minLen)));
return builder.build();
}
private static final List<List<String>>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment