Commit bd6ae6e6 authored by Rob Tomsick

Implement fuzzy hash search for drug names

parent 79b4b8cd
......@@ -39,9 +39,11 @@ import static org.jooq.impl.DSL.field;
import static org.jooq.impl.DSL.name;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
......@@ -50,7 +52,10 @@ import java.util.Set;
import java.util.UUID;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.jooq.DSLContext;
import org.jooq.Field;
......@@ -71,6 +76,7 @@ public class NDCDictionaryService
implements DictionaryService
{
private static final int RESULT_LIMIT = 1000;
private static final int HASH_DISTANCE = 3;
private final DSLContext ctx;
private final Dictionary dictionary;
......@@ -154,20 +160,46 @@ implements DictionaryService
final Field<String> nf = field(name("name"), String.class);
final List<Drug> drugs = this.ctx
final Set<Drug> drugs = this.ctx
.select(nf)
.from(N_TABLE)
.where(nf.containsIgnoreCase(query))
.orderBy(nf.asc())
.limit(limit)
.fetch(r -> new Drug(r.get(nf).toUpperCase()))
.stream()
.distinct()
.collect(Collectors.toCollection(ArrayList :: new));
.collect(Collectors.toCollection(HashSet :: new));
drugs.sort(comparatorFor(query, d -> d.canonicalName()));
/* fuzzy hash search */
return drugs;
final Iterator<List<String>> hi =
partitionByLength(hashFuzz(query, HASH_DISTANCE)).iterator();
while (drugs.size() < limit && hi.hasNext())
{
List<String> hashes = hi.next();
/* let's ignore anything < 3 chars since hashing gets useless at
* that point
*/
if (hashes.get(0).length() < 3)
{
break;
}
this.ctx
.select(nf)
.from(N_TABLE)
.where(field(name("phonetic_hash"), String.class).in(hashes))
.limit(limit - drugs.size())
.fetch(nf)
.stream()
.map(n -> new Drug(n.toUpperCase()))
.forEach(drugs :: add);
}
return drugs.stream()
.sorted(comparatorFor(query, d -> d.canonicalName()))
.collect(Collectors.toList());
}
@Override
......@@ -176,16 +208,13 @@ implements DictionaryService
{
limit = limit > 0 ? limit : this.resultLimit();
LinkedHashSet<UUID> ids = new LinkedHashSet<>();
query = query + "%";
LinkedHashSet<UUID> ids = new LinkedHashSet<>();
/* search by prop name, then NPN, then substance */
ids.addAll(this.ctx.select(field(name("id"), UUID.class))
.from(D_TABLE)
.where(field(name("proprietary_name"), String.class).likeIgnoreCase(query))
.where(field(name("proprietary_name"), String.class).containsIgnoreCase(query))
.orderBy(field(name("proprietary_name"), String.class).asc())
.limit(limit)
.fetch(field(name("id"), UUID.class)));
......@@ -201,7 +230,7 @@ implements DictionaryService
ids.addAll(
this.ctx.select(ide)
.from(N_TABLE)
.where(nf.likeIgnoreCase(query))
.where(nf.containsIgnoreCase(query))
.orderBy(nf.asc())
.limit(limit - ids.size())
.fetch(ide));
......@@ -214,7 +243,7 @@ implements DictionaryService
ids.addAll(
this.ctx.select(ide)
.from(S_TABLE)
.where(nf.likeIgnoreCase(query))
.where(nf.containsIgnoreCase(query))
.orderBy(nf.asc())
.limit(limit - ids.size())
.fetch(ide));
......@@ -330,9 +359,56 @@ implements DictionaryService
comparatorFor(String reference, Function<T, String> accessor)
{
return (a, b) ->
StringUtils.getFuzzyDistance(reference, accessor.apply(a), Locale.ENGLISH)
StringUtils.getFuzzyDistance(reference, accessor.apply(b), Locale.ENGLISH)
-
StringUtils.getFuzzyDistance(reference, accessor.apply(b), Locale.ENGLISH);
StringUtils.getFuzzyDistance(reference, accessor.apply(a), Locale.ENGLISH);
}
/**
 * Produces the distinct candidate hash strings for a search term.
 *
 * The term is hashed with {@code PhoneticHash.hash}; the resulting hash is
 * emitted first, followed by every variant that
 * {@code permute} derives from it for the given distance. Repeated strings
 * are dropped, keeping the first occurrence.
 *
 * @param str the text to hash
 * @param distance passed through to {@code permute} unchanged
 * @return a stream of distinct hash strings, starting with the unmodified
 * hash
 */
private static final Stream<String>
hashFuzz(final String str, final int distance)
{
    final String hashed = PhoneticHash.hash(str);
    final char[] seed = hashed.toCharArray();

    /* the untouched hash always leads, whatever permute yields */
    final Stream<char[]> candidates =
        Stream.concat(Stream.of(seed), permute(seed, distance));

    return candidates
        .map(chars -> String.valueOf(chars))
        .distinct();
}
/**
 * Expands a character array into variants obtained by removing characters.
 *
 * <ul>
 * <li>If {@code input} is shorter than {@code distance}, nothing is
 * produced.</li>
 * <li>If {@code distance} is below 1, only {@code input} itself is
 * produced.</li>
 * <li>Otherwise, for every index of {@code input}, the stream contains
 * {@code input} followed by the expansion (with {@code distance - 1})
 * of the array that has that index removed.</li>
 * </ul>
 *
 * The result is not de-duplicated: {@code input} is emitted once per
 * removed index, and the arrays are distinct objects even when their
 * contents match.
 *
 * @param input the characters to derive variants from; not modified
 * @param distance the number of removal levels to descend
 * @return a stream of character arrays as described above
 */
private static final Stream<char[]>
permute(char[] input, int distance)
{
    if (distance > input.length)
    {
        return Stream.empty();
    }
    if (distance <= 0)
    {
        return Stream.of(input);
    }

    final int remaining = distance - 1;

    /* each shortened array is preceded by the array it came from */
    final Function<char[], Stream<char[]>> expand =
        shorter -> Stream.concat(Stream.of(input),
                                 permute(shorter, remaining));

    return IntStream.range(0, input.length)
        .mapToObj(idx -> ArrayUtils.remove(input, idx))
        .flatMap(expand);
}
/**
 * Convenience overload: groups the strings of a collection by length by
 * streaming the collection into the {@code Stream}-based overload.
 *
 * @param col the strings to group
 * @return whatever the {@code Stream}-based overload returns for the
 * collection's elements
 */
private static final List<List<String>>
partitionByLength(Collection<String> col)
{
    final Stream<String> elements = col.stream();
    return partitionByLength(elements);
}
/**
 * Groups strings by their length, longest group first.
 *
 * Every string of the same length lands in the same inner list, in the
 * order the stream delivered them. The inner lists are ordered by
 * descending string length.
 *
 * @param stream the strings to group; consumed by this call
 * @return one list per distinct length, ordered from longest to shortest;
 * empty when the stream is empty
 */
private static final List<List<String>>
partitionByLength(Stream<String> stream)
{
    return stream
        .collect(Collectors.groupingBy(String :: length))
        .entrySet().stream()
        /* descending by length: compare the right-hand key first */
        .sorted((left, right) ->
            Integer.compare(right.getKey(), left.getKey()))
        .map(group -> group.getValue())
        .collect(Collectors.toList());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment