...
 
Commits (2)
......@@ -41,15 +41,17 @@ import static org.jooq.impl.DSL.name;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.function.Function;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.Stream.Builder;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
......@@ -235,67 +237,94 @@ implements DictionaryService
public List<Product>
findProducts(final String query, int limit)
{
limit = limit > 0 ? limit : this.resultLimit();
if (query.length() < MIN_QUERY_LEN)
{
return Collections.emptyList();
}
final LinkedHashSet<UUID> ids = new LinkedHashSet<>();
final Field<String> hashField = field(name("phonetic_hash"), String.class);
/* search by prop name, then NPN, then substance */
ids.addAll(this.ctx.select(field(name("id"), UUID.class))
.from(D_TABLE)
.where(field(name("proprietary_name"), String.class).containsIgnoreCase(query))
.fetch(field(name("id"), UUID.class)));
/* fuzzy hash search over proprietary names */
final List<List<String>> hashes =
partitionHashesByLength(hashFuzz(query, HASH_DISTANCE, MIN_FUZZY_HASH_LEN));
/* FIXME - "valsartin" produces poor result ordering
*
* This search approach has one big hole:
*
* "valsartin" produces a top result of
* "Valsartan and hydrochlorothiazide" and not "Valsartan" as it
* should. Without debugging this, my guess is that both hits are
* scored the same because we're looking at proprietary name, but
* ignoring the fact that a multi-ingredient product should be ranked
* lower than a single-ingredient product in cases where we have only
* one input word.
*/
for (final List<String> hashGroup : hashes)
if (StringUtils.isEmpty(query) || limit < 1)
{
return Collections.emptyList();
}
if (query.length() < MIN_QUERY_LEN)
{
if (ids.size() >= limit)
{
break;
}
Condition cond = DSL.falseCondition();
for (String hash : hashGroup)
{
cond = cond.or(hashField.like(hash));
}
return Collections.emptyList();
}
ids.addAll(this.ctx
.select(field(name("id"), UUID.class))
.from(D_TABLE)
.where(cond)
.fetch(field(name("id"), UUID.class)));
final String term = query.trim();
/* search NPNs */
final Field<UUID> idField = field(name("id"), UUID.class);
ids.addAll(this.ctx
.select(field(name("id__entries"), UUID.class))
.from(N_TABLE)
.where(cond)
.fetch(field(name("id__entries"), UUID.class)));
}
List<NDCProduct> products = this.loadProducts(new ArrayList<>(ids));
return products
.parallelStream()
.map(r -> Pair.of(r, scoreProduct(r, query)))
.filter(r -> r.getRight() >= 0.5d)
.sorted((a, b) -> (int) Math.round((b.getRight() - a.getRight()) * 1000))
.limit(limit)
.map(p -> p.getLeft())
.collect(Collectors.toList());
final Function<NDCProduct, Pair<NDCProduct, Double>> scoreMapper =
product -> Pair.of(product, scoreProduct(product, term));
final Comparator<Pair<?, Double>> pairSort =
(a, b) -> (int) Math.round((b.getRight() - a.getRight()) * 1000);
/* Establish a fallback limit. Since trivially-short strings are likely
* to produce huge numbers of mostly-irrelevant results, we limit our
* queries to fewer results. This prevents us from burning a ton of
* time finding everything with, say "a" in the product name, while
* still allowing us to have plenty of results for longer queries
* (where matches are likely to be more significant).
*/
final int fbLimit = term.length() * 250;
final List<String> termWords =
Arrays.asList(StringUtils.split(term.toUpperCase()));
/* generate a stream of our search conditions (for latter mapping to stages) */
Builder<Condition> builder = Stream.builder();
/* exact term search */
builder.accept(field(name("proprietary_name"), String.class).containsIgnoreCase(term));
/* exact phonetic search */
builder.accept(field(name("phonetic_hash"), String.class)
.containsIgnoreCase(PhoneticHash.hash(term)));
/* pos-independent word search */
builder.accept(termWords.stream()
.map(t -> field(name("proprietary_name")).containsIgnoreCase(t))
.reduce(DSL.trueCondition(), (a, b) -> a.and(b)));
/* all word phonetic search */
builder.accept(termWords
.stream()
.map(PhoneticHash :: hash)
.map(t -> field(name("phonetic_hash")).containsIgnoreCase(t))
.reduce(DSL.trueCondition(), (a, b) -> a.and(b)));
return builder
.build()
.parallel()
.flatMap(c ->
this.loadProducts(
this.ctx.select(idField)
.from(D_TABLE)
.where(c)
.limit(fbLimit)
.fetch(idField))
.stream()
.map(scoreMapper)
.sorted(pairSort)
.limit(limit)
)
.unordered()
.distinct()
.sorted(pairSort)
.limit(limit)
.map(p -> p.getLeft())
.collect(Collectors.toList());
}
@Override
......