Commit 6c4ff3d9 authored by Florian Schäfer's avatar Florian Schäfer

Fix the validator that checks for unusual classes of Wikidata items

Stop splitting the check on so many requests, which drastically improves reliability of the query.
parent 5659dca1
Pipeline #26307070 passed with stages
in 6 minutes and 32 seconds
......@@ -4,6 +4,7 @@ import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Objects;
import java.util.stream.Stream;
import org.openstreetmap.josm.tools.HttpClient;
import org.openstreetmap.josm.tools.Utils;
import org.wikipedia.api.ApiQuery;
......@@ -11,6 +12,7 @@ import org.wikipedia.api.ApiUrl;
import org.wikipedia.api.SerializationSchema;
import org.wikipedia.api.wdq.json.SparqlResult;
import org.wikipedia.tools.RegexUtil;
import org.wikipedia.tools.WikiProperties;
public class WdqApiQuery<T> extends ApiQuery<T> {
private static final String[] TICKET_KEYWORDS = {"wikidata", "QueryService"};
......@@ -37,22 +39,28 @@ public class WdqApiQuery<T> extends ApiQuery<T> {
}
/**
* @param items the items for which we check if they are instances of {@code x}
* or instances of any subclass of {@code x}.
* @param x the Q-ID of an item, for which the query checks if the provided items are instances of it,
* @param items the items for which we check if they are instances of one of the items provided by {@code classes}
* or instances of any subclass of the items provided by {@code classes}.
* @param classes the Q-IDs of items, for which the query checks if the provided items are instances of it,
* or instances of subclasses of it.
* @return the API query
*/
public static WdqApiQuery<SparqlResult> findInstancesOfXOrOfSubclass(final Collection<String> items, final String x) {
public static WdqApiQuery<SparqlResult> findInstancesOfClassesOrTheirSubclasses(final Collection<String> items, final Collection<String> classes) {
Objects.requireNonNull(items);
Objects.requireNonNull(x);
if (items.size() <= 0 || !items.stream().allMatch(RegexUtil::isValidQId) || !RegexUtil.isValidQId(x)) {
throw new IllegalArgumentException("All arguments for the 'is instance of X or of subclass' check must be valid Q-IDs!");
Objects.requireNonNull(classes);
if (items.size() >= 1 && classes.size() >= 1 && Stream.concat(items.stream(), classes.stream()).allMatch(RegexUtil::isValidQId)) {
return new WdqApiQuery<>(
ApiUrl.url("https://query.wikidata.org/sparql"),
"format=json&query=" + Utils.encodeUrl(String.format(
"SELECT DISTINCT ?item ?itemLabel ?classes ?classesLabel WHERE { VALUES ?item { wd:%s }. VALUES ?classes { wd:%s }. ?item wdt:P31/wdt:P279* ?supertype. ?supertype wdt:P279* ?classes. SERVICE wikibase:label { bd:serviceParam wikibase:language \"%s\" }. }",
String.join(" wd:", items),
String.join(" wd:", classes),
WikiProperties.WIKIPEDIA_LANGUAGE.get()
)),
SparqlResult.SCHEMA
);
} else {
throw new IllegalArgumentException("All arguments for the 'is instance of classes or their subclasses' check must be one or more valid Q-IDs!");
}
return new WdqApiQuery<>(
ApiUrl.url("https://query.wikidata.org/sparql"),
"format=json&query=" + Utils.encodeUrl(String.format("SELECT DISTINCT ?item WHERE { VALUES ?item { wd:%s } ?item wdt:P31/wdt:P279* wd:%s. }", String.join(" wd:", items), x)),
SparqlResult.SCHEMA
);
}
}
package org.wikipedia.validator;
import static org.wikipedia.validator.AllValidationTests.SEE_OTHER_CATEGORY_VALIDATOR_ERRORS;
import static org.wikipedia.validator.AllValidationTests.VALIDATOR_MESSAGE_MARKER;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.openstreetmap.josm.data.osm.OsmPrimitive;
import org.openstreetmap.josm.gui.Notification;
......@@ -41,45 +44,50 @@ public class UnusualWikidataClasses extends BatchProcessedTagTest<UnusualWikidat
@Override
protected void check(List<TestCompanion> allPrimitives) {
ListUtil.processInBatches(allPrimitives, 50, batch -> {
for (final String forbiddenType : WikiProperties.WIKIDATA_VALIDATOR_UNUSUAL_CLASSES.get()) {
ListUtil.processInBatches(
// group the same wikidata values to reduce number of queries
new ArrayList<>(allPrimitives.stream().collect(Collectors.groupingBy(it -> it.wikidataValue)).entrySet()),
50,
batch -> {
try {
checkBatch(batch, forbiddenType);
final SparqlResult result = ApiQueryClient.query(WdqApiQuery.findInstancesOfClassesOrTheirSubclasses(
batch.stream().map(Map.Entry::getKey).collect(Collectors.toList()),
WikiProperties.WIKIDATA_VALIDATOR_UNUSUAL_CLASSES.get()
));
for (final List<SparqlResult.Results.Entry> row : result.getRows()) {
final String itemUrl = row.get(0).getValue();
final String itemQId = itemUrl.substring(itemUrl.lastIndexOf('/') >= 0 ? itemUrl.lastIndexOf('/') + 1 : 0);
final String classUrl = row.get(2).getValue();
final String classQId = classUrl.substring(itemUrl.lastIndexOf('/') >= 0 ? itemUrl.lastIndexOf('/') + 1 : 0);
final Collection<OsmPrimitive> primitives = batch.stream()
.filter(it -> itemQId.equals(it.getKey()))
.flatMap(it -> it.getValue().stream().map(BatchProcessedTagTest.TestCompanion::getPrimitive))
.collect(Collectors.toList());
if (primitives.size() >= 1) {
errors.add(
AllValidationTests.WIKIDATA_TAG_HAS_UNUSUAL_TYPE.getBuilder(this)
.message(AllValidationTests.VALIDATOR_MESSAGE_MARKER + I18n.tr("Wikidata value is of unusual type for the wikidata=* tag on OSM objects"),
I18n.marktr("{0} is an instance of {1} (or any subclass thereof)"),
row.get(1).getValue() + " (" + itemQId + ")",
row.get(3).getValue() + " (" + classQId + ")"
)
.primitives(primitives)
.build()
);
}
}
} catch (IOException e) {
errors.add(
AllValidationTests.API_REQUEST_FAILED.getBuilder(this)
.primitives(batch.stream().map(BatchProcessedTagTest.TestCompanion::getPrimitive).collect(Collectors.toList()))
.message(AllValidationTests.VALIDATOR_MESSAGE_MARKER + e.getMessage())
.primitives(batch.stream().flatMap(it -> it.getValue().stream().map(BatchProcessedTagTest.TestCompanion::getPrimitive)).collect(Collectors.toList()))
.message(VALIDATOR_MESSAGE_MARKER + e.getMessage())
.build()
);
finalNotification = NETWORK_FAILED_NOTIFICATION;
}
}
});
}
private void checkBatch(final Collection<TestCompanion> batch, final String forbiddenType) throws IOException {
final SparqlResult result = ApiQueryClient.query(WdqApiQuery.findInstancesOfXOrOfSubclass(batch.stream().map(it -> it.wikidataValue).collect(Collectors.toList()), forbiddenType));
for (List<SparqlResult.Results.Entry> row : result.getRows()) {
final String entityURL = row.get(0).getValue();
final String qID = entityURL.substring(entityURL.lastIndexOf('/') >= 0 ? entityURL.lastIndexOf('/') + 1 : 0);
final Collection<OsmPrimitive> primitives = batch.stream()
.filter(it -> qID.equals(it.wikidataValue))
.map(BatchProcessedTagTest.TestCompanion::getPrimitive)
.collect(Collectors.toList());
if (primitives.size() >= 1) {
errors.add(
AllValidationTests.WIKIDATA_TAG_HAS_UNUSUAL_TYPE.getBuilder(this)
.primitives(primitives)
.message(
"Wikidata value is of unusual type for the wikidata=* tag on OSM objects",
I18n.marktr("{0} is an instance of {1} (or any subclass thereof)"),
qID,
forbiddenType
)
.build()
);
}
}
);
}
static class TestCompanion extends BatchProcessedTagTest.TestCompanion {
......
......@@ -6,7 +6,9 @@ import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.Rule;
import org.junit.Test;
import org.openstreetmap.josm.testutils.JOSMTestRules;
......@@ -63,24 +65,16 @@ public class WdqApiQueryTest {
@Test
public void test() throws IOException {
final SparqlResult bridgeResult = ApiQueryClient.query(WdqApiQuery.findInstancesOfXOrOfSubclass(MIXED_LIST, BRIDGE_CLASS));
bridgeResult.getRows().forEach(row -> assertEquals(1, row.size()));
for (final String bridge : BRIDGE_LIST) {
assertEquals("Bridge " + bridge + " not found in the result!", 1, bridgeResult.getRows().stream().filter(it -> ("http://www.wikidata.org/entity/" + bridge).equals(it.get(0).getValue())).count());
}
assertEquals(BRIDGE_LIST.size(), bridgeResult.size());
assertTrue(bridgeResult.getRows().stream().allMatch(row -> "uri".equals(row.get(0).getType())));
testFindInstancesOfClassesOrTheirSubclasses(MIXED_LIST, Arrays.asList(BRIDGE_CLASS, BUILDING_CLASS), MIXED_LIST);
testFindInstancesOfClassesOrTheirSubclasses(MIXED_LIST, Collections.singletonList(BRIDGE_CLASS), BRIDGE_LIST);
testFindInstancesOfClassesOrTheirSubclasses(MIXED_LIST, Collections.singletonList(BUILDING_CLASS), BUILDING_LIST);
}
final SparqlResult buildingResult = ApiQueryClient.query(WdqApiQuery.findInstancesOfXOrOfSubclass(MIXED_LIST, BUILDING_CLASS));
buildingResult.getRows().forEach(row -> assertEquals(1, row.size()));
for (final String building : BUILDING_LIST) {
assertEquals(
"Building " + building + " not found in the result!",
1,
buildingResult.getRows().stream().filter(row -> ("http://www.wikidata.org/entity/" + building).equals(row.get(0).getValue())).count()
);
private void testFindInstancesOfClassesOrTheirSubclasses(final Collection<String> itemList, final Collection<String> classesList, final Collection<String> expectedResultList) throws IOException {
final SparqlResult result = ApiQueryClient.query(WdqApiQuery.findInstancesOfClassesOrTheirSubclasses(itemList, classesList));
for (final String expectedEntry : expectedResultList) {
assertEquals("Entry " + expectedEntry + " not found in the result!", 1, result.getRows().stream().filter(row -> ("http://www.wikidata.org/entity/" + expectedEntry).equals(row.get(0).getValue())).count());
}
assertEquals(BUILDING_LIST.size(), buildingResult.size());
assertTrue(buildingResult.getRows().stream().allMatch(row -> "uri".equals(row.get(0).getType())));
assertEquals(expectedResultList.size(), result.size());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment