Commit f443e9a5 authored by Florian Schäfer

Add validator test that checks if the Wikipedia article matches the Wikidata item

For all OSM objects with both a wikipedia=* tag and a wikidata=* tag, this check compares if the Wikidata item in the wikidata=* tag matches the Wikipedia article from the wikipedia=* tag.
parent c5a37b77
......@@ -21,6 +21,7 @@ import org.wikipedia.gui.WikidataItemSearchDialog;
import org.wikipedia.gui.WikidataTagCellRenderer;
import org.wikipedia.gui.WikipediaToggleDialog;
import org.wikipedia.validator.WikidataItemExists;
import org.wikipedia.validator.WikipediaAgainstWikidata;
public final class WikipediaPlugin extends Plugin {
......@@ -42,6 +43,7 @@ public final class WikipediaPlugin extends Plugin {
DownloadDialog.addDownloadSource(new SophoxDownloadReader());
OsmValidator.addTest(WikidataItemExists.class);
OsmValidator.addTest(WikipediaAgainstWikidata.class);
}
public static String getVersionInfo() {
......
// License: GPL. For details, see LICENSE file.
package org.wikipedia.tools;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
public final class ListUtil {
    private ListUtil() {
        // Private constructor to avoid instantiation
    }

    /**
     * Splits the given list {@code fullList} into batches of a size of {@code maxBatchSize} or less and each batch is
     * then consumed by the given {@link Consumer} {@code processBatch}.
     * Each batch is handed over as a {@link List#subList(int, int)} view of {@code fullList}, in list order.
     * For an empty list neither {@code processBatch} nor {@code updateProgress} is called.
     * @param fullList the list that should be split into batches
     * @param maxBatchSize the maximum size of one batch, must be greater than zero
     * @param processBatch the consumer that is run on each batch
     * @param updateProgress is called right before each batch is processed, the first argument is the index of
     *     that batch (starting at 0), the second argument is the total number of batches
     * @param <T> the type of the list elements
     * @throws IllegalArgumentException if {@code maxBatchSize} is zero or negative
     */
    public static <T> void processInBatches(final List<T> fullList, int maxBatchSize, final Consumer<List<T>> processBatch, final BiConsumer<Integer, Integer> updateProgress) {
        if (maxBatchSize <= 0) {
            // Fail early with a clear message instead of a division by zero or an invalid subList range
            throw new IllegalArgumentException("maxBatchSize must be greater than zero, but was " + maxBatchSize);
        }
        final int numPrimitives = fullList.size();
        final int numBatches = numPrimitives / maxBatchSize + (numPrimitives % maxBatchSize == 0 ? 0 : 1);
        int batchStart = 0;
        for (int batchIndex = 0; batchStart < numPrimitives; batchIndex++) {
            // Computing the end from the remaining size cannot overflow, unlike (batchIndex + 1) * maxBatchSize
            final int batchEnd = batchStart + Math.min(maxBatchSize, numPrimitives - batchStart);
            updateProgress.accept(batchIndex, numBatches);
            processBatch.accept(fullList.subList(batchStart, batchEnd));
            batchStart = batchEnd;
        }
    }
}
\ No newline at end of file
......@@ -5,7 +5,8 @@ import java.util.regex.Pattern;
public class RegexUtil {
private static final Pattern Q_ID_PATTERN = Pattern.compile("^Q[1-9][0-9]*$");
private static final Pattern SITE_ID_PATTERN = Pattern.compile("^[a-z][a-z][a-z]?wiki");
private static final Pattern SITE_ID_PATTERN = Pattern.compile("^[a-z][a-z][a-z]?wiki$");
public static final Pattern WIKIPEDIA_TAG_VALUE_PATTERN = Pattern.compile("([a-z][a-z][a-z]?):(.+)");
private RegexUtil() {
// Private constructor to avoid instantiation
......@@ -25,4 +26,8 @@ public class RegexUtil {
/**
 * Checks whether the given value is a site ID, i.e. whether the whole value matches {@code SITE_ID_PATTERN}.
 * @param value the value to check, may be {@code null}
 * @return {@code true} if the value is not {@code null} and matches the pattern, {@code false} otherwise
 */
public static boolean isValidSiteId(final String value) {
    if (value == null) {
        return false;
    }
    return SITE_ID_PATTERN.matcher(value).matches();
}
/**
 * Checks whether the given value has the form of a wikipedia=* tag value, i.e. whether the whole value
 * matches {@link #WIKIPEDIA_TAG_VALUE_PATTERN}.
 * @param value the value to check, may be {@code null}
 * @return {@code true} if the value is not {@code null} and matches the pattern, {@code false} otherwise
 */
public static boolean isValidWikipediaTagValue(final String value) {
    if (value == null) {
        return false;
    }
    return WIKIPEDIA_TAG_VALUE_PATTERN.matcher(value).matches();
}
}
......@@ -9,9 +9,10 @@ import org.openstreetmap.josm.tools.I18n;
class AllValidationTests {
static final ValidationTest<WikidataItemExists> INVALID_QID = new ValidationTest<>(Severity.ERROR, 30_000);
static final ValidationTest<WikidataItemExists> API_REQUEST_FAILED = new ValidationTest<>(Severity.OTHER, 30_001);
static final ValidationTest<Test> API_REQUEST_FAILED = new ValidationTest<>(Severity.OTHER, 30_001);
static final ValidationTest<WikidataItemExists> WIKIDATA_ITEM_DOES_NOT_EXIST = new ValidationTest<>(Severity.ERROR, 30_002);
static final ValidationTest<WikidataItemExists> WIKIDATA_ITEM_IS_REDIRECT = new ValidationTest<>(Severity.WARNING, 30_003);
static final ValidationTest<WikipediaAgainstWikidata> WIKIDATA_ITEM_NOT_MATCHING_WIKIPEDIA = new ValidationTest<>(Severity.WARNING, 30_004);
// i18n: Prefix for the validator messages. Note the space at the end!
static final String VALIDATOR_MESSAGE_MARKER = I18n.tr("[Wiki] ");
......
// License: GPL. For details, see LICENSE file.
package org.wikipedia.validator;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import org.openstreetmap.josm.data.osm.OsmPrimitive;
import org.openstreetmap.josm.data.validation.Test;
import org.openstreetmap.josm.gui.Notification;
import org.openstreetmap.josm.gui.progress.ProgressMonitor;
import org.openstreetmap.josm.tools.I18n;
import org.wikipedia.tools.ListUtil;
/**
 * Base class for validator tests that do not check each primitive on its own, but first collect a
 * companion object for every relevant primitive and then check all collected companions at once
 * when the test ends (see {@link #endTest()}), so that subclasses can process them in batches.
 * @param <T> the type of the companion objects that are collected for the primitives
 */
public abstract class BatchProcessedTagTest<T extends BatchProcessedTagTest.TestCompanion> extends Test.TagTest {
    /**
     * A notification that is shown at the end of the test run (in {@link #endTest()}) if it is not {@code null}.
     * It is reset to {@code null} whenever a test run starts.
     */
    Notification finalNotification = null;

    /**
     * The companion objects collected by {@link #check(OsmPrimitive)} since the last call to
     * {@link #startTest(ProgressMonitor)}.
     */
    private final List<T> primitivesForBatches = new ArrayList<>();

    /**
     * @param name the name of the test, passed on to the superclass
     * @param description the description of the test, passed on to the superclass
     */
    BatchProcessedTagTest(String name, String description) {
        super(name, description);
    }

    /**
     * Creates a companion object for the given primitive, on which the test can later continue to operate.
     * E.g. if you want to perform some check on tag xyz=*, you could return here the String value of tag xyz=*.
     * When processing a batch later on you can then read that value from the companion object.
     * @param primitive a primitive for which a companion object should be created, never {@code null}
     * @return the companion object if the primitive should be checked in a batch
     *     or {@code null} if the given primitive should be excluded from the check
     */
    protected abstract T prepareTestCompanion(final OsmPrimitive primitive);

    /**
     * Updates the extra text of the progress monitor with the number of collected items and the current batch.
     * This can be used as last argument for {@link ListUtil#processInBatches(List, int, Consumer, BiConsumer)}
     * @param batchIndex the index of the currently processed batch (starting at 0)
     * @param numBatches the total number of batches that are processed
     */
    final void updateBatchProgress(int batchIndex, int numBatches) {
        progressMonitor.setExtraText(I18n.tr("({0} items, processing batch {1} of {2})", primitivesForBatches.size(), batchIndex + 1, numBatches));
    }

    /**
     * Discards the companion objects and the notification of a previous run, then starts the test.
     * @param progressMonitor the progress monitor, passed on to the superclass
     */
    @Override
    public final void startTest(ProgressMonitor progressMonitor) {
        primitivesForBatches.clear();
        finalNotification = null;
        super.startTest(progressMonitor);
    }

    /**
     * Does not check the primitive right away, it only collects the companion object for it (if any).
     * @param primitive the primitive to collect, {@code null} is ignored
     */
    @Override
    public final void check(final OsmPrimitive primitive) {
        final T testCompanion = primitive == null ? null : prepareTestCompanion(primitive);
        if (testCompanion != null) {
            primitivesForBatches.add(testCompanion);
        }
    }

    /**
     * Performs the actual check on all companion objects that were collected during this test run.
     * @param allPrimitives the companion objects of all primitives that should be checked
     */
    protected abstract void check(final List<T> allPrimitives);

    /**
     * Checks all collected companion objects, shows {@link #finalNotification} if one was set
     * and then ends the test.
     */
    @Override
    public final void endTest() {
        check(primitivesForBatches);
        if (finalNotification != null) {
            finalNotification.show();
        }
        super.endTest();
    }

    /**
     * Holds a primitive together with whatever data a subclass adds that is needed to check the primitive later on.
     */
    abstract static class TestCompanion {
        private final OsmPrimitive primitive;

        /**
         * @param primitive the primitive that this companion object belongs to
         */
        TestCompanion(final OsmPrimitive primitive) {
            this.primitive = primitive;
        }

        /**
         * @return the primitive that was passed to the constructor
         */
        final OsmPrimitive getPrimitive() {
            return primitive;
        }
    }
}
......@@ -23,6 +23,7 @@ import org.wikipedia.api.wikidata_action.json.CheckEntityExistsResult;
import org.wikipedia.api.wikidata_action.json.SerializationSchema;
import org.wikipedia.tools.RegexUtil;
// TODO: Make this a subclass of BatchProcessedTagTest!
/**
* Checks if for the wikidata=* tag on an {@link OsmPrimitive} a Wikidata item really exists.
* This check requires a working internet connection, because it queries the Wikidata Action API.
......
// License: GPL. For details, see LICENSE file.
package org.wikipedia.validator;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.openstreetmap.josm.data.osm.OsmPrimitive;
import org.openstreetmap.josm.data.preferences.sources.ValidatorPrefHelper;
import org.openstreetmap.josm.gui.Notification;
import org.openstreetmap.josm.tools.I18n;
import org.openstreetmap.josm.tools.ImageProvider;
import org.openstreetmap.josm.tools.Pair;
import org.wikipedia.api.wikidata_action.ApiQueryClient;
import org.wikipedia.api.wikidata_action.WikidataActionApiUrl;
import org.wikipedia.api.wikidata_action.json.SerializationSchema;
import org.wikipedia.tools.ListUtil;
import org.wikipedia.tools.RegexUtil;
/**
 * Validator test for primitives that have both a wikipedia=* tag and a wikidata=* tag:
 * For each such primitive the Wikidata item that the Wikipedia article is a sitelink of is queried,
 * and a validation error is added if that item is not the one from the wikidata=* tag.
 */
public class WikipediaAgainstWikidata extends BatchProcessedTagTest<WikipediaAgainstWikidata.TestCompanion> {
    /** The maximum number of primitives that are checked with one API query. */
    private static final int MAX_BATCH_SIZE = 50;

    public WikipediaAgainstWikidata() {
        super("Check wikipedia=* is interwiki link of wikidata=*", "make sure that the wikipedia=* article is connected to the wikidata=* item");
    }

    /**
     * Builds the notification that tells the user that not all primitives could be checked.
     * It is built on demand (not once in a static initializer), so the hint in the second line reflects
     * the value of {@link ValidatorPrefHelper#PREF_OTHER} at the time of the failure.
     * @return a new notification
     */
    private static Notification createNetworkFailedNotification() {
        return new Notification(
            I18n.tr("Could not check for all wikipedia=* tags if they match the wikidata=* tag.") + "\n" +
            (ValidatorPrefHelper.PREF_OTHER.get()
                ? I18n.tr("See the validator messages of the category ''Other'' for more details.")
                : I18n.tr("Turn on the informational level validator messages in the preferences to see more details.")
            )
        ).setIcon(ImageProvider.get("dialogs/wikipedia"));
    }

    /**
     * Only primitives with a valid Q-ID in the wikidata=* tag and a wikipedia=* tag that matches
     * {@link RegexUtil#WIKIPEDIA_TAG_VALUE_PATTERN} are checked.
     * @param primitive the primitive for which the companion object is created
     * @return the companion object holding language, article title and Q-ID,
     *     or {@code null} if the primitive should not be checked
     */
    @Override
    protected TestCompanion prepareTestCompanion(OsmPrimitive primitive) {
        final String wikipediaValue = primitive.get("wikipedia");
        final String wikidataValue = primitive.get("wikidata");
        if (wikipediaValue == null || !RegexUtil.isValidQId(wikidataValue)) {
            return null;
        }
        final Matcher wpMatcher = RegexUtil.WIKIPEDIA_TAG_VALUE_PATTERN.matcher(wikipediaValue);
        if (!wpMatcher.matches()) {
            return null;
        }
        // Group 1 is the language prefix, group 2 the article title
        return new TestCompanion(primitive, wpMatcher.group(1), wpMatcher.group(2), wikidataValue);
    }

    /**
     * Groups the given companion objects by language and checks each group in batches.
     * @param allPrimitives the companion objects of all primitives that should be checked
     */
    @Override
    protected void check(List<TestCompanion> allPrimitives) {
        allPrimitives.stream()
            .collect(Collectors.groupingBy(it -> it.language)) // Group by wiki-language
            .forEach((language, primitiveList) ->
                ListUtil.processInBatches(
                    primitiveList,
                    MAX_BATCH_SIZE,
                    primitiveBatch -> checkBatch(language, primitiveBatch),
                    this::updateBatchProgress
                )
            );
    }

    /**
     * Queries the Wikidata items for the article titles of one batch and adds a validation error for every
     * primitive of the batch whose wikidata=* value differs from the item that its article title belongs to.
     * If the query fails, one validation error for the whole batch is added and a notification is
     * scheduled to be shown at the end of the test.
     * @param language the language prefix that all primitives of the batch have in common
     * @param primitiveBatch the companion objects of the primitives that should be checked
     */
    private void checkBatch(final String language, final List<TestCompanion> primitiveBatch) {
        try {
            // NOTE(review): sitelinks are matched by title only. If the response also contains sitelinks of other
            // sites than (language + "wiki"), a title from another site could match by accident - confirm that
            // the query only returns sitelinks of the requested site.
            ApiQueryClient.query(
                WikidataActionApiUrl.getEntityForSitelink(language + "wiki", primitiveBatch.stream().map(it -> it.title).collect(Collectors.toList())),
                SerializationSchema.WBGETENTITIES
            ).getEntities().values().stream()
                .flatMap(entity -> entity.getSitelinks().isPresent() ? entity.getSitelinks().get().stream().map(it -> Pair.create(it, entity.getId())) : Stream.empty())
                .forEach(sitelinkAndQId ->
                    // Several primitives of one batch can reference the same article, so check every one of them
                    primitiveBatch.stream()
                        .filter(it -> sitelinkAndQId.a.getTitle().equals(it.title))
                        .filter(it -> !it.qId.equals(sitelinkAndQId.b))
                        .forEach(mismatch ->
                            errors.add(AllValidationTests.WIKIDATA_ITEM_NOT_MATCHING_WIKIPEDIA.getBuilder(this)
                                .primitives(mismatch.getPrimitive())
                                .message(
                                    AllValidationTests.VALIDATOR_MESSAGE_MARKER + I18n.tr("Wikidata item and Wikipedia article do not match!"),
                                    I18n.marktr("Wikidata item {0} is not associated with Wikipedia article {1} ({2})"),
                                    mismatch.qId,
                                    sitelinkAndQId.a.getTitle(),
                                    sitelinkAndQId.b
                                ).build())
                        )
                );
        } catch (IOException e) {
            errors.add(
                AllValidationTests.API_REQUEST_FAILED.getBuilder(this)
                    .primitives(primitiveBatch.stream().map(BatchProcessedTagTest.TestCompanion::getPrimitive).collect(Collectors.toList()))
                    .message(AllValidationTests.VALIDATOR_MESSAGE_MARKER + e.getMessage())
                    .build()
            );
            if (finalNotification == null) {
                // One notification per test run is enough, no matter how many batches fail
                finalNotification = createNetworkFailedNotification();
            }
        }
    }

    /**
     * Holds a primitive together with the parts of its wikipedia=* and wikidata=* tags that are needed for the check.
     */
    static class TestCompanion extends BatchProcessedTagTest.TestCompanion {
        /** The language prefix of the wikipedia=* value (the part before the colon). */
        final String language;
        /** The article title of the wikipedia=* value (the part after the colon). */
        final String title;
        /** The value of the wikidata=* tag. */
        final String qId;

        /**
         * @param primitive the primitive that this companion object belongs to
         * @param language the language prefix of the wikipedia=* value, must not be {@code null}
         * @param title the article title of the wikipedia=* value, must not be {@code null}
         * @param qId the value of the wikidata=* tag, must not be {@code null}
         */
        private TestCompanion(final OsmPrimitive primitive, String language, String title, final String qId) {
            super(primitive);
            this.language = Objects.requireNonNull(language);
            this.title = Objects.requireNonNull(title);
            this.qId = Objects.requireNonNull(qId);
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment