...
  View open merge request
Commits (9)

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

......@@ -220,9 +220,35 @@ public class CustomAnalysisEngineFactory {
You can add as many factory methods as you want to any factory class with the pattern `*EngineFactory.java`. The maven
plugin expects methods returning an object of AnalysisEngineDescription without parameters. Additionally, the factory
class must have a default constructor.
class must have a default constructor.
## Use The Best-Performing NLP Pipeline in Your Application
### Exclude NLP Steps from your Pipeline
Sometimes you do not want to test every tool that is available through the Maven dependencies. For this case, it is possible to exclude specific NLP tools.
For example, when you include OpenNLP and StanfordNLP, every available tool combination will be trained and evaluated in order to determine the best-performing NLP pipeline. However, if you do not want to train and evaluate the OpenNlpSegmenter, you can exclude it through the plugin configuration.
```xml
<build>
<plugins>
<plugin>
<groupId>de.schrieveslaach.nlpf</groupId>
<artifactId>nlp-maven-plugin</artifactId>
<version>1.1.0-SNAPSHOT</version>
<extensions>true</extensions>
<configuration>
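<excludedTrainers>
<excludedTrainer>de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter</excludedTrainer>
</excludedTrainers>
</configuration>
</plugin>
</plugins>
</build>
```
The exclusion has to match the implementation name of the tool. The above example excludes the `OpenNlpSegmenter`.
A trailing `*` acts as a wildcard, e.g. `de.tudarmstadt.ukp.dkpro.core.opennlp.*` excludes all tools whose implementation name contains `de.tudarmstadt.ukp.dkpro.core.opennlp.`.
You can add as many exclusions as you want in the `excludedTrainers` configuration.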
## Use The Best-Performing NLP Pipeline in Your Application
When you deployed your best-performing NLP pipeline to your Maven repository,
you can add your pipeline as Maven dependency to your project which requires
......
......@@ -23,12 +23,15 @@ package de.schrieveslaach.nlpf.maven.plugin;
*/
import org.apache.maven.plugin.testing.MojoRule;
import org.apache.maven.project.MavenProject;
import org.junit.Rule;
import org.junit.Test;
import java.io.File;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.io.FileMatchers.anExistingDirectory;
import static org.hamcrest.io.FileMatchers.anExistingFile;
import static org.junit.Assert.assertThat;
......@@ -56,4 +59,14 @@ public class TrainMojoIT extends BaseMojoIT {
assertThat(modelFile, is(anExistingFile()));
}
/**
 * Runs the {@code train} goal on the sample project that configures an excluded trainer
 * and verifies that no Stanford NLP model directory was produced.
 */
@Test
public void shouldNotTrain_TrainerExcludedInPom() throws Exception {
    // given: the sample project with example CAS files in place
    File projectDir = getTestProjectBaseDir("/sample-project-with-excluded-trainer");
    storeExampleCasFiles(projectDir);

    // when
    rule.executeMojo(projectDir, "train");

    // then: the directory for the Stanford models must be absent
    File stanfordModelDir = new File(
            projectDir,
            "target/models/de.company/domain-specific-corpus/de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl/");
    assertThat(stanfordModelDir, is(not(anExistingDirectory())));
}
}
<!--
========================LICENSE_START=================================
nlp-maven-plugin
%%
Copyright (C) 2017 Schrieveslaach
%%
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Lesser Public License for more details.
You should have received a copy of the GNU General Lesser Public
License along with this program. If not, see
<http://www.gnu.org/licenses/lgpl-3.0.html>.
=========================LICENSE_END==================================
-->
<project>
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of this sample corpus project. -->
<groupId>de.company</groupId>
<artifactId>domain-specific-corpus</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<language>en</language>
<named.entity.types>person,organization</named.entity.types>
</properties>
<build>
<plugins>
<plugin>
<groupId>de.schrieveslaach.nlpf</groupId>
<artifactId>nlp-maven-plugin</artifactId>
<version>1.1.0-SNAPSHOT</version>
<extensions>true</extensions>
<configuration>
<!-- Trainers listed here are meant to be skipped by the plugin's train goal. -->
<!-- NOTE(review): this value is the artifactId of the Stanford NLP dependency below plus ".*". -->
<!-- Confirm that it matches the names the plugin compares exclusions against. -->
<excludedTrainers>
<excludedTrainer>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl.*</excludedTrainer>
</excludedTrainers>
</configuration>
</plugin>
</plugins>
</build>
<!-- DKPro Core modules: XMI I/O plus the OpenNLP and Stanford NLP tool collections. -->
<dependencies>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.io.xmi-asl</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.opennlp-asl</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
<artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl</artifactId>
<version>1.10.0</version>
</dependency>
</dependencies>
</project>
package de.company;
/*-
* ========================LICENSE_START=================================
* nlp-maven-plugin
* %%
* Copyright (C) 2017 Schrieveslaach
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Lesser Public License for more details.
*
* You should have received a copy of the GNU General Lesser Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/lgpl-3.0.html>.
* =========================LICENSE_END==================================
*/
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
/**
 * Sample factory that contributes a custom analysis engine description.
 * It relies on the implicit default constructor and exposes a parameterless
 * factory method returning an {@link AnalysisEngineDescription}.
 */
public class CustomAnalysisEngineFactory {

    /**
     * Creates the description of a {@link StanfordSegmenter} whose
     * {@code PARAM_LANGUAGE_FALLBACK} parameter is set to {@code "en"}.
     *
     * @return the engine description for the configured segmenter
     * @throws Exception if the description cannot be created
     */
    public AnalysisEngineDescription createStanfordSegmenter() throws Exception {
        AnalysisEngineDescription segmenter = createEngineDescription(
                StanfordSegmenter.class,
                StanfordSegmenter.PARAM_LANGUAGE_FALLBACK, "en");
        return segmenter;
    }
}
......@@ -25,6 +25,7 @@
<groupId>de.company</groupId>
<artifactId>domain-specific-corpus</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>nlp-models</packaging>
<properties>
<language>en</language>
......
......@@ -42,6 +42,7 @@ import lombok.Getter;
import lombok.SneakyThrows;
import org.apache.maven.plugin.AbstractMojo;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.plugins.annotations.Parameter;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.descriptor.TypeCapability;
......
......@@ -52,6 +52,7 @@ import javax.inject.Inject;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static de.schrieveslaach.nlpf.maven.plugin.JCasDataUtil.copyJCas;
......@@ -67,6 +68,12 @@ public class TrainMojo extends AbstractMojo {
@Parameter(property = "default.document.language", defaultValue = "en")
private String language;
/**
 * Trainers that must not be trained. A trainer is skipped when its annotator
 * implementation name contains one of these entries; an entry may end with
 * {@code *} as a wildcard (e.g. {@code de.company.*}).
 * <p>
 * Configured in the POM as
 * {@code <excludedTrainers><excludedTrainer>...</excludedTrainer></excludedTrainers>}.
 * When the parameter is not set, no trainer is excluded.
 */
@Parameter
private List<String> excludedTrainers;
@Inject
private AnalysisEngineDescriptionService analysisEngineService;
......@@ -113,10 +120,25 @@ public class TrainMojo extends AbstractMojo {
/**
 * Runs the training for every trainer description found by the analysis engine service.
 * The descriptions are processed as a parallel stream: excluded trainers are dropped first,
 * then {@code train} is invoked for each remaining description.
 *
 * @param trainingData the documents handed to each trainer
 * @return the descriptions that were not excluded and for which {@code train} returned {@code true}
 */
private List<AnalysisEngineDescription> runTrainerPipeline(List<JCas> trainingData) {
    return analysisEngineService
            .findTrainerDescriptions()
            .parallelStream()
            .filter(this::filterExcludedTrainers)
            .filter(trainer -> train(trainingData, trainer))
            .collect(Collectors.toList());
}
/**
 * Stream predicate deciding whether a trainer takes part in the training.
 * <p>
 * Each configured exclusion is treated as a glob in which {@code *} matches any sequence
 * of characters and every other character is taken literally. A trainer is excluded when
 * the exclusion occurs anywhere in its annotator implementation name, so an exact class
 * name and a {@code prefix.*} entry behave as a substring check on the name.
 * <p>
 * Blank or {@code null} entries are ignored instead of excluding every trainer, and
 * exclusions such as {@code *} or {@code *Trainer} no longer fail or over-match.
 *
 * @param aed the trainer description to check, may be {@code null}
 * @return {@code false} if {@code aed} is {@code null} or matches an exclusion,
 *         {@code true} otherwise
 */
private boolean filterExcludedTrainers(AnalysisEngineDescription aed) {
    if (aed == null) {
        return false;
    }
    if (excludedTrainers == null || excludedTrainers.isEmpty()) {
        return true;
    }

    String implementationName = aed.getAnnotatorImplementationName();
    if (implementationName == null) {
        // Without an implementation name there is nothing an exclusion could match;
        // keep the trainer rather than failing with a NullPointerException.
        return true;
    }

    for (String excludedTrainer : excludedTrainers) {
        if (excludedTrainer == null || excludedTrainer.trim().isEmpty()) {
            // An empty entry would otherwise match every name.
            continue;
        }
        if (toExclusionPattern(excludedTrainer.trim()).matcher(implementationName).find()) {
            return false;
        }
    }
    return true;
}

/**
 * Translates an exclusion glob into a regular expression: literal parts are quoted and
 * each {@code *} becomes {@code .*}.
 */
private static Pattern toExclusionPattern(String exclusion) {
    // Limit -1 keeps leading/trailing empty segments, so "*" and "foo*" split safely.
    String[] literals = exclusion.split("\\*", -1);
    StringBuilder regex = new StringBuilder();
    for (int i = 0; i < literals.length; i++) {
        if (i > 0) {
            regex.append(".*");
        }
        regex.append(Pattern.quote(literals[i]));
    }
    return Pattern.compile(regex.toString());
}
private boolean train(List<JCas> trainingData, AnalysisEngineDescription aed) {
String analysisEngineName = analysisEngineService.getAnalysisEngineName(aed);
getLog().info("Starting to train with " + analysisEngineName);
......
......@@ -10,12 +10,12 @@ package de.schrieveslaach.nlpf.maven.plugin.service;
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Lesser Public License for more details.
*
*
* You should have received a copy of the GNU General Lesser Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/lgpl-3.0.html>.
......@@ -147,6 +147,7 @@ public class AnalysisEngineDescriptionService {
*/
public List<AnalysisEngineDescription> findAnnotatorDescriptions(List<AnalysisEngineDescription> trainerDescriptions) {
Multimap<Class, ModelParameter> modelParameterMappings = HashMultimap.create();
findAllTrainerAnnotatorPairs().stream()
// filter by given trainer descriptions
.filter(pair -> trainerDescriptions.stream()
......
......@@ -26,6 +26,7 @@ import de.schrieveslaach.nlpf.testing.annotators.MyNamedEntityRecognizer;
import de.schrieveslaach.nlpf.testing.annotators.MyPosTagger;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpNamedEntityRecognizer;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger;
import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
......@@ -35,15 +36,23 @@ import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.InjectMocks;
import org.mockito.junit.MockitoJUnitRunner;
import org.springframework.util.ReflectionUtils;
import java.io.File;
import java.lang.reflect.Field;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import static de.schrieveslaach.nlpf.plumbing.util.AnalysisEngineDescriptionUtil.hash;
import static java.util.Arrays.asList;
import static net.javacrumbs.jsonunit.JsonMatchers.jsonEquals;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.io.FileMatchers.anExistingFile;
import static org.junit.Assert.assertThat;
import static org.mockito.Mockito.when;
......
......@@ -30,6 +30,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ResourceParameter;
import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiReader;
import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTaggerTrainer;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.project.MavenProject;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
......@@ -49,10 +50,12 @@ import org.junit.runner.RunWith;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.MockitoJUnitRunner;
import org.springframework.util.ReflectionUtils;
import java.io.File;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
......@@ -184,6 +187,36 @@ public class TrainMojoTest {
assertThat(jCasOfNamedEntityTrainer, is(notNullValue()));
}
/**
 * Excludes the POS tagger trainer by its exact class name and expects that only the
 * named entity trainer received training data.
 */
@Test
public void shouldNotTrain_TrainerExcludedInConfiguration() throws Exception {
    // given: an exclusion naming exactly one of the two mocked trainers
    String excludedClassName = TestPosTaggerTrainer.class.getName();
    createExclusionInConfiguration(excludedClassName);
    mockTrainingPipeline(TestPosTaggerTrainer.class, TestNerTrainer.class);

    // when
    mojo.execute();

    // then
    assertThat(jCasOfPosTaggerTrainer, is(nullValue()));
    assertThat(jCasOfNamedEntityTrainer, is(notNullValue()));
}
/**
 * Excludes a whole package via a trailing asterisk and expects that neither of the
 * two mocked trainers received training data.
 */
@Test
public void shouldNotTrain_TrainerExcludedInConfiguration_WithAsterik() throws Exception {
    // given: a wildcard exclusion covering both mocked trainers
    String packageWildcard = "de.schrieveslaach.nlpf.maven.plugin.*";
    createExclusionInConfiguration(packageWildcard);
    mockTrainingPipeline(TestPosTaggerTrainer.class, TestNerTrainer.class);

    // when
    mojo.execute();

    // then
    assertThat(jCasOfPosTaggerTrainer, is(nullValue()));
    assertThat(jCasOfNamedEntityTrainer, is(nullValue()));
}
/**
 * Injects the given exclusions into the mojo's {@code excludedTrainers} field.
 * Reflection is used to set the configuration parameter directly on the mojo under test.
 *
 * @param classNames the exclusion entries to configure
 */
private void createExclusionInConfiguration(String... classNames) {
    Field target = ReflectionUtils.findField(mojo.getClass(), "excludedTrainers");
    target.setAccessible(true);

    List<String> exclusions = Arrays.asList(classNames);
    ReflectionUtils.setField(target, mojo, exclusions);
}
private void mockTrainingPipeline(Class... trainerClasses) throws Exception {
List<AnalysisEngineDescription> descriptions = new ArrayList<>();
for (Class trainerClass : trainerClasses) {
......
BSD 2-Clause License
For Yarn software
Copyright (c) 2016-present, Yarn Contributors. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.