Commit df74f3f5 authored by jhargrave

add PostSegmentationSimplifierIT and test resources

parent acfb328c
......@@ -63,9 +63,7 @@ public final class IntegrationtestUtils {
} else {
dir = new File(ClassUtil.getResourceParent(IntegrationtestUtils.class, resourcePath));
}
System.err.println("PATH: "+dir);
FilenameFilter filter = new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
......
package net.sf.okapi.simplifier.integration;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ErrorCollector;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.filters.InputDocument;
import net.sf.okapi.common.filters.RoundTripComparison;
import net.sf.okapi.filters.html.HtmlFilter;
import net.sf.okapi.filters.idml.IDMLFilter;
import net.sf.okapi.filters.xliff.XLIFFFilter;
import net.sf.okapi.steps.common.codesimplifier.PostSegmentationCodeSimplifierStep;
import net.sf.okapi.steps.segmentation.Parameters;
import net.sf.okapi.steps.segmentation.SegmentationStep;
/**
 * Integration tests that run sample documents through
 * {@link RoundTripComparison#executeCompare} with a pipeline made of a
 * {@link SegmentationStep} followed by a {@link PostSegmentationCodeSimplifierStep}.
 *
 * <p>All input documents are resolved relative to the directory that contains the
 * {@code /net/sf/okapi/common/codesimplifier/test1.xlf} classpath resource.
 */
public class PostSegmentationSimplifierIT {
    private static final LocaleId EN = new LocaleId("en", "us");
    private static final LocaleId ESES = new LocaleId("es", "es");

    /** Directory (with trailing slash) holding the test documents; set in {@link #setUp()}. */
    private String pathBase;

    /** Segmentation step shared by every test; rebuilt before each test in {@link #setUp()}. */
    private SegmentationStep segmentationStep;

    @Rule
    public ErrorCollector errCol = new ErrorCollector();

    /**
     * Resolves the test-data directory and configures the segmentation step
     * (en-US source, es-ES target, source and target both segmented with
     * {@code default.srx}).
     *
     * @throws Exception if a required classpath resource cannot be resolved
     */
    @Before
    public void setUp() throws Exception {
        pathBase = Util.getDirectoryName(PostSegmentationSimplifierIT.class.getResource("/net/sf/okapi/common/codesimplifier/test1.xlf").getPath()) + "/";

        segmentationStep = new SegmentationStep();
        segmentationStep.setSourceLocale(EN);
        List<LocaleId> tl = new LinkedList<>();
        tl.add(ESES);
        segmentationStep.setTargetLocales(tl);

        // The same SRX rules file is used for both the source and the target.
        String srxPath = PostSegmentationSimplifierIT.class.getClassLoader().getResource("default.srx").getPath();
        Parameters params = (Parameters) segmentationStep.getParameters();
        params.setSegmentSource(true);
        params.setSegmentTarget(true);
        params.setSourceSrxPath(srxPath);
        params.setTargetSrxPath(srxPath);

        segmentationStep.handleEvent(Event.START_BATCH_ITEM_EVENT);
    }

    @After
    public void tearDown() throws Exception {
    }

    /**
     * Builds the list of input documents for the given file names, each resolved
     * against {@link #pathBase} and created with a {@code null} second argument.
     *
     * @param fileNames names of files located in the test-data directory
     * @return a new list with one {@link InputDocument} per file name, in order
     */
    private ArrayList<InputDocument> inputDocuments(String... fileNames) {
        ArrayList<InputDocument> list = new ArrayList<InputDocument>();
        for (String fileName : fileNames) {
            list.add(new InputDocument(pathBase + fileName, null));
        }
        return list;
    }

    /** Round-trips {@code test1.html} through the HTML filter. */
    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction() {
        RoundTripComparison rtc = new RoundTripComparison();
        assertTrue(rtc.executeCompare(new HtmlFilter(), inputDocuments("test1.html"), "UTF-8", EN, ESES, "out",
                segmentationStep,
                new PostSegmentationCodeSimplifierStep()));
    }

    /** Round-trips {@code aa324.html} through the HTML filter. */
    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction2() {
        RoundTripComparison rtc = new RoundTripComparison();
        assertTrue(rtc.executeCompare(new HtmlFilter(), inputDocuments("aa324.html"), "UTF-8", EN, ESES, "out",
                segmentationStep,
                new PostSegmentationCodeSimplifierStep()));
    }

    /** Round-trips {@code form.html} through the HTML filter. */
    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction3() {
        RoundTripComparison rtc = new RoundTripComparison();
        assertTrue(rtc.executeCompare(new HtmlFilter(), inputDocuments("form.html"), "UTF-8", EN, ESES, "out",
                segmentationStep,
                new PostSegmentationCodeSimplifierStep()));
    }

    /**
     * Round-trips the XLIFF samples through the XLIFF filter. Currently disabled;
     * see the {@code @Ignore} reason.
     */
    @SuppressWarnings("resource")
    // @Test is required: with @Ignore alone JUnit 4 does not discover the method at all,
    // so it would silently disappear from reports instead of being listed as ignored.
    @Test
    @Ignore("Only Fails becuase of wassegmented property difference")
    public void testDoubleExtraction4() {
        ArrayList<InputDocument> list = inputDocuments(
                "BinUnitTest01.xlf",
                "JMP-11-Test01.xlf",
                "Manual-12-AltTrans.xlf",
                "test1.xlf");
        RoundTripComparison rtc = new RoundTripComparison();
        assertTrue(rtc.executeCompare(new XLIFFFilter(), list, "UTF-8", EN, ESES, "out",
                segmentationStep,
                new PostSegmentationCodeSimplifierStep()));
    }

    /**
     * Round-trips {@code idmltest.idml} through the IDML filter. Unlike the other
     * tests this one constructs {@code RoundTripComparison} with {@code false} and
     * uses EN as both source and target locale.
     */
    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction5() {
        RoundTripComparison rtc = new RoundTripComparison(false);
        assertTrue(rtc.executeCompare(new IDMLFilter(), inputDocuments("idmltest.idml"), "UTF-8", EN, EN, "out",
                segmentationStep,
                new PostSegmentationCodeSimplifierStep()));
    }
}
<?xml version="1.0" encoding="UTF-8" ?>
<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">
<file datatype="x-test" original="test1"
source-language="en" target-language="fr">
<body>
<trans-unit id="0" translate="no">
<source xml:lang="en">Do not translate this.</source>
</trans-unit>
<trans-unit id="1">
<source xml:lang="en">Text number 1</source>
</trans-unit>
<bin-unit id="img1" mime-type="image">
<bin-source>
<external-file href="image1.png"/>
</bin-source>
<trans-unit id="img1-1">
<source>Text of image 1</source>
</trans-unit>
</bin-unit>
</body>
</file>
</xliff>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- ============================================= -->
<!-- GDF Application Strings -->
<!-- Copyright © 2006 Gandalf Inc. All Rights Reserved. -->
<!-- ============================================= -->
<!-- FILE HISTORY -->
<!-- Wed Jan 10 14:22:22 2007: ./xliff-deflate.pl - extracted from GDF.xlf, target-lang=fr, alt-lang=(de) -->
<!-- Mon Jan 8 19:08:28 2007: ./xliff-add.pl - added 506 strings from collect/WinOS.xlf -->
<!-- ============================================= -->
<xliff version="1.1"
xmlns="urn:oasis:names:tc:xliff:document:1.1"
xmlns:gdf-xstr="http://www.gdf.com/xmlns/gdf-xstr.xsd">
<file source-language="en" target-language="fr" original="GDF.xlf" datatype="plaintext" xml:space="preserve">
<header>
<gdf-xstr:version>25</gdf-xstr:version>
<gdf-xstr:lang>de;en;fr;ja;ko;zh-CN</gdf-xstr:lang>
</header>
<body>
<group resname="!Specials-test=&lt;&quot;&apos;&amp;>">
<trans-unit id="S_XStr_LangCode">
<source>en</source>
<target>fr</target>
<note>This language's ISO-639 code</note>
<alt-trans>
<target xml:lang="de">de</target>
</alt-trans>
</trans-unit>
<trans-unit id="S_XStr_Language">
<source>English</source>
<target>French</target>
<note>This language's name in English</note>
<alt-trans>
<target xml:lang="de">German</target>
</alt-trans>
</trans-unit>
<trans-unit id="S_XStr_LangNative">
<source>English</source>
<target>Français</target>
<note>This language's name in its own language</note>
<alt-trans>
<target xml:lang="de">Deutsch</target>
</alt-trans>
</trans-unit>
<trans-unit id="S_XStr_Charset">
<source>utf-8</source>
<target>utf-8</target>
<alt-trans>
<target xml:lang="de">utf-8</target>
</alt-trans>
</trans-unit>
</group>
<group resname="LinuxOS">
<group resname="Qt">
<group resname="ActivationDlg.cpp">
<trans-unit id="S_LinActivationDlg_Verify_proxy_settings_or_connect_xxx" restype="x-misc">
<source>Verify proxy settings or connection to internet.&lt;br></source>
<target>Vérifiez les paramètres proxy ou la connexion à Internet.&lt;br></target>
<alt-trans>
<target xml:lang="de">Prüfen Sie die Proxy-Einstellungen oder die Verbindung zum Internet.&lt;br></target>
</alt-trans>
</trans-unit>
<trans-unit id="S_LinActivationDlg_2__Request_Code__xxx" restype="x-misc">
<source>2. Request Code:</source>
<target>2. Code de requête :</target>
<alt-trans>
<target xml:lang="de">2. Request-Code:</target>
</alt-trans>
</trans-unit>
<trans-unit id="TestAltTrans" restype="x-misc">
<source>File</source>
<target>Fichier</target>
<alt-trans>
<target xml:lang="de">Datei</target>
</alt-trans>
<alt-trans>
<target xml:lang="de">Ordner</target>
</alt-trans>
<alt-trans>
<source xml:lang="en">Open File</source>
<target xml:lang="fr">Ouvrir le fichier</target>
</alt-trans>
<alt-trans>
<source>Open File2</source>
<target xml:lang="fr">Ouvrir le fichier2</target>
</alt-trans>
</trans-unit>
<trans-unit id="S_LinActivationDlg__p_Tech_Support_will_then_supply_xxx" restype="x-misc">
<source>&lt;p>3. Operating System Type: &amp;lt Linux&amp;gt &lt;p>&lt;p>Tech Support will then supply a 20 digit Response Code that you will need to type into the empty boxes below. If you have access to e-mail, but don't wish to use Internet Activation, you may send the Request Code to GDF Technical Support at support@gdf.com. Tech Support will reply with the Response Code, which needs to be entered into the edit boxes.&lt;p></source>
<target>&lt;p>3. Type de système d'exploitation : &amp;lt Linux&amp;gt &lt;p>&lt;p>L'assistance technique fournit alors un code de réponse à 20 chiffres que vous devez saisir dans les cadres vides qui se trouvent ci-dessous. Si vous avez accès au courrier électronique, mais ne voulez pas utiliser l'activation Internet, vous pouvez envoyer le code de requête à l'adresse de l'assistance technique GDF : support@gdf.com. L'assistance technique répondra avec le code de réponse à saisir dans les boîtes d'édition.&lt;p></target>
<alt-trans>
<target xml:lang="de">&lt;p>3. Die Art des Betriebssystems: &amp;lt Linux&amp;gt &lt;p>&lt;p>Der technische Support gibt Ihnen dann einen 20-stelligen Response-Code, den Sie in die folgenden leeren Felder eintragen müssen. Wenn Sie E-Mail-Zugang haben, aber keine Internetaktivierung durchführen möchten, können Sie den Request-Code auch per E-Mail an den technischen Support von GDF schicken: support@gdf.com. Der technische Support schickt Ihnen dann eine Antwort mit dem Response-Code, den Sie in die Felder eintragen müssen.&lt;p></target>
</alt-trans>
</trans-unit>
</group>
<group resname="testGrp2">
<trans-unit id="S_Lininstaller_To_preserve_the_GDF_submenu_in_t2_xxx" restype="x-misc">
<source>To preserve the GDF submenu in the Gnome Foot-menu
it is necessary to copy files
to standard directories which are root protected.
Thus, the root password will be required to preserve
the GDF submenu in future versions
of Gnome.
Would you like to preserve the GDF submenu?</source>
<target>Pour conserver le sous-menu GDF dans le menu de pied de Gnome,
il est nécessaire de copier les fichiers dans des répertoires standard qui sont protégés à la racine.
Par conséquent, le mot de passe racine sera demandé pour conserver
le sous-menu GDF dans les versions futures de Gnome.
Souhaitez-vous conserver le sous-menu GDF ?</target>
<alt-trans>
<target xml:lang="de">Um das GDF-Untermenü im Gnome-Foot-Menü
zu erhalten, müssen die Dateien in
Standardverzeichnisse kopiert werden, deren Stammverzeichnis geschützt ist.
Deshalb ist das Kennwort für das Stammverzeichnis erforderlich, um das GDF-
Untermenü in zukünftigen Versionen von
Gnome zu erhalten.
Möchten Sie das GDF-Untermenü beibehalten?</target>
</alt-trans>
</trans-unit>
</group></group></group>
</body>
</file>
</xliff>
<?xml version="1.0" encoding="UTF-8" ?>
<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">
<file datatype="x-test" original="test1"
source-language="en" target-language="fr">
<body>
<trans-unit id="6">
<source xml:lang="en">Text <ph id='1'>startCode<sub>[nested<ph id='2'>ph-in-sub</ph>still in sub]</sub>endCode</ph> text.</source>
</trans-unit>
<trans-unit id="0" translate="no">
<source xml:lang="en">Do not translate this.</source>
</trans-unit>
<trans-unit id="1">
<source xml:lang="en">Text number 1</source>
</trans-unit>
<trans-unit id="2">
<source xml:lang="en">Text number 2</source>
<target xml:lang="fr">Texte numéro 2.</target>
</trans-unit>
<trans-unit id="3" approved="yes">
<source xml:lang="en">Text number 3</source>
<target xml:lang="fr">Texte numéro 3</target>
</trans-unit>
<trans-unit id="4">
<source xml:lang="en">Text number four</source>
<alt-trans>
<source xml:lang="en">The text number four.</source>
<target xml:lang="fr">Le texte numéro quatre.</target>
</alt-trans>
</trans-unit>
<trans-unit id="5">
<source xml:lang="en">Text <ph id='1'>startCode<sub>nested</sub>endCode</ph> text.</source>
</trans-unit>
</body>
</file>
</xliff>
\ No newline at end of file
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head>
<title>Personal Details</title>
<meta name="content-language" content="en" />
<link rel="stylesheet" type="text/css" href="main.css" />
</head>
<body>
<h1>Personal Details</h1>
<form name="MyForm" action="/servlet/YourResponseRecorder" method="post">
<input class="control" type="hidden" name="AuthenticationCode" value="pW5va9LGfxtYJ">
<table>
<tr>
<td>Name:</td>
<td><input class="control" name="Name" size="40"></td>
</tr>
<tr>
<td>Title:</td>
<td>
<select class="control" name="Title" size="1">
<option value="none" selected>Please select...</option>
<option>Lord</option>
<option>Lt General Sir</option>
<option>Major General</option>
<option>Professor</option>
<option>Rt Honourable</option>
<option>Prime Minister</option>
<option>Master &amp; Commander</option>
<option>Grand Poobah</option>
</select>
</td>
</tr>
<tr>
<td>Email Address:</td>
<td><input class="control" type="text" name="Email" size="40" /></td>
</tr>
<tr>
<td>Password:</td>
<td><input class="control" type="password" name="Password" size="40" value="secret"/></td>
</tr>
<tr>
<td>Member:</td>
<td><input type="checkbox" name="Member" checked /></td>
</tr>
<tr>
<td>Address:</td>
<td><textarea class="control" name="Address" rows="4" cols="45"></textarea></td>
</tr>
<tr>
<td>Mailing List Subscriptions:</td>
<td>
<input type="checkbox" name="MailingList" value="A"> Announcements<br>
<input type="checkbox" name="MailingList" value="B"> General<br>
<input type="checkbox" name="MailingList" value="C"> Cheap Viagra Offers<br>
<input type="checkbox" name="MailingList" value="D"> Anatomical Enlargements
</td>
</tr>
<tr>
<td>Favourite Fare:</td>
<td>
<input type="radio" name="FavouriteFare" value="spam" checked="checked" /> Spam
<input type="radio" name="FavouriteFare" value="rhubarb" /> Rhubarb
<input type="radio" name="FavouriteFare" value="honey" /> Honey
<input type="radio" name="FavouriteFare" value="rum" /> Rum
</td>
</tr>
<tr>
<td>Favourite Sports:</td>
<td>
<select class="control" name="FavouriteSports" multiple>
<option value="BB">Baseball
<option value="CR">Cricket
<option value="AFL">AFL
<option value="SOC">Soccer
</select>
<div class="instructions">(hold down CTRL key to select multiple items)</div>
</td>
</tr>
</table>
<p><input type="submit" value="Submit Form" name="button1"/></p>
</form>
</body>
</html>
<?xml version="1.0" encoding="UTF-8"?>
<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2" xmlns:okp="okapi-framework:xliff-extensions">
<file original="idmltest.idml" source-language="en" target-language="fr" datatype="x-application/vnd.adobe.indesign-idml-package" okp:inputEncoding="UTF-8">
<body>
<group id="spr1" resname="MasterSpreads/MasterSpread_ub4.xml" restype="x-spread">
</group>
<group id="spr2" resname="Spreads/Spread_uad.xml" restype="x-spread">
<group id="sto1" resname="uc2" restype="x-story">
<trans-unit id="uc2-1">
<source xml:lang="en">This is a first sentence.<x id="2"/>And a second line in that same paragraph.<x id="3"/>and a third one after Ctrl+Enter<x id="4"/>\=bs, &amp;=amp, &lt;=lt, “=quot, ^=caret<x id="5"/>Text in <x id="6"/>a different font<x id="7"/>.</source>
<target xml:lang="fr">This is a first sentence.<x id="2"/>And a second line in that same paragraph.<x id="3"/>and a third one after Ctrl+Enter<x id="4"/>\=bs, &amp;=amp, &lt;=lt, “=quot, ^=caret<x id="5"/>Text in <x id="6"/>a different font<x id="7"/>.</target>
</trans-unit>
</group>
<group id="sto2" resname="uf4" restype="x-story">
<trans-unit id="uf4-2">
<source xml:lang="en">Cell 1</source>
<target xml:lang="fr">Cell 1</target>
</trans-unit>
<trans-unit id="uf4-3">
<source xml:lang="en">Cell 2</source>
<target xml:lang="fr">Cell 2</target>
</trans-unit>
<trans-unit id="uf4-4">
<source xml:lang="en">Last cell.<x id="2"/>Second line of last cell.</source>
<target xml:lang="fr">Last cell.<x id="2"/>Second line of last cell.</target>
</trans-unit>
<trans-unit id="uf4-1">
<source xml:lang="en">Text before the table.<x id="2"/>Text after the table.<x id="3"/>Before the note <x id="4"/> and after it.<x id="5"/>Variables: <x id="6"/>=filename. =tab, <x id="7"/>=curPageNum, “=dlq and ”=drq.<x id="8"/> text</source>
<target xml:lang="fr">Text before the table.<x id="2"/>Text after the table.<x id="3"/>Before the note <x id="4"/> and after it.<x id="5"/>Variables: <x id="6"/>=filename. =tab, <x id="7"/>=curPageNum, “=dlq and ”=drq.<x id="8"/> text</target>
</trans-unit>
</group>
</group>
</body>
</file>
</xliff>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN">
<html><head>
<!--change title here-->
<!--change background color here code should be <body bgcolor="#FFFFFF"> for white
or replace the letters for whatever color you want-->
<body>University of Professor Peter Duesberg is the pre-eminent virologist in the world today and one of the top, if not the top, molecular biologists. <br /> <br /> Unfortunately, he has a political problem. <br /> <br /> He doesn&#39;t accept the still-unproven (and some say dubious) theory that the group of diseases known as AIDS is caused by HIV. <br /> <br /> If &quot;science&quot; is wrong about the cause of AIDS, then finding a cure is impossible.<br /> <br /> This may explain why a cure has not been found in spite of billions of dollars that has been raised and spent on AIDS &quot;research.&quot; <br /> <br /> But the waste of time and effort is only the tip of the iceberg.<br /> <br /> There may be something more sinister afoot. <br /> <br /> With a little understanding, the AIDS &quot;help&quot; trumpeted for Africa (by George Bush no less) and low income, minority families in the US looks less like medicine and more like a genocide program. <br /> <br /> Who&#39;s behind this scam?<br /> <br /> The answer may surprise you.<br /> <br /> Note: The t-shirted &quot;surfer dude&quot; at the end of the first video is Kary Mullis. <br /> <br /> He won the Nobel Prize winner for developing the polymerase chain reaction (PCR), one of the fundamental tools of biomedicine and hailed as one of the monumental scientific techniques of the 20th century.<br /> <br /> Not everyone buys the AIDS story as told by the media, the US government and the pharmaceutical companies. <br /> <br /> How this scam plays out in Africa: <br /> <br /> <A HREF="http://www.brasschecktv.com/page/277.html">African Genocide</A></body>
</html>
<?xml version="1.0" encoding="UTF-8"?><xliff xmlns="urn:oasis:names:tc:xliff:document:1.2" xmlns:xhtml="http://www.w3.org/1999/xhtml" version="1.2"><file datatype="plaintext" source-language="en-US" original="orig" target-language="es-es"><header><tool tool-company="company" tool-name="tool" tool-id="tool_id"></tool></header><body><group id="group1"><group restype="table" id="15-2" xhtml:class="TableBorder" xhtml:width="500" xhtml:border="1" xhtml:cellpadding="5" xhtml:height="300"><group restype="x-html-tbody" id="15-3"><group restype="row" id="15-14"><trans-unit xml:space="preserve" id="15-16"><source xml:lang="en-US"><g id="15-15" ctype="x-html-td">text<x id="15-17" ctype="lb"/></source><target xml:lang="es-es">text</target>
</trans-unit></group></group></group></group></body></file></xliff>
\ No newline at end of file
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN">
<html><head>
<!--change title here-->
<!--change background color here code should be <body bgcolor="#FFFFFF"> for white
or replace the letters for whatever color you want-->
<body>
University of Professor Peter Duesberg is the pre-eminent virologist in the world today and one of the top, if not the top, molecular biologists. <br />
<br />
Unfortunately, he has a political problem. <br />
<br />
He doesn't accept the still-unproven (and some say dubious) theory that the group of diseases known as AIDS is caused by HIV. <br />
<br />
If "science" is wrong about the cause of AIDS, then finding a cure is impossible.<br />
<br />
This may explain why a cure has not been found in spite of billions of dollars that has been raised and spent on AIDS "research." <br />
<br />
But the waste of time and effort is only the tip of the iceberg.<br />
<br />
There may be something more sinister afoot. <br />
<br />
With a little understanding, the AIDS "help" trumpeted for Africa (by George Bush no less) and low income, minority families in the US looks less like medicine and more like a genocide program. <br />
<br />
Who's behind this scam?<br />
<br />
The answer may surprise you.<br />
<br />
Note: The t-shirted "surfer dude" at the end of the first video is Kary Mullis. <br />
<br />
He won the Nobel Prize winner for developing the polymerase chain reaction (PCR), one of the fundamental tools of biomedicine and hailed as one of the monumental scientific techniques of the 20th century.<br />
<br />
Not everyone buys the AIDS story as told by the media, the US government and the pharmaceutical companies. <br />
<br />
How this scam plays out in Africa: <br />
<br />
<A HREF="http://www.brasschecktv.com/page/277.html">African Genocide</A>
</body>
</html>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><xliff xmlns="urn:oasis:names:tc:xliff:document:1.2" version="1.2" xmlns:xhtml="http://www.w3.org/1999/xhtml"><file datatype="plaintext" source-language="en-US" original="orig"><header><tool tool-company="company" tool-name="tool" tool-id="tool_id"/></header><body><group id="group1"><group restype="table" id="15-2" xhtml:class="TableBorder" xhtml:width="500" xhtml:border="1" xhtml:cellpadding="5" xhtml:height="300"><group restype="x-html-tbody" id="15-3"><group restype="row" id="15-14"><trans-unit xml:space="preserve" id="15-16"><source xml:lang="en-US"><g id="15-15" ctype="x-html-td">text<x id="15-17" ctype="lb"/>
</g></source></trans-unit></group></group></group></group></body></file></xliff>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment