Commit 8adf8b2e authored by jhargrave's avatar jhargrave

add PostSegmentationSimplifierWithConfigIT test for...

simplifier rules across formats
parent a900c9c1
package net.sf.okapi.simplifier.integration;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.annotation.SimplifierRulesAnnotaton;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.resource.Custom;
/**
 * Pipeline step that records the {@link SimplifierRulesAnnotaton} looked up on the
 * resource of each custom event it handles. Its own description marks it as a
 * debugging and testing aid.
 */
public class CustomEventStep extends BasePipelineStep {

    /**
     * Result of the most recent annotation lookup performed in
     * {@link #handleCustom(Event)}; {@code null} until a custom event is handled.
     */
    public SimplifierRulesAnnotaton rules = null;

    /** @return the fixed name of this step. */
    @Override
    public String getName() {
        return "CustomEventStep";
    }

    /** @return a short human-readable description of this step. */
    @Override
    public String getDescription() {
        return "CustomEventStep: For debugging and testing only.";
    }

    /**
     * Stores the simplifier-rules annotation of the event's {@link Custom} resource,
     * then delegates to the base implementation.
     *
     * @param event the custom event; its resource is cast to {@link Custom}
     * @return whatever the superclass handler returns
     */
    @Override
    protected Event handleCustom(Event event) {
        // Every custom event overwrites the previously stored value.
        rules = ((Custom) event.getResource()).getAnnotation(SimplifierRulesAnnotaton.class);
        return super.handleCustom(event);
    }

    /**
     * @return the rules string of the stored annotation, or {@code null} when no
     *         annotation is currently stored
     */
    public String getRules() {
        return (rules == null) ? null : rules.getRules();
    }
}
package net.sf.okapi.simplifier.integration;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertNotNull;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ErrorCollector;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.filters.InputDocument;
import net.sf.okapi.common.filters.RoundTripComparison;
import net.sf.okapi.filters.html.HtmlFilter;
import net.sf.okapi.filters.idml.IDMLFilter;
import net.sf.okapi.filters.xliff.XLIFFFilter;
import net.sf.okapi.filters.xml.XMLFilter;
import net.sf.okapi.steps.common.codesimplifier.PostSegmentationCodeSimplifierStep;
import net.sf.okapi.steps.segmentation.Parameters;
import net.sf.okapi.steps.segmentation.SegmentationStep;
/**
 * Integration tests that push sample documents of several formats through
 * {@link RoundTripComparison#executeCompare} with a segmentation step, a
 * {@link CustomEventStep} and a {@link PostSegmentationCodeSimplifierStep}, using
 * filter configurations that carry simplifier rules.
 * <p>
 * Each test asserts that the comparison reports success and that the
 * {@link CustomEventStep} ended up holding a non-null rules string.
 */
public class PostSegmentationSimplifierWithConfigIT {
    private static final LocaleId EN = new LocaleId("en", "us");
    private static final LocaleId ESES = new LocaleId("es", "es");

    /** Filter configurations (one per format) that declare simplifier rules. */
    private static final String HTML_CONFIG = "html_with_simplifier_rules.yml";
    private static final String XML_CONFIG = "okf_xml@with-simplifier-rules.fprm";
    private static final String XLIFF_CONFIG = "okf_xliff@with-simplifier-rules.fprm";
    private static final String IDML_CONFIG = "okf_idml@with-simplifier-rules.fprm";

    /** Directory (with trailing slash) that holds the test documents. */
    private String pathBase;
    private SegmentationStep segmentationStep;
    /** Step that captures the simplifier-rules annotation seen during the run. */
    private CustomEventStep ces;

    @Rule
    public ErrorCollector errCol = new ErrorCollector();

    /**
     * Locates the test-data directory and builds a fresh segmentation step
     * (EN source, es-ES target, default.srx for both sides) and a fresh
     * {@link CustomEventStep} for every test.
     */
    @Before
    public void setUp() throws Exception {
        pathBase = Util.getDirectoryName(PostSegmentationSimplifierWithConfigIT.class
                .getResource("/net/sf/okapi/common/codesimplifier/test1.xlf").getPath()) + "/";

        segmentationStep = new SegmentationStep();
        segmentationStep.setSourceLocale(EN);
        List<LocaleId> tl = new LinkedList<>();
        tl.add(ESES);
        segmentationStep.setTargetLocales(tl);

        Parameters params = (Parameters) segmentationStep.getParameters();
        params.setSegmentSource(true);
        params.setSegmentTarget(true);
        params.setSourceSrxPath(PostSegmentationSimplifierWithConfigIT.class.getClassLoader()
                .getResource("default.srx").getPath());
        params.setTargetSrxPath(PostSegmentationSimplifierWithConfigIT.class.getClassLoader()
                .getResource("default.srx").getPath());
        params.setCopySource(false);
        segmentationStep.handleEvent(Event.START_BATCH_ITEM_EVENT);

        ces = new CustomEventStep();
    }

    @After
    public void tearDown() throws Exception {
        // Nothing to release; kept so the fixture lifecycle stays explicit.
    }

    /**
     * Shared body of every test: builds the input list, runs the comparison with
     * the three pipeline steps and checks both the comparison result and that
     * simplifier rules were captured.
     *
     * @param rtc          comparison driver to use (callers choose how it is constructed)
     * @param filter       filter used to read the documents
     * @param targetLocale target locale handed to the comparison
     * @param filterConfig filter configuration applied to every document
     * @param fileNames    document names, resolved against {@link #pathBase}
     */
    private void assertRoundTripWithRules(RoundTripComparison rtc, IFilter filter, LocaleId targetLocale,
            String filterConfig, String... fileNames) {
        ArrayList<InputDocument> docs = new ArrayList<InputDocument>();
        for (String fileName : fileNames) {
            docs.add(new InputDocument(pathBase + fileName, filterConfig));
        }
        assertTrue("Round-trip comparison failed for config " + filterConfig,
                rtc.executeCompare(filter, docs, "UTF-8", EN, targetLocale, "out",
                        segmentationStep,
                        ces,
                        new PostSegmentationCodeSimplifierStep()));
        assertNotNull("No simplifier rules were captured for config " + filterConfig, ces.getRules());
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction() {
        assertRoundTripWithRules(new RoundTripComparison(), new HtmlFilter(), ESES, HTML_CONFIG,
                "test1.html");
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction2() {
        assertRoundTripWithRules(new RoundTripComparison(), new HtmlFilter(), ESES, HTML_CONFIG,
                "aa324.html");
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtractionReferences() {
        assertRoundTripWithRules(new RoundTripComparison(), new HtmlFilter(), ESES, HTML_CONFIG,
                "references_as_codes.html");
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtractionMergedCodes() {
        assertRoundTripWithRules(new RoundTripComparison(), new HtmlFilter(), ESES, HTML_CONFIG,
                "merged_codes.html");
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction3() {
        assertRoundTripWithRules(new RoundTripComparison(), new HtmlFilter(), ESES, HTML_CONFIG,
                "form.html");
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtractionDita() {
        assertRoundTripWithRules(new RoundTripComparison(), new XMLFilter(), ESES, XML_CONFIG,
                "dita.xml");
    }

    // @Test is required next to @Ignore: without it JUnit 4 does not see this
    // method as a test at all, so it would vanish from reports instead of being
    // listed as ignored.
    @SuppressWarnings("resource")
    @Test
    @Ignore("Only Fails becuase of wassegmented property difference")
    public void testDoubleExtraction4() {
        assertRoundTripWithRules(new RoundTripComparison(), new XLIFFFilter(), ESES, XLIFF_CONFIG,
                "BinUnitTest01.xlf",
                "JMP-11-Test01.xlf",
                "Manual-12-AltTrans.xlf",
                "test1.xlf");
    }

    @SuppressWarnings("resource")
    @Test
    public void testDoubleExtraction5() {
        // This case constructs the comparison with 'false' and uses EN as the
        // target locale as well, exactly as the original test did.
        assertRoundTripWithRules(new RoundTripComparison(false), new IDMLFilter(), EN, IDML_CONFIG,
                "idmltest.idml");
    }
}
# Rule types that drive HTML parser behavior
# INLINE inline element
# GROUP group element
# EXCLUDE exclude this element and all children
# INCLUDE exceptions to the exclude rules
# TEXTUNIT make this element a textunit with skeleton before/after
# PRESERVE_WHITESPACE turn on preserve whitespace.
# SCRIPT Embedded scripting language - pass to another extractor
# SERVER Embedded server language tags such as JSP, PHP, Mason etc.
# ATTRIBUTE_TRANS, ATTRIBUTE_WRITABLE, ATTRIBUTE_READONLY, ATTRIBUTE_ID these rules list an attribute, not an element
# ATTRIBUTES_ONLY only attribute is translatable or localizable
#*********************************************************************************************
# Operators for attribute value compare
#
# Rules are of the form:
# TO_EXTRACT_ATTRIBUTE:[IF_HAS_ATTRIBUTE, OPERATOR, VALUE]
# 'content':['http-equiv', EQUALS, 'keywords']
#
# This rule would read:
# extract the value of 'content' if the value of 'http-equiv' equals 'keywords'
#
# Multiple attribute values may be included in a list:
# 'content':['http-equiv', EQUALS, ['content-language', 'content-type']]
#
# This rule would be read:
# extract the value of 'content' if the value of 'http-equiv' equals 'content-language' or 'content-type'
#*******************************************************************************************/
# EQUALS
# NOT_EQUALS
# MATCH regex match. Must match the entire attribute value
# ELEMENT AND ATTRIBUTE NAMES MUST BE LOWER CASED!!!!!!
assumeWellformed: false
preserve_whitespace: false
simplifierRules: |
# if ADDABLE or DELETABLE or CLONEABLE;
if DATA = "<br/>" or DATA = "<font>" or DATA = "</font>" or DATA = "</a>";
if DATA ~ "\\<font.+" or DATA ~ "\\<img.+" or DATA ~ "\\<a.+";
attributes:
# attributes that occur on many elements
dir:
ruleTypes: [ATTRIBUTE_WRITABLE]
allElementsExcept: [base, basefront, head, html, meta, param, script]
title:
ruleTypes: [ATTRIBUTE_TRANS]
allElementsExcept: [base, basefront, head, html, meta, param, script, title]
lang:
ruleTypes: [ATTRIBUTE_WRITABLE]
'xml:lang':
ruleTypes: [ATTRIBUTE_WRITABLE]
#id:
# ruleTypes: [ATTRIBUTE_ID]
# onlyTheseElements: [address, dt, h1, h2, h3, h4, h5, h6, legend, li, marquee, p, pre, td, th]
elements:
# only attributes are localizable or translatable - no PCDATA
# ATTRIBUTES_ONLY with translatableAttributes implies the tag will be a TEXTUNIT
# with embedded skeleton
meta:
ruleTypes: [ATTRIBUTES_ONLY]
translatableAttributes: {content: [[http-equiv, EQUALS, keywords], [name, EQUALS, [keywords, description]]]}
writableLocalizableAttributes: {content: [http-equiv, EQUALS, [content-language, content-type]], charset}
readOnlyLocalizableAttributes: {content: [name, EQUALS, [generator, author, progid, date]]}
area:
ruleTypes: [ATTRIBUTES_ONLY]
translatableAttributes: [accesskey, area, alt]
isindex:
ruleTypes: [ATTRIBUTES_ONLY]
translatableAttributes: [prompt]
option:
ruleTypes: [ATTRIBUTES_ONLY]
translatableAttributes: [label, value]
optgroup:
ruleTypes: [ATTRIBUTES_ONLY]
translatableAttributes: [label]
# complex TextUnit rules
address:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
dt:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
h1:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
h2:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
h3:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
h4:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
h5:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
h6:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
legend:
ruleTypes: [TEXTUNIT]
translatableAttributes: [accesskey]
idAttributes: [id]
li:
ruleTypes: [TEXTUNIT]
translatableAttributes: [value]
idAttributes: [id]
marquee:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
p:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
elementType: paragraph
pre:
ruleTypes: [TEXTUNIT, PRESERVE_WHITESPACE]
idAttributes: [id]
td:
ruleTypes: [TEXTUNIT]
translatableAttributes: [abbr]
idAttributes: [id]
th:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
translatableAttributes: [abbr]
title:
ruleTypes: [TEXTUNIT]
idAttributes: [id]
# inline tags
a:
ruleTypes: [INLINE]
elementType: link
translatableAttributes: [title, accesskey]
writableLocalizableAttributes: [href]
abbr:
ruleTypes: [INLINE]
acronym:
ruleTypes: [INLINE]
acronym:
ruleTypes: [INLINE]
applet:
ruleTypes: [INLINE]
translatableAttributes: [alt]
acronym:
ruleTypes: [INLINE]
b:
ruleTypes: [INLINE]
elementType: bold
bdo:
ruleTypes: [INLINE]
big:
ruleTypes: [INLINE]
blink:
ruleTypes: [INLINE]
br:
ruleTypes: [INLINE]
button:
ruleTypes: [INLINE]
translatableAttributes: [accesskey, value]
cite:
ruleTypes: [INLINE]
code:
ruleTypes: [INLINE]
del:
ruleTypes: [INLINE]
dfn:
ruleTypes: [INLINE]
em:
ruleTypes: [INLINE]
embed:
ruleTypes: [INLINE]
font:
ruleTypes: [INLINE]
i:
ruleTypes: [INLINE]
elementType: italic
iframe:
ruleTypes: [INLINE]
img:
ruleTypes: [INLINE]
elementType: image
translatableAttributes: [title, alt]
writableLocalizableAttributes: [href, src]
input:
ruleTypes: [INLINE]
translatableAttributes:
alt: [type, NOT_EQUALS, [file, hidden, image, Password]]
value: [type, NOT_EQUALS, [file, hidden, image, Password]]
accesskey: [type, NOT_EQUALS, [file, hidden, image, Password]]
title: [type, NOT_EQUALS, [file, hidden, image, Password]]
placeholder: [type, NOT_EQUALS, 'dummy']
ins:
ruleTypes: [INLINE]
acronym:
ruleTypes: [INLINE]
kbd:
ruleTypes: [INLINE]
label:
ruleTypes: [INLINE]
translatableAttributes: [accesskey]
map:
ruleTypes: [INLINE]
nobr:
ruleTypes: [INLINE]
object:
ruleTypes: [INLINE]
translatableAttributes: [standby]
param:
ruleTypes: [INLINE]
translatableAttributes: [value]
q:
ruleTypes: [INLINE]
s:
ruleTypes: [INLINE]
samp:
ruleTypes: [INLINE]
small:
ruleTypes: [INLINE]
select:
ruleTypes: [INLINE]
span:
ruleTypes: [INLINE]
spacer:
ruleTypes: [INLINE]
strike:
ruleTypes: [INLINE]
strong:
ruleTypes: [INLINE]
sub:
ruleTypes: [INLINE]
sup:
ruleTypes: [INLINE]
symbol:
ruleTypes: [INLINE]
table:
ruleTypes: [ATTRIBUTES_ONLY]
translatableAttributes: [summary]
textarea:
ruleTypes: [INLINE]
translatableAttributes: [accesskey]
tt:
ruleTypes: [INLINE]
u:
ruleTypes: [INLINE]
elementType: underlined
var:
ruleTypes: [INLINE]
wbr:
ruleTypes: [INLINE]
# Ruby inline tags
ruby:
ruleTypes: [INLINE]
rb:
ruleTypes: [INLINE]
rt:
ruleTypes: [INLINE]
rc:
ruleTypes: [INLINE]
rp:
ruleTypes: [INLINE]
rbc:
ruleTypes: [INLINE]
rtc:
ruleTypes: [INLINE]
# Robo help inline tags
symbol:
ruleTypes: [INLINE]
face:
ruleTypes: [INLINE]
# Excluded elements
'.*':
ruleTypes: [EXCLUDE]
conditions: [translate, EQUALS, 'no']
style:
ruleTypes: [EXCLUDE]
stylesheet:
ruleTypes: [EXCLUDE]
# # Included elements
# '.*':
# ruleTypes: [INCLUDE]
# conditions: [translate, EQUALS, 'yes']
# javascript etc.
script:
ruleTypes: [EXCLUDE]
\ No newline at end of file
#v1
extractNotes.b=true
simplifyCodes.b=true
simplifierRules=if ADDABLE or DELETABLE or CLONEABLE; if DATA ~ ".+";
\ No newline at end of file
#v1
useCustomParser.b=true
factoryClass=com.ctc.wstx.stax.WstxInputFactory
fallbackToID.b=false
escapeGT.b=false
addTargetLanguage.b=true
overrideTargetLanguage.b=false
outputSegmentationType.i=0
ignoreInputSegmentation.b=false
addAltTrans.b=false
addAltTransGMode.b=true
editAltTrans.b=false
includeExtensions.b=true
includeIts.b=true
balanceCodes.b=true
allowEmptyTargets.b=false
targetStateMode.i=0
targetStateValue=needs-translation
alwaysUseSegSource.b=false
quoteModeDefined.b=true
quoteMode.i=0
useSdlXliffWriter.b=false
simplifierRules=if ADDABLE or DELETABLE or CLONEABLE; if DATA ~ ".+";
<?xml version="1.0" encoding="UTF-8"?>
<its:rules xmlns:its="http://www.w3.org/2005/11/its" version="1.0" xmlns:itsx="http://www.w3.org/2008/12/its-extensions" xmlns:okp="okapi-framework:xmlfilter-options">
<!-- See ITS specification at: http://www.w3.org/TR/its/ -->
<its:translateRule selector="//*" translate="yes"/>
<its:withinTextRule selector="//codeph" withinText="yes"/>
<its:withinTextRule selector="//ph" withinText="yes"/>
<okp:simplifierRules>
if ADDABLE or DELETABLE or CLONEABLE; if DATA ~ ".+";
</okp:simplifierRules>
</its:rules>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment