Commit 8400e8e3 authored by Felix Hennig's avatar Felix Hennig

Initial commit

parents
# Experiment Data
This repository contains the manually annotated trees, the generated
trees and logfiles for the experiments described in the paper [to be
published].
## Preparation Sentences
The directory [prep_sentences](prep_sentences) contains the tree data
for the sentences used to create the initial ruleset. The sentences
are available in the CDA format used by
the [jwcdg parser](https://gitlab.com/nats/jwcdg) for use with the
AnnoViewer, which was used in the manual annotation process; as well
as in the CoNLL format.
Also included are the script used to pick the test sentences as well
as a small script used to automatically convert a few edges for easier
manual annotation.
## Validation / Test Sentences
The directory [validation_sentences](validation_sentences) contains
the 50 randomly chosen sentences used to test
the [rulefile](rulefile.tud) as
well as the log output of the program used for comparing the manually
annotated and generated files and the output of the generation process.
## Coverage Experiment
The coverage experiment data was not uploaded as it did not contain
manual annotation data. To replicate the experiment, download the
Hamburg Dependency Treebank files
from [here](http://hdl.handle.net/11022/0000-0000-7FC7), run the
transducer on part_B and check for coverage with the TrUDucer.
Log output used for the calculations in the paper:
```
16:30:36 INFO Main - 1570471 nodes converted correctly.
16:30:36 INFO Main - 234884 nodes punctuation.
16:30:36 INFO Main - 145785 nodes not converted.
```
#! /bin/bash
# Pick a small sample of sentences per dependency relation.
#
# Reads one deprel per line from $listOfDeprels, finds every sentence
# file in $dirOfSentences containing an edge with that label, and copies
# the first $sentsPerDeprel of them into $targetDir. Intermediate
# per-deprel file lists are kept in $tempDir for inspection.
set -euo pipefail

sentsPerDeprel=3
listOfDeprels=deprels.list
dirOfSentences=part_A/
targetDir=test_sentences/
tempDir=tmp/

mkdir -p -- "$tempDir" "$targetDir"

# read -r: iterate the deprel list safely (no word-splitting/globbing,
# unlike `for x in $(cat ...)`).
while IFS= read -r deprel; do
  [[ -n "$deprel" ]] || continue   # skip blank lines
  echo "Looking at $deprel"
  # Files mentioning this deprel; strip the ":match" suffix grep adds,
  # deduplicate. `|| true` keeps a deprel with zero matches from
  # aborting the script under set -e/pipefail.
  grep -r "'SYN' -> '$deprel'" "$dirOfSentences" \
    | sed 's/:.*//g' | sort -u > "$tempDir/$deprel.list" || true
  # `wc -l <file` prints only the count, without the file name.
  echo "Sentences with $deprel:" "$(wc -l < "$tempDir/$deprel.list")"
  head -n "$sentsPerDeprel" -- "$tempDir/$deprel.list" \
    | xargs -I fn cp -- fn "$targetDir"
done < "$listOfDeprels"
#! /bin/bash
# Rewrite HDT dependency labels to their UD counterparts, in place,
# in every file under $dir. Each mapping is "OLD,NEW"; the sed
# expression applied is echoed before each substitution for logging.
set -euo pipefail

dir=test_sentences_ud_preprocessed/

# OLD,NEW label pairs; an array avoids the original's IFS/`set` hack
# that clobbered the script's positional parameters.
mappings=(AVZ,mark CJ,conj DET,det GMOD,nmod KON,cc PN,nmod PP,case S,root SUBJ,nsubj)

# Glob instead of parsing `ls` (handles spaces; no word-splitting).
for file in "$dir"/*; do
  [[ -f "$file" ]] || continue   # also guards the no-match literal glob
  for mapping in "${mappings[@]}"; do
    old=${mapping%,*}   # text before the comma
    new=${mapping#*,}   # text after the comma
    echo "s/-> '$old'/-> '$new'/g"
    sed -i "s/-> '$old'/-> '$new'/g" "$file"
  done
done
'hdt-s1000' : 'hdt-s1000' <->
0 1 'Mit'
'SYN' -> 'PP' -> 8 // ( riskiere )
'case' / 'dat'
'cat' / 'APPR'
'REF' -> '' -> 0
,
1 2 '"'
'SYN' -> '' -> 0
'cat' / '$('
'REF' -> '' -> 0
,
2 3 'Innovationen'
'person' / 'third'
'number' / 'pl'
'gender' / 'fem'
'cat' / 'NN'
'SYN' -> 'PN' -> 1 // ( Mit )
'base' / 'Innovation'
'REF' -> '' -> 0
,
3 4 '"'
'SYN' -> '' -> 0
'cat' / '$('
'REF' -> '' -> 0
,
4 5 'abseits'
'base' / 'abseits'
'SYN' -> 'ADV' -> 3 // ( Innovationen )
'degree' / 'positive'
'cat' / 'ADJD'
'REF' -> '' -> 0
,
5 6 'der'
'SYN' -> 'DET' -> 7 // ( W3C-Empfehlungen )
'number' / 'pl'
'case' / 'gen'
'cat' / 'ART'
'REF' -> '' -> 0
,
6 7 'W3C-Empfehlungen'
'person' / 'third'
'number' / 'pl'
'gender' / 'fem'
'cat' / 'NN'
'SYN' -> 'OBJG' -> 5 // ( abseits )
'base' / 'Empfehlung'
'REF' -> '' -> 0
,
7 8 'riskiere'
'person' / 'third'
'mood' / 'subjunctive1'
'tense' / 'present'
'SYN' -> 'S' -> 0
'cat' / 'VVFIN'
'number' / 'sg'
'base' / 'riskieren'
'REF' -> '' -> 0
,
8 9 'das'
'SYN' -> 'DET' -> 10 // ( Unternehmen )
'case' / 'nom'
'gender' / 'neut'
'cat' / 'ART'
'number' / 'sg'
'REF' -> '' -> 0
,
9 10 'Unternehmen'
'person' / 'third'
'number' / 'sg'
'case' / 'nom_dat_acc'
'gender' / 'neut'
'cat' / 'NN'
'SYN' -> 'SUBJ' -> 8 // ( riskiere )
'base' / 'Unternehmen'
'REF' -> '' -> 0
,
10 11 'noch'
'SYN' -> 'ADV' -> 13 // ( kompliziertere )
'subcat' / 'focus'
'cat' / 'ADV'
'REF' -> '' -> 0
,
11 12 'weit'
'base' / 'weit'
'SYN' -> 'ADV' -> 13 // ( kompliziertere )
'degree' / 'positive'
'cat' / 'ADJD'
'REF' -> '' -> 0
,
12 13 'kompliziertere'
'number' / 'pl'
'base' / 'komplizieren'
'cat' / 'ADJA'
'SYN' -> 'ATTR' -> 14 // ( Browser-Inkompatibilitäten )
'degree' / 'comparative'
'case' / 'nom_acc'
'REF' -> '' -> 0
'flexion' / 'strong'
,
13 14 'Browser-Inkompatibilitäten'
'person' / 'third'
'SYN' -> 'OBJA' -> 8 // ( riskiere )
'cat' / 'NN'
'pattern' / 'NN'
'base' / 'unknown'
'REF' -> '' -> 0
,
14 15 'als'
'SYN' -> 'KOM' -> 8 // ( riskiere )
'cat' / 'KOKOM'
'REF' -> '' -> 0
,
15 16 'die'
'SYN' -> 'DET' -> 18 // ( bestehenden )
'number' / 'pl'
'case' / 'acc'
'cat' / 'ART'
'REF' -> '' -> 0
,
16 17 'bereits'
'SYN' -> 'ADV' -> 18 // ( bestehenden )
'subcat' / 'temporal'
'cat' / 'ADV'
'REF' -> '' -> 0
,
17 18 'bestehenden'
'SYN' -> 'CJ' -> 15 // ( als )
'base' / 'bestehen'
'cat' / 'ADJA'
'number' / 'pl'
'degree' / 'positive'
'flexion' / 'weak_mixed'
'REF' -> '' -> 0
,
18 19 '.'
'SYN' -> '' -> 0
'cat' / '$.'
'REF' -> '' -> 0
;
'hdt-s100000' : 'hdt-s100000' <->
0 1 'Das'
'SYN' -> 'DET' -> 2 // ( Thema )
'case' / 'nom'
'gender' / 'neut'
'cat' / 'ART'
'number' / 'sg'
'REF' -> '' -> 0
,
1 2 'Thema'
'person' / 'third'
'number' / 'sg'
'case' / 'nom_dat_acc'
'gender' / 'neut'
'cat' / 'NN'
'SYN' -> 'SUBJ' -> 3 // ( werde )
'base' / 'Thema'
'REF' -> '' -> 0
,
2 3 'werde'
'person' / 'third'
'mood' / 'subjunctive1'
'tense' / 'present'
'SYN' -> 'S' -> 0
'cat' / 'VAFIN'
'number' / 'sg'
'base' / 'werden'
'REF' -> '' -> 0
,
3 4 'jedoch'
'subcat' / 'sentence'
'cat2' / 'KON'
'SYN' -> 'ADV' -> 13 // ( kommen )
'cat' / 'ADV'
'REF' -> '' -> 0
,
4 5 'in'
'SYN' -> 'PP' -> 13 // ( kommen )
'case' / 'dat'
'cat' / 'APPR'
'REF' -> '' -> 0
,
5 6 'dieser'
'number' / 'sg'
'case' / 'dat'
'gender' / 'fem'
'cat' / 'PDAT'
'SYN' -> 'DET' -> 7 // ( Legislaturperiode )
'base' / 'dies'
'REF' -> '' -> 0
,
6 7 'Legislaturperiode'
'person' / 'third'
'number' / 'sg'
'gender' / 'fem'
'cat' / 'NN'
'SYN' -> 'PN' -> 5 // ( in )
'base' / 'Periode'
'REF' -> '' -> 0
,
7 8 'sicherlich'
'SYN' -> 'ADV' -> 13 // ( kommen )
'subcat' / 'sentence'
'cat' / 'ADV'
'REF' -> '' -> 0
,
8 9 'wieder'
'SYN' -> 'ADV' -> 13 // ( kommen )
'subcat' / 'temporal'
'cat' / 'ADV'
'REF' -> '' -> 0
,
9 10 'auf'
'SYN' -> 'PP' -> 13 // ( kommen )
'case' / 'acc'
'cat' / 'APPR'
'REF' -> '' -> 0
,
10 11 'den'
'SYN' -> 'DET' -> 12 // ( Tisch )
'case' / 'acc'
'gender' / 'masc'
'cat' / 'ART'
'number' / 'sg'
'REF' -> '' -> 0
,
11 12 'Tisch'
'person' / 'third'
'number' / 'sg'
'case' / 'nom_dat_acc'
'gender' / 'masc'
'cat' / 'NN'
'SYN' -> 'PN' -> 10 // ( auf )
'base' / 'Tisch'
'REF' -> '' -> 0
,
12 13 'kommen'
'base' / 'kommen'
'SYN' -> 'AUX' -> 3 // ( werde )
'cat' / 'VVINF'
'REF' -> '' -> 0
,
13 14 '.'
'SYN' -> '' -> 0
'cat' / '$.'
'REF' -> '' -> 0
;
'hdt-s100001' : 'hdt-s100001' <->
0 1 'Immer'
'SYN' -> 'ADV' -> 3 // ( ist )
'subcat' / 'temporal'
'cat' / 'ADV'
'REF' -> '' -> 0
,
1 2 'noch'
'SYN' -> 'ADV' -> 1 // ( Immer )
'subcat' / 'focus'
'cat' / 'ADV'
'REF' -> '' -> 0
,
2 3 'ist'
'person' / 'third'
'mood' / 'indicative'
'tense' / 'present'
'SYN' -> 'S' -> 0
'cat' / 'VAFIN'
'number' / 'sg'
'base' / 'sein'
'REF' -> '' -> 0
,
3 4 'unklar'
'base' / 'unklar'
'SYN' -> 'PRED' -> 3 // ( ist )
'degree' / 'positive'
'cat' / 'ADJD'
'REF' -> '' -> 0
,
4 5 ','
'SYN' -> '' -> 0
'cat' / '$,'
'REF' -> '' -> 0
,
5 6 'wieviel'
'SYN' -> 'OBJA' -> 18 // ( abgeben )
'cat' / 'PWS'
'REF' -> '' -> 0
,
6 7 'Bulmahns'
'person' / 'third'
'SYN' -> 'GMOD' -> 8 // ( Ministerium )
'case' / 'gen'
'cat' / 'NE'
'pattern' / 'NE'
'REF' -> '' -> 0
,
7 8 'Ministerium'
'person' / 'third'
'number' / 'sg'
'case' / 'nom_dat_acc'
'gender' / 'neut'
'cat' / 'NN'
'SYN' -> 'SUBJ' -> 19 // ( muß )
'base' / 'Ministerium'
'REF' -> '' -> 0
,
8 9 'vom'
'SYN' -> 'PP' -> 18 // ( abgeben )
'case' / 'dat'
'gender' / 'not_fem'
'cat' / 'APPRART'
'REF' -> '' -> 0
,
9 10 'Bereich'
'person' / 'third'
'number' / 'sg'
'case' / 'nom_dat_acc'
'gender' / 'masc'
'cat' / 'NN'
'SYN' -> 'PN' -> 9 // ( vom )
'base' / 'Bereich'
'REF' -> '' -> 0
,
10 11 'der'
'SYN' -> 'DET' -> 12 // ( Forschungs- )
'case' / 'gen'
'gender' / 'fem'
'cat' / 'ART'
'number' / 'sg'
'REF' -> '' -> 0
,
11 12 'Forschungs-'
'SYN' -> 'GMOD' -> 10 // ( Bereich )
'cat2' / 'NN'
'cat' / 'TRUNC'
'REF' -> '' -> 0
'pattern' / 'NN-Anfang'
,
12 13 'und'
'base' / 'und'
'SYN' -> 'KON' -> 12 // ( Forschungs- )
'cat' / 'KON'
'REF' -> '' -> 0
,
13 14 'Technologiepolitik'
'person' / 'third'
'number' / 'sg'
'gender' / 'fem'
'cat' / 'NN'
'SYN' -> 'CJ' -> 13 // ( und )
'base' / 'Politik'
'REF' -> '' -> 0
,
14 15 'an'
'SYN' -> 'PP' -> 18 // ( abgeben )
'case' / 'acc'
'cat' / 'APPR'
'REF' -> '' -> 0
,
15 16 'das'
'SYN' -> 'DET' -> 17 // ( Wirtschaftsministerium )
'case' / 'acc'
'gender' / 'neut'
'cat' / 'ART'
'number' / 'sg'
'REF' -> '' -> 0
,
16 17 'Wirtschaftsministerium'
'person' / 'third'
'number' / 'sg'
'case' / 'nom_dat_acc'
'gender' / 'neut'
'cat' / 'NN'
'SYN' -> 'PN' -> 15 // ( an )
'base' / 'Ministerium'
'REF' -> '' -> 0
,
17 18 'abgeben'
'base' / 'abgeben'
'SYN' -> 'AUX' -> 19 // ( muß )
'cat' / 'VVINF'
'REF' -> '' -> 0
,
18 19 'muß'
'person' / 'third'
'mood' / 'indicative'
'tense' / 'present'
'SYN' -> 'SUBJC' -> 3 // ( ist )
'cat' / 'VMFIN'
'number' / 'sg'
'base' / 'müssen'
'REF' -> '' -> 0
,
19 20 '.'
'SYN' -> '' -> 0
'cat' / '$.'
'REF' -> '' -> 0
;
'hdt-s100002' : 'hdt-s100002' <->