Commit a5064077 authored by Arne Koehn

[sentence-selector] can decide whether train or test should be extracted first from a corpus

parent 087e65b6
......@@ -21,6 +21,7 @@ sub main {
my $startsentence;
my $numsentences;
my $pick_random;
my $test_first;
my $help;
my $man;
......@@ -29,6 +30,7 @@ sub main {
'testfile=s' => \$testfilename,
'goldfile=s' => \$goldfilename,
'numtrain=i'=> \$numsentences,
'testfirst' => \$test_first,
'random'=> \$pick_random,
'help|?' => \$help,
'man' => \$man
......@@ -48,7 +50,7 @@ sub main {
open(my $trainfile, '>', $trainfilename);
__PACKAGE__->process($inputfile, $trainfile, $goldfile, $testfile,
$numsentences, $pick_random, 1);
$numsentences, $pick_random, 1, $test_first);
}
......@@ -61,7 +63,7 @@ sub process {
my $numsentences = shift;
my $pick_random = shift;
my $verbose = shift;
my $test_first = shift;
my $line = "1";
# skip the header
until ($line eq "#EOT SECEDGETAG\n") {
......@@ -93,15 +95,24 @@ sub process {
print STDERR "done reading $n sentences\n" if $verbose;
print STDERR "Wrting $numsentences sentences to trainfile...\n" if $verbose;
print $trainfile @sentences[0..$numsentences-1];
print STDERR "Writing the goldfile...\n" if $verbose;
print $goldfile @sentences[$numsentences..$n];
print STDERR "Wrting the other sentences to testfile...\n" if $verbose;
# delete the right answer - no cheating allowed!
foreach my $f (@sentences[$numsentences..$n]) { $f =~ s/ .*//g }
print $testfile @sentences[$numsentences..$n];
if ($test_first) {
print $trainfile @sentences[$n-$numsentences+1..$n];
print STDERR "Writing the goldfile...\n" if $verbose;
print $goldfile @sentences[0..$n-$numsentences];
print STDERR "Wrting the other sentences to testfile...\n" if $verbose;
# delete the right answer - no cheating allowed!
foreach my $f (@sentences[0..$n-$numsentences]) { $f =~ s/ .*//g }
print $testfile @sentences[0..$n-$numsentences];
}
else {
print $trainfile @sentences[0..$numsentences-1];
print STDERR "Writing the goldfile...\n" if $verbose;
print $goldfile @sentences[$numsentences..$n];
print STDERR "Wrting the other sentences to testfile...\n" if $verbose;
# delete the right answer - no cheating allowed!
foreach my $f (@sentences[$numsentences..$n]) { $f =~ s/ .*//g }
print $testfile @sentences[$numsentences..$n];
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment