Commit f50aeb39 authored by Arne Köhn

add baseline tagger

parent 54a92060
#! /usr/bin/env perl
# Copyright (C) 2010 by Arne Köhn
# Do what you want to do with this. If you need a "proper" license,
# have a look at LICENSE.
# assumes lines with tagged text (word and tag separated by whitespace).
# extracts lexicon of form:
# word t1
# where t1 is the most frequently seen tag for word. Other taggings
# seen for the word are counted but not written to the lexicon.
use strict;
use warnings;

# Build a baseline lexicon from tagged text read via <> (the files named on
# the command line, or STDIN when there are none).
#
# Input:  one token per line, "word tag", separated by whitespace.
# Output: one line "word tag" per distinct word, where tag is the tag seen
#         most often for that word.  Words are printed in sorted order and
#         ties between equally frequent tags are broken alphabetically, so
#         the output is reproducible from run to run.

# $count_of{word}{tag} = number of times word was seen with tag.
my %count_of;

LINE:
while (my $line = <>) {
    my ($word, $tag) = split ' ', $line;

    # Blank or whitespace-only lines (e.g. sentence separators) and lines
    # that carry no tag cannot contribute a lexicon entry.  Skipping them
    # keeps lines with a missing tag field out of the output.
    next LINE if !defined $tag;

    ++$count_of{$word}{$tag};
}

for my $word (sort keys %count_of) {
    my $tags = $count_of{$word};

    # Highest count first; alphabetical order decides between equal counts.
    my ($best_tag) =
        sort { $tags->{$b} <=> $tags->{$a} or $a cmp $b } keys %{$tags};

    print "$word $best_tag\n";
}
#! /bin/sh
# usage: test [modelname] [inputfile] [outputfile]
#
# Runs tag.py with the lexicon models/modelname (located next to this
# script), reading tokens from inputfile and writing the tagged tokens to
# outputfile.
# NOTE: the script changes into its own directory before running the tagger,
# so relative inputfile/outputfile paths are resolved against that directory.

if [ $# -lt 3 ]; then
    echo "usage: $0 [modelname] [inputfile] [outputfile]" >&2
    exit 2
fi

# Absolute path of this script: cd into the directory part of $0 (if any)
# and append the file name part.
abspath="$(cd "${0%/*}" 2>/dev/null; echo "$PWD"/"${0##*/}")"
WORKDIR=$(dirname "$abspath")

# Quote every expansion so paths containing spaces or glob characters
# survive, and stop if the script directory cannot be entered.
cd "$WORKDIR" || exit 1
python tag.py "$WORKDIR/models/$1" < "$2" > "$3"
#! /usr/bin/env python
# -*- coding:utf-8 -*-
# simple baseline tagger
# first argument: The lexicon (word SPACE tag\n ...)
# optional second argument: the default tag for unknown words
# Copyright (C) 2010 by Arne Köhn
# Feel free to do what you want to do with it.
#
# Reads one token per line from stdin and writes "token tag" per line to
# stdout; empty input lines are passed through as empty output lines.
# Runs unchanged under Python 2 and Python 3.
import sys


def load_lexicon(path):
    """Read the lexicon file at ``path`` into a dict mapping word -> tag.

    Each useful line holds a word followed by its tag, separated by
    whitespace.  Lines with fewer than two fields (for instance blank lines)
    are skipped, and any fields after the first tag are ignored.  If a word
    occurs on several lines, the last line wins.
    """
    lexicon = {}
    with open(path) as lexfile:
        for entry in lexfile:
            fields = entry.split()
            if len(fields) < 2:
                continue
            lexicon[fields[0]] = fields[1]
    return lexicon


def tag_stream(lexicon, fallbacktag, instream, outstream):
    """Tag every line of ``instream`` and write the result to ``outstream``.

    A line is stripped of surrounding whitespace and looked up in
    ``lexicon``; words that are not found get ``fallbacktag``.  Lines that
    are empty after stripping produce an empty output line.
    """
    # readline() is called line by line (rather than iterating the file
    # object) so that each input line is handled as soon as it is read.
    for line in iter(instream.readline, ""):
        word = line.strip()
        if word:
            outstream.write(word + " " + lexicon.get(word, fallbacktag) + "\n")
        else:
            outstream.write("\n")


def main(argv):
    """Command line entry point; returns the process exit status."""
    if len(argv) < 2:
        sys.stderr.write("usage: %s lexicon [fallbacktag]\n" % argv[0])
        return 2
    fallbacktag = "NN"
    if len(argv) > 2:
        fallbacktag = argv[2]
    tag_stream(load_lexicon(argv[1]), fallbacktag, sys.stdin, sys.stdout)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#! /bin/sh
# usage: train [trainfile] [modelname]
# where modelname is the name of this experiment
#
# Runs make-lexicon.pl on trainfile and stores the resulting lexicon as
# models/modelname next to this script.
# NOTE: the script changes into its own directory before running, so a
# relative trainfile path is resolved against that directory.

if [ $# -lt 2 ]; then
    echo "usage: $0 [trainfile] [modelname]" >&2
    exit 2
fi

# Absolute path of this script: cd into the directory part of $0 (if any)
# and append the file name part.
abspath="$(cd "${0%/*}" 2>/dev/null; echo "$PWD"/"${0##*/}")"
WORKDIR=$(dirname "$abspath")

# Quote every expansion so paths containing spaces or glob characters
# survive, and stop if the script directory cannot be entered.
cd "$WORKDIR" || exit 1

# -p makes this a no-op when the directory already exists.
mkdir -p models || exit 1
perl make-lexicon.pl > "$WORKDIR/models/$2" < "$1"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment