Commit 62d2505f authored by Dmitry Mozzherin's avatar Dmitry Mozzherin
Browse files

Close #68 add stemmed canonical form

parent 40113054
Pipeline #91376382 passed with stages
in 4 minutes and 21 seconds
......@@ -7,4 +7,5 @@ test_data.new.txt
.idea
gnparser.pb.go
gnparser/gnparser
bench*.txt
......@@ -2,6 +2,11 @@
## Unreleased
## [v0.11.0]
- Add [#68]: add stemmed version of canonical form to outputs.
- Add: benchmarks to gnparser_test.go
## [v0.10.0]
- Add [#67]: field `authorship` of the name for JSON output
......@@ -96,6 +101,7 @@ array of names instead of a stream.
This document follows [changelog guidelines]
[v0.11.0]: https://gitlab.com/gogna/gnparser/compare/v0.10.0...v0.11.0
[v0.10.0]: https://gitlab.com/gogna/gnparser/compare/v0.9.0...v0.10.0
[v0.9.0]: https://gitlab.com/gogna/gnparser/compare/v0.8.0...v0.9.0
[v0.8.0]: https://gitlab.com/gogna/gnparser/compare/v0.7.5...v0.8.0
......@@ -108,6 +114,7 @@ This document follows [changelog guidelines]
[v0.6.0]: https://gitlab.com/gogna/gnparser/compare/v0.5.1...v0.6.0
[v0.5.1]: https://gitlab.com/gogna/gnparser/tree/v0.5.1
[#68]: https://gitlab.com/gogna/gnparser/issues/68
[#67]: https://gitlab.com/gogna/gnparser/issues/67
[#66]: https://gitlab.com/gogna/gnparser/issues/66
[#65]: https://gitlab.com/gogna/gnparser/issues/65
......
......@@ -127,16 +127,27 @@ in less stable parts. Use the ``canonicalName -> simple`` or ``canonicalName
canonical form includes infra-specific ranks and hybrid character for named
hybrids.
The ``canonicalName -> full`` is good for presentation, as it keeps more
details.
The ``canonicalName -> simple`` field is good for matching names from different
sources, because sometimes dataset curators omit hybrid sign in named hybrids,
or remove ranks for infraspecific epithets.
The ``canonicalName -> full`` is good for presentation, as it keeps more
details.
The ``canonicalName -> stem`` field contains the simple canonical form normalized even
further. The normalization is done according to the stemming rules for the Latin
language described in [Schinke R et al (1996)]. For example letters `j` are
converted to `i`, letters `v` are converted to `u`, and suffixes are removed
from the specific and infraspecific epithets.
If you only care about canonical form of a name you can use ``--format simple``
flag with command line tool.
Simple format has the following fields separated by a pipe character (`|`):
`ID|Verbatim|CanonicalFull|CanonicalSimple|CanonicalStem|Authors|Year|Quality`
### Normalizing name-strings
There are many inconsistencies in how scientific names may be written.
......@@ -254,6 +265,10 @@ Relevant flags:
: output format. Can be ``compact``, ``pretty``, ``simple``, or ``debug``.
Default is ``compact``.
Simple format has the following fields separated by a pipe character (`|`):
`ID|Verbatim|CanonicalFull|CanonicalSimple|CanonicalStem|Authors|Year|Quality`
``--jobs -j``
: number of jobs running concurrently.
......@@ -481,7 +496,7 @@ Released under [MIT license]
[gnparser-scala]: https://github.com/GlobalNamesArchitecture/gnparser
[peg]: https://github.com/pointlander/peg
[gna]: http://globalnames.org
[test file]: https://gitlab.com/gogna/gnparser/raw/master/test-data/test_data.txt
[test file]: https://gitlab.com/gogna/gnparser/raw/master/testdata/test_data.txt
[uuid5]: http://globalnames.org/news/2015/05/31/gn-uuid-0-5-0
[winpath]: https://www.computerhope.com/issues/ch000549.htm
[gnparser ruby]: https://gitlab.com/gnames/gnparser_rb
......@@ -492,4 +507,5 @@ Released under [MIT license]
[parser-web]: https://parser.globalnames.org
[IRMNG]: http://www.irmng.org
[CONTRIBUTING]: https://gitlab.com/gogna/gnparser/blob/master/CONTRIBUTING.md
[gnparser.proto]: https://gitlab.com/gogna/gnparser/blob/master/pb/gnparser.proto
\ No newline at end of file
[gnparser.proto]: https://gitlab.com/gogna/gnparser/blob/master/pb/gnparser.proto
[Schinke R et al (1996)]: https://caio.ueberalles.net/a_stemming_algorithm_for_latin_text_databases-schinke_et_al.pdf
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -31,7 +31,7 @@ func testData() ([]testRecord, error) {
var test testRecord
empty := regexp.MustCompile(`^\s*$`)
comment := regexp.MustCompile(`^\s*#`)
path := filepath.Join("test-data", "test_data.txt")
path := filepath.Join("testdata", "test_data.txt")
f, err := os.OpenFile(path, os.O_RDONLY, os.ModePerm)
if err != nil {
return nil, err
......@@ -68,7 +68,7 @@ func testData() ([]testRecord, error) {
}
func makeBigFile(t []testRecord) error {
path := filepath.Join("test-data", "200k-lines.txt")
path := filepath.Join("testdata", "200k-lines.txt")
iterNum := 200000 / len(t)
f, err := os.Create(path)
......
package gnparser
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
......@@ -33,6 +37,7 @@ var _ = Describe("GNparser", func() {
o := gnp.ParseToObject("Homo sapiens")
Expect(o.Parsed).To(Equal(true))
Expect(o.Canonical.Simple).To(Equal("Homo sapiens"))
Expect(o.Canonical.Stem).To(Equal("Homo sapiens"))
switch d := o.Details.(type) {
case *pb.Parsed_Species:
Expect(d.Species.Genus).To(Equal("Homo"))
......@@ -113,3 +118,62 @@ func astEntries() []TableEntry {
}
return entries
}
// BenchmarkParse checks parsing event speed. Run it with:
// `go test -bench=. -benchmem -count=10 -run=XXX > bench.txt && benchstat bench.txt`
func BenchmarkParse(b *testing.B) {
	path := filepath.Join("testdata", "200k-lines.txt")
	count := 1000
	// Allocate capacity only. The previous `make([]string, count)`
	// created 1000 empty strings that `append` then grew past, so every
	// benchmark loop also parsed 1000 empty inputs.
	test := make([]string, 0, count)
	gnp := NewGNparser()
	ops := []Option{Format("simple")}
	gnpSimple := NewGNparser(ops...)
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		if count == 0 {
			break
		}
		test = append(test, scanner.Text())
		count--
	}
	// Surface read errors instead of silently benchmarking a short list.
	if err := scanner.Err(); err != nil {
		panic(err)
	}
	b.Run("ParseToObject", func(b *testing.B) {
		var p *pb.Parsed
		for i := 0; i < b.N; i++ {
			for _, v := range test {
				p = gnp.ParseToObject(v)
			}
		}
		// Sink the result so the compiler cannot eliminate the loop.
		_ = fmt.Sprintf("%v", p.Parsed)
	})
	b.Run("ParseAndFormat", func(b *testing.B) {
		var p string
		for i := 0; i < b.N; i++ {
			for _, v := range test {
				p, err = gnp.ParseAndFormat(v)
				if err != nil {
					panic(err)
				}
			}
		}
		_ = fmt.Sprintf("%d", len(p))
	})
	b.Run("ParseAndFormat(Simple)", func(b *testing.B) {
		var p string
		for i := 0; i < b.N; i++ {
			for _, v := range test {
				p, err = gnpSimple.ParseAndFormat(v)
				if err != nil {
					panic(err)
				}
			}
		}
		_ = fmt.Sprintf("%d", len(p))
	})
}
......@@ -5,6 +5,7 @@ import (
jsoniter "github.com/json-iterator/go"
grm "gitlab.com/gogna/gnparser/grammar"
"gitlab.com/gogna/gnparser/stemmer"
)
type Output struct {
......@@ -37,7 +38,11 @@ func NewOutput(sn *grm.ScientificNameNode) *Output {
det := sn.Details()
c := sn.Canonical()
if c != nil {
co = &canonical{Simple: c.Value, Full: c.ValueRanked}
co = &canonical{
Full: c.ValueRanked,
Simple: c.Value,
Stem: stemmer.StemCanonical(c.Value),
}
ws, quality = qualityAndWarnings(sn.Warnings)
ps = convertPos(sn.Pos())
hybrid = sn.Hybrid
......@@ -96,8 +101,9 @@ func FromJSON(data []byte) (Output, error) {
}
type canonical struct {
Simple string `json:"simple"`
Full string `json:"full"`
Simple string `json:"simple"`
Stem string `json:"stem"`
}
type pos struct {
......
package output
import (
"math/rand"
"time"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
grm "gitlab.com/gogna/gnparser/grammar"
......@@ -51,11 +48,11 @@ var _ = Describe("Private Functions", func() {
})
})
// randIntSlice returns a new slice containing the elements of sl in a
// pseudo-random order, seeded from the current time.
func randIntSlice(sl []int) []int {
	rng := rand.New(rand.NewSource(time.Now().Unix()))
	shuffled := make([]int, len(sl))
	for dst, src := range rng.Perm(len(sl)) {
		shuffled[dst] = sl[src]
	}
	return shuffled
}
// func randIntSlice(sl []int) []int {
// res := make([]int, len(sl))
// r := rand.New(rand.NewSource(time.Now().Unix()))
// for i, v := range r.Perm(len(sl)) {
// res[i] = sl[v]
// }
// return res
// }
......@@ -5,13 +5,15 @@ import (
"strconv"
"gitlab.com/gogna/gnparser/grammar"
"gitlab.com/gogna/gnparser/stemmer"
)
type simple struct {
ID string
Verbatim string
Canonical string
CanonicalRanked string
Canonical string
CanonicalStem string
Authorship string
Year string
Quality int
......@@ -42,8 +44,9 @@ func NewSimpleOutput(sn *grammar.ScientificNameNode) *simple {
so := simple{
ID: sn.VerbatimID,
Verbatim: sn.Verbatim,
Canonical: c.Value,
CanonicalRanked: c.ValueRanked,
Canonical: c.Value,
CanonicalStem: stemmer.StemCanonical(c.Value),
Authorship: authorship,
Year: yr,
Quality: quality,
......@@ -61,8 +64,9 @@ func (so *simple) ToSlice() []string {
res := []string{
so.ID,
so.Verbatim,
so.Canonical,
so.CanonicalRanked,
so.Canonical,
so.CanonicalStem,
so.Authorship,
yr,
qual,
......
package output
const Version = "v0.9.0-8-geb4d9fa"
const Build = "2019-09-10_16:54:10UTC"
const Version = "v0.10.0-3-g63ed3c5"
const Build = "2019-10-24_20:30:25UTC"
......@@ -119,10 +119,13 @@ enum NameType {
}
message Canonical {
// stem contains simple canonical form with
// removed suffixes for species, infraspecies.
string stem = 1;
// simple contains canonical form without ranks.
string simple = 1;
string simple = 2;
// full contains canonical form with ranks.
string full = 2;
string full = 3;
}
message Position {
......
......@@ -2,6 +2,7 @@ package pb
import (
"gitlab.com/gogna/gnparser/output"
"gitlab.com/gogna/gnparser/stemmer"
)
func ToPB(o *output.Output) *Parsed {
......@@ -35,6 +36,7 @@ func canonicalName(o *output.Output) *Canonical {
return cn
}
cn = &Canonical{
Stem: stemmer.StemCanonical(o.CanonicalName.Simple),
Simple: o.CanonicalName.Simple,
Full: o.CanonicalName.Full,
}
......
......@@ -45,7 +45,7 @@ var _ = Describe("PB", func() {
func pbEntries() []TableEntry {
var td TestData
var entries []TableEntry
data, err := ioutil.ReadFile("../test-data/test_pb.json")
data, err := ioutil.ReadFile("../testdata/test_pb.json")
if err != nil {
log.Fatal(err)
}
......
// Package stemmer is responsible for extracting the stem of a latinized word. It
// is used to create a stem for latinized specific epithets in scientific names.
// Specific epithets are always nouns, so we need to take this into account.
// http://snowballstem.org/otherapps/schinke/
// http://caio.ueberalles.net/a_stemming_algorithm_for_latin_text_databases-schinke_et_al.pdf
//
// The Schinke Latin stemming algorithm is described in,
// Schinke R, Greengrass M, Robertson AM and Willett P (1996)
// A stemming algorithm for Latin text databases. Journal of Documentation, 52: 172-187.
//
// It has the feature that it stems each word to two forms, noun and verb. For example,
//
// NOUN VERB
// ---- ----
// aquila aquil aquila
// portat portat porta
// portis port por
//
// Here (slightly reformatted) are the rules of the stemmer,
//
// 1. (start)
//
// 2. Convert all occurrences of the letters 'j' or 'v' to 'i' or 'u',
// respectively.
//
// 3. If the word ends in '-que' then
// if the word is on the list shown in Figure 4, then
// write the original word to both the noun-based and verb-based
// stem dictionaries and go to 8.
// else remove '-que'
//
// [Figure 4 was
//
// atque quoque neque itaque absque apsque abusque adaeque adusque denique
// deque susque oblique peraeque plenisque quandoque quisque quaeque
// cuiusque cuique quemque quamque quaque quique quorumque quarumque
// quibusque quosque quasque quotusquisque quousque ubique undique usque
// uterque utique utroque utribique torque coque concoque contorque
// detorque decoque excoque extorque obtorque optorque retorque recoque
// attorque incoque intorque praetorque]
//
// 4. Match the end of the word against the suffix list show in Figure 6(a),
// removing the longest matching suffix, (if any).
//
// [Figure 6(a) was
//
// -ibus -ius -ae -am -as -em -es -ia
// -is -nt -os -ud -um -us -a -e
// -i -o -u]
//
// 5. If the resulting stem contains at least two characters then write this stem
// to the noun-based stem dictionary.
//
// 6. Match the end of the word against the suffix list show in Figure 6(b),
// identifying the longest matching suffix, (if any).
//
// [Figure 6(b) was
//
// -iuntur-beris -erunt -untur -iunt -mini -ntur -stis
// -bor -ero -mur -mus -ris -sti -tis -tur
// -unt -bo -ns -nt -ri -m -r -s
// -t]
//
// If any of the following suffixes are found then convert them as shown:
//
// '-iuntur', '-erunt', '-untur', '-iunt', and '-unt', to '-i';
// '-beris', '-bor', and '-bo' to '-bi';
// '-ero' to '-eri'
//
// else remove the suffix in the normal way.
//
// 7. If the resulting stem contains at least two characters then write this stem
// to the verb-based stem dictionary.
//
// 8. (end)
//
package stemmer
import (
"strings"
)
// empty is the zero-width set member used for queExceptions.
var empty = struct{}{}

// queExceptions lists words ending in "-que" that must NOT have the
// suffix stripped (Figure 4 of Schinke et al. 1996, see the package
// comment); such words are returned unchanged by the stemmer.
var queExceptions = map[string]struct{}{
"atque": empty, "quoque": empty, "neque": empty, "itaque": empty,
"absque": empty, "apsque": empty, "abusque": empty, "adaeque": empty,
"adusque": empty, "denique": empty, "deque": empty, "susque": empty,
"oblique": empty, "peraeque": empty, "plenisque": empty, "quandoque": empty,
"quisque": empty, "quaeque": empty, "cuiusque": empty, "cuique": empty,
"quemque": empty, "quamque": empty, "quaque": empty, "quique": empty,
"quorumque": empty, "quarumque": empty, "quibusque": empty,
"quosque": empty, "quasque": empty, "quotusquisque": empty,
"quousque": empty, "ubique": empty, "undique": empty, "usque": empty,
"uterque": empty, "utique": empty, "utroque": empty, "utribique": empty,
"torque": empty, "coque": empty, "concoque": empty, "contorque": empty,
"detorque": empty, "decoque": empty, "excoque": empty, "extorque": empty,
"obtorque": empty, "optorque": empty, "retorque": empty, "recoque": empty,
"attorque": empty, "incoque": empty, "intorque": empty, "praetorque": empty,
}
// nounSuffixes is the noun suffix list from Figure 6(a) of Schinke et
// al. (1996) — see the package comment. It is ordered longest-first so
// that the first match found by checkNounSuffix is the longest suffix.
var nounSuffixes = []string{
"ibus", "ius", "ae", "am", "as",
"em", "es", "ia", "is",
"nt", "os", "ud", "um", "us",
"a", "e", "i", "o", "u",
}
// StemmedWord holds the result of stemming a single word.
type StemmedWord struct {
// Orig is the word exactly as it was passed to Stem.
Orig string
// Stem is the word after j->i / u->v normalization and suffix removal.
Stem string
// Suffix is meant to hold the removed suffix. NOTE(review):
// checkNounSuffix fills it from the head of the word (wrdR[len(v):])
// rather than the removed tail — looks like a bug; verify.
Suffix string
}
// StemCanonical takes a short form of a canonical name and returns back
// stemmed specific and infraspecific epithets.
// It assumes the following properties of a string:
//
// 1. There are no empty spaces on either side of the string.
// 2. All spaces within the string are single.
// 3. All characters in the string are ASCII with the exception of the
//    hybrid sign.
// 4. The string always starts with a capitalized word.
func StemCanonical(c string) string {
	words := strings.Split(c, " ")
	if len(words) == 1 {
		return c
	}
	stemmed := make([]string, len(words))
	for i, w := range words {
		// The genus (first word) and very short words stay untouched.
		if i == 0 || len(w) < 3 {
			stemmed[i] = w
			continue
		}
		stemmed[i] = Stem(w).Stem
	}
	return strings.Join(stemmed, " ")
}
// Stem takes a word and, assuming the word is a noun, removes its Latin
// suffix if such a suffix is detected.
func Stem(wrd string) StemmedWord {
	runes := []rune(wrd)
	// Step 2 of the algorithm: normalize 'j' -> 'i' and 'v' -> 'u'.
	for i, r := range runes {
		if r == 'j' {
			runes[i] = 'i'
		} else if r == 'v' {
			runes[i] = 'u'
		}
	}
	// Step 3: "-que" handling; exception words end stemming right away.
	sw, done := processEndsWithQue(wrd, runes)
	if done {
		return sw
	}
	// Steps 4-5: strip the longest matching noun suffix.
	return checkNounSuffix(sw)
}
// processEndsWithQue handles words ending in "-que" (step 3 of the
// algorithm). If the word is one of the known exceptions it is returned
// unchanged together with true, signalling that stemming is complete.
// Otherwise any "-que" ending is stripped and false is returned so
// stemming can continue.
func processEndsWithQue(wrd string, wrdR []rune) (StemmedWord, bool) {
	res := StemmedWord{Orig: wrd, Stem: string(wrdR)}
	if len(wrdR) < 3 {
		return res, false
	}
	if string(wrdR[len(wrdR)-3:]) != "que" {
		return res, false
	}
	if _, exception := queExceptions[res.Stem]; exception {
		return res, true
	}
	res.Stem = string(wrdR[:len(wrdR)-3])
	return res, false
}
// checkNounSuffix removes the longest matching noun suffix from sw.Stem
// (steps 4-5 of the algorithm). The suffix is removed only when the
// remaining stem keeps at least two characters; otherwise the word is
// left intact. The removed suffix, if any, is stored in sw.Suffix.
func checkNounSuffix(sw StemmedWord) StemmedWord {
	// nounSuffixes is ordered longest-first, so the first match is the
	// longest suffix; at most one suffix is ever removed.
	for _, v := range nounSuffixes {
		if !strings.HasSuffix(sw.Stem, v) {
			continue
		}
		wrdR := []rune(sw.Stem)
		stem := string(wrdR[:len(wrdR)-len(v)])
		if len(stem) >= 2 {
			sw.Stem = stem
			// Bug fix: the removed suffix is the TAIL of the word
			// (wrdR[len(wrdR)-len(v):]); the previous code sliced
			// from the head (wrdR[len(v):]).
			sw.Suffix = string(wrdR[len(wrdR)-len(v):])
		}
		break
	}
	return sw
}
package stemmer_test
import (
"bufio"
"os"
"regexp"
"strings"
"testing"
"path/filepath"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
var stemsDict map[string]string
// TestStemmer wires the Ginkgo BDD suite into the standard `go test`
// runner.
func TestStemmer(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Stemmer Suite")
}
// Load the word -> expected-stem fixture once before any specs run.
var _ = BeforeSuite(func() {
	stemsDict = stemData()
})
// stemData reads ../testdata/stems.txt and returns a map from each word
// to its expected stem. Each line of the file is expected to contain a
// word and its stem separated by whitespace.
func stemData() map[string]string {
	res := make(map[string]string)
	path := filepath.Join("..", "testdata", "stems.txt")
	f, err := os.Open(path)
	Expect(err).To(BeNil())
	// Close the file; the previous version leaked the handle.
	defer f.Close()
	// Compile the splitter once instead of on every line of the file.
	ws := regexp.MustCompile(`\s+`)
	scan := bufio.NewScanner(f)
	for scan.Scan() {
		l := strings.TrimSpace(scan.Text())
		fields := ws.Split(l, 2)
		res[fields[0]] = fields[1]
	}
	Expect(scan.Err()).To(BeNil())
	return res
}
package stemmer_test
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
. "gitlab.com/gogna/gnparser/stemmer"
)
var _ = Describe("Stemmer", func() {
Describe("Stem", func() {
It("treats que suffix with exceptions", func() {
Expect(Stem("detorque").Stem).To(Equal("detorque"))
Expect(Stem("somethingque").Stem).To(Equal("something"))
})
It("removes suffixes correctly", func() {
for k, v := range stemsDict {
Expect