Commit 5c2a09ed authored by Dmitry Mozzherin's avatar Dmitry Mozzherin
Browse files

Close #80 cardinality in proto, JSON and CSV formats

parent eeb98d3c
Pipeline #143639903 passed with stages
in 3 minutes and 43 seconds
......@@ -2,6 +2,10 @@
## Unreleased
## [v0.14.0]
- Add [#81]: Add year range in format "1888/89"
- Add [#80]: Add Cardinality to parser outputs.
- Add [#78]: Take into account `non-virus` names that look like virus names.
## [v0.13.1]
......@@ -34,7 +38,7 @@
- Add [#66]: remove HTML tags during parsing instead of a separate step.
- Add [#61]: handle authors that end with a word "bis".
- Add [#60]: handle correctly deprecated ranks with greek letters.
- Add [#62]: parser breaks on ``Drepanolejeunea (Spruce) (Steph.)``.
- Fix [#62]: parser breaks on ``Drepanolejeunea (Spruce) (Steph.)``.
## [v0.9.0]
......@@ -122,6 +126,7 @@ array of names instead of a stream.
This document follows [changelog guidelines]
[v0.14.0]: https://gitlab.com/gogna/gnparser/compare/v0.13.1...v0.14.0
[v0.13.1]: https://gitlab.com/gogna/gnparser/compare/v0.13.0...v0.13.1
[v0.13.0]: https://gitlab.com/gogna/gnparser/compare/v0.12.0...v0.13.0
[v0.12.0]: https://gitlab.com/gogna/gnparser/compare/v0.11.0...v0.12.0
......@@ -138,6 +143,10 @@ This document follows [changelog guidelines]
[v0.6.0]: https://gitlab.com/gogna/gnparser/compare/v0.5.1...v0.6.0
[v0.5.1]: https://gitlab.com/gogna/gnparser/tree/v0.5.1
[#81]: https://gitlab.com/gogna/gnparser/issues/81
[#80]: https://gitlab.com/gogna/gnparser/issues/80
[#79]: https://gitlab.com/gogna/gnparser/issues/79
[#78]: https://gitlab.com/gogna/gnparser/issues/78
[#77]: https://gitlab.com/gogna/gnparser/issues/77
[#76]: https://gitlab.com/gogna/gnparser/issues/76
[#75]: https://gitlab.com/gogna/gnparser/issues/75
......
......@@ -41,6 +41,7 @@ gnparser -h
* [Features](#features)
* [Use Cases](#use-cases)
* [Getting the simplest possible canonical form](#getting-the-simplest-possible-canonical-form)
* [Quickly partition names by the type](#quickly-partition-names-by-the-type)
* [Normalizing name-strings](#normalizing-name-strings)
* [Removing authorships in the middle of the name](#removing-authorships-in-the-middle-of-the-name)
* [Figuring out if names are well-formed](#figuring-out-if-names-are-well-formed)
......@@ -151,8 +152,31 @@ flag with command line tool.
Simple format is CSV-compatible and has the following fields:
`Id,Verbatim,CanonicalFull,CanonicalSimple,CanonicalStem,Authors,Year,Quality`
`Id,Verbatim,Cardinality,CanonicalFull,CanonicalSimple,CanonicalStem,Authors,Year,Quality`
### Quickly partition names by the type
Usually scientific names can be broken into groups according to the number of
elements:
* Uninomial
* Binomial
* Trinomial
* Quadrinomial
The output of `gnparser` contains a `Cardinality` field that tells, when
possible, how many elements are detected in the name.
| Cardinality | Name Type |
| ------------ | ------------ |
| 0 | Undetermined |
| 1 | Uninomial |
| 2 | Binomial |
| 3 | Trinomial |
| 4 | Quadrinomial |
For hybrid formulas, "approximate" names (with "sp.", "spp." etc.), unparsed
names, as well as names from the `BOLD` project, cardinality is 0 (Undetermined).
### Normalizing name-strings
......
......@@ -16,6 +16,7 @@ type ScientificNameNode struct {
Name
Verbatim string
VerbatimID string
Cardinality int
Hybrid bool
Virus bool
Bacteria bool
......@@ -46,18 +47,20 @@ func (p *Engine) NewScientificNameNode() {
i++
}
if str.IsBoldSurrogate(tail) {
p.Cardinality = 0
p.Surrogate = true
}
if p.Tail != "" && tail == "" {
tail = p.Tail
}
sn := ScientificNameNode{
Name: name,
Hybrid: p.Hybrid,
Surrogate: p.Surrogate,
Bacteria: p.Bacteria,
Tail: tail,
Warnings: warns,
Name: name,
Cardinality: p.Cardinality,
Hybrid: p.Hybrid,
Surrogate: p.Surrogate,
Bacteria: p.Bacteria,
Tail: tail,
Warnings: warns,
}
p.SN = &sn
}
......@@ -151,6 +154,7 @@ func (p *Engine) newHybridFormulaNode(n *node32) *hybridFormulaNode {
HybridElements: hes,
}
hf.normalizeAbbreviated()
p.Cardinality = 0
return hf
}
......@@ -249,6 +253,7 @@ func (p *Engine) newNamedSpeciesHybridNode(n *node32) *namedSpeciesHybridNode {
if hybrid.Pos.End == sp.Word.Pos.Start {
p.AddWarn(HybridCharNoSpaceWarn)
}
p.Cardinality = 2 + len(infs)
nhl = &namedSpeciesHybridNode{
Genus: gen,
Comparison: cf,
......@@ -299,6 +304,7 @@ func (p *Engine) newBotanicalUninomialNode(n *node32) *uninomialNode {
authorship := &authorshipNode{OriginalAuthors: ag, CombinationAuthors: at2}
u := &uninomialNode{Word: w, Authorship: authorship}
p.AddWarn(BotanyAuthorNotSubgenWarn)
p.Cardinality = 1
return u
}
......@@ -364,6 +370,7 @@ func (p *Engine) newApproxNode(n *node32) *approxNode {
Approx: annot,
Ignored: ign,
}
p.Cardinality = 0
return an
}
......@@ -385,10 +392,12 @@ func (p *Engine) newComparisonNode(n *node32) *comparisonNode {
switch n.pegRule {
case ruleGenusWord:
gen = p.newWordNode(n, GenusType)
p.Cardinality = 1
case ruleComparison:
comp = p.newWordNode(n, ComparisonType)
case ruleSpeciesEpithet:
spEp = p.newSpeciesEpithetNode(n)
p.Cardinality = 2
}
n = n.next
}
......@@ -435,6 +444,7 @@ func (p *Engine) newSpeciesNode(n *node32) *speciesNode {
}
n = n.next
}
p.Cardinality = 2 + len(infs)
sn := speciesNode{
Genus: gen,
SubGenus: sg,
......@@ -565,6 +575,7 @@ func (p *Engine) newUninomialNode(n *node32) *uninomialNode {
Word: w,
Authorship: au,
}
p.Cardinality = 1
return &un
}
......@@ -606,6 +617,7 @@ func (p *Engine) newUninomialComboNode(n *node32) *uninomialComboNode {
Rank: r,
Uninomial2: u2,
}
p.Cardinality = 1
return &ucn
}
......
......@@ -7,17 +7,19 @@ import (
)
type BaseEngine struct {
SN *ScientificNameNode
root *node32
Error error
Hybrid bool
Surrogate bool
Bacteria bool
Warnings map[Warning]struct{}
Tail string
SN *ScientificNameNode
root *node32
Cardinality int
Error error
Hybrid bool
Surrogate bool
Bacteria bool
Warnings map[Warning]struct{}
Tail string
}
func (p *Engine) FullReset() {
p.Cardinality = 0
p.Error = nil
p.Hybrid = false
p.Surrogate = false
......
......@@ -8,23 +8,48 @@ import (
"gitlab.com/gogna/gnparser/stemmer"
)
// Output is a result of parsing that can be returned in JSON or CSV formats.
type Output struct {
Parsed bool `json:"parsed"`
Quality int `json:"quality"`
Warnings []Warning `json:"qualityWarnings,omitempty"`
Verbatim string `json:"verbatim"`
Normalized string `json:"normalized,omitempty"`
CanonicalName *canonical `json:"canonicalName,omitempty"`
Authorship string `json:"authorship,omitempty"`
Details []interface{} `json:"details,omitempty"`
Positions []pos `json:"positions,omitempty"`
Surrogate bool `json:"surrogate"`
Virus bool `json:"virus"`
Hybrid bool `json:"hybrid"`
Bacteria bool `json:"bacteria"`
Tail string `json:"unparsedTail,omitempty"`
NameStringID string `json:"nameStringId"`
ParserVersion string `json:"parserVersion"`
// Parsed is true if parsing of a name-string succeeded.
Parsed bool `json:"parsed"`
// Quality of parsing. 1 - no problems, 2 - some small problems,
// 3 - significant problems with a name-string.
Quality int `json:"quality"`
// Warnings generated by parsing. A warning contains a message and
// the quality of parsing associated with it. The largest quality number
// becomes the overall quality of parsing.
Warnings []Warning `json:"qualityWarnings,omitempty"`
// Verbatim input of a name-string.
Verbatim string `json:"verbatim"`
// Normalized is a cleaned-up version of a name.
Normalized string `json:"normalized,omitempty"`
// Cardinality is a number of main elements in a name. 0 - N/A, 1 - Uninomial,
// 2 - Binomial, 3 - Trinomial etc.
Cardinality int `json:"cardinality"`
// CanonicalName -- three versions of a canonical form of a name.
CanonicalName *canonical `json:"canonicalName,omitempty"`
// Authorship of a name-string, if available.
Authorship string `json:"authorship,omitempty"`
// Parsing details.
Details []interface{} `json:"details,omitempty"`
// Positions and semantic meanings of words in the name-string.
Positions []pos `json:"positions,omitempty"`
// Unofficial name-string label (for example names from BOLD project,
// names with annotations etc.).
Surrogate bool `json:"surrogate"`
// Virus is true if a name-string seems to be a virus, vector, prion etc.
Virus bool `json:"virus"`
// Hybrid is true if a name-string is classified as a hybrid.
Hybrid bool `json:"hybrid"`
// Bacteria is true if a name-string is classified as a bacteria.
Bacteria bool `json:"bacteria"`
// Tail is an unparseable tail of a name-string.
Tail string `json:"unparsedTail,omitempty"`
// NameStringID is a UUID v5 of a verbatim version of a name-string. This
// UUID uses globalnames.org DNS as a seed.
NameStringID string `json:"nameStringId"`
// ParserVersion is a version of the gnparser used to generate the output.
ParserVersion string `json:"parserVersion"`
}
func NewOutput(sn *grm.ScientificNameNode) *Output {
......@@ -65,6 +90,7 @@ func NewOutput(sn *grm.ScientificNameNode) *Output {
Virus: sn.Virus,
Hybrid: hybrid,
Normalized: sn.Value(),
Cardinality: sn.Cardinality,
Positions: ps,
Bacteria: sn.Bacteria,
Tail: sn.Tail,
......
......@@ -12,6 +12,7 @@ import (
type simple struct {
ID string
Verbatim string
Cardinality int
CanonicalRanked string
Canonical string
CanonicalStem string
......@@ -45,6 +46,7 @@ func NewSimpleOutput(sn *grammar.ScientificNameNode) *simple {
so := simple{
ID: sn.VerbatimID,
Verbatim: sn.Verbatim,
Cardinality: sn.Cardinality,
CanonicalRanked: c.ValueRanked,
Canonical: c.Value,
CanonicalStem: stemmer.StemCanonical(c.Value),
......@@ -59,6 +61,7 @@ func CSVHeader() string {
header := ([]string{
"Id",
"Verbatim",
"Cardinality",
"CanonicalFull",
"CanonicalSimple",
"CanonicalStem",
......@@ -76,9 +79,11 @@ func (so *simple) ToSlice() []string {
}
qual := strconv.Itoa(so.Quality)
card := strconv.Itoa(so.Cardinality)
res := []string{
so.ID,
so.Verbatim,
card,
so.CanonicalRanked,
so.Canonical,
so.CanonicalStem,
......
......@@ -56,7 +56,6 @@ func hybridName(po *Parsed, o *output.Output) {
}
po.Authorship = nil
po.NameType = NameType_HYBRID_FORMULA
po.Cardinality = 0
po.DetailsHybridFormula = hf
}
......@@ -74,7 +73,6 @@ func uninomial(po *Parsed, o *output.Output,
po.Authorship = au
}
po.NameType = NameType_UNINOMIAL
po.Cardinality = 1
return u
}
......@@ -107,20 +105,17 @@ func species(po *Parsed, o *output.Output,
}
po.Authorship = au
po.NameType = NameType_SPECIES
po.Cardinality = int32(2 + len(so.InfraSpecies))
return s
}
func comparison(po *Parsed, o *output.Output,
co *grammar.ComparisonOutput) *Comparison {
po.Cardinality = 1
c := &Comparison{
Genus: co.Genus.Value,
}
if co.SpecEpithet != nil {
c.Species = co.SpecEpithet.Value
po.Cardinality = 2
if co.SpecEpithet.Authorship != nil {
c.SpeciesAuthorship = authorship(co.SpecEpithet.Authorship)
}
......@@ -135,7 +130,6 @@ func comparison(po *Parsed, o *output.Output,
func approx(po *Parsed, o *output.Output,
ao *grammar.ApproxOutput) *Approximation {
po.Cardinality = 0
po.NameType = NameType_APPROX_SURROGATE
a := &Approximation{Genus: ao.Genus.Value}
if ao.SpecEpithet != nil {
......
......@@ -15,6 +15,7 @@ func ToPB(o *output.Output) *Parsed {
Canonical: canonicalName(o),
Hybrid: o.Hybrid,
Normalized: o.Normalized,
Cardinality: int32(o.Cardinality),
Positions: positions(o),
Bacteria: o.Bacteria,
Tail: o.Tail,
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment