Commit 23447e58 authored by Dmitry Mozzherin's avatar Dmitry Mozzherin
Browse files

Fix #51 parse ICN genera authors

parent e714e153
Pipeline #56002131 passed with stages
in 8 minutes and 24 seconds
......@@ -16,24 +16,46 @@ type Dictionary struct {
// Bacteria contains bacterial genera, where boolean value is true if
// we are aware of homonyms from other codes.
Bacteria map[string]bool
// AuthorICN contains family names of ICN authors of genera names.
// This list is used to detect ICN name-strings so we can parse a word in
// parentheses after the genus word as an author instead of a subgenus.
AuthorICN map[string]struct{}
}
// LoadDictionary creates dictionary from text files.
// The Bacteria field is filled by readBacterialData and the AuthorICN field
// by readAuthorICNData; a pointer to the resulting Dictionary is returned.
func LoadDictionary() *Dictionary {
d := Dictionary{
Bacteria: readBacterialData(),
Bacteria: readBacterialData(),
AuthorICN: readAuthorICNData(),
}
return &d
}
// readBacterialData returns the map of bacterial genera.
// "bacteria_genera.txt" is scanned with the homonym flag set to false and
// "bacteria_genera_homonyms.txt" with the flag set to true; both scans write
// into the same map.
func readBacterialData() map[string]bool {
m := make(map[string]bool)
scanFile("bacteria_genera.txt", false, m)
scanFile("bacteria_genera_homonyms.txt", true, m)
scanBacterialFile("bacteria_genera.txt", false, m)
scanBacterialFile("bacteria_genera_homonyms.txt", true, m)
return m
}
func scanFile(path string, isHomonym bool, m map[string]bool) {
// readAuthorICNData returns a set whose keys are the lines of the
// "genera_auth_icn.txt" vocabulary file.
func readAuthorICNData() map[string]struct{} {
	authors := map[string]struct{}{}
	scanAuthorICNFIle("genera_auth_icn.txt", authors)
	return authors
}
// scanAuthorICNFIle opens path through fs.Files, reads it line by line and
// adds every line to the set m as a key.
//
// The process is terminated with log.Fatal when the file cannot be opened or
// when scanning stops because of a read error, so a partially loaded set is
// never used silently.
func scanAuthorICNFIle(path string, m map[string]struct{}) {
	f, err := fs.Files.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	// Release the file once scanning is finished.
	defer f.Close()

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		m[sc.Text()] = struct{}{}
	}
	// Scan returns false both at end of input and on failure; Err tells the
	// two cases apart.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
func scanBacterialFile(path string, isHomonym bool, m map[string]bool) {
f, err := fs.Files.Open(path)
if err != nil {
log.Fatal(err)
......
......@@ -28,5 +28,13 @@ var _ = Describe("Dict", func() {
Expect(ok).To(Equal(false))
Expect(hom).To(Equal(false))
})
It("does not find not ICN author", func() {
_, ok := d.AuthorICN["Arizona"]
Expect(ok).To(Equal(false))
})
It("finds ICN author", func() {
_, ok := d.AuthorICN["Abramov"]
Expect(ok).To(Equal(true))
})
})
})
# Vocabularies
`bacteria_genera.txt`
: this list is used to mark parsed names as bacterial names.
`bacteria_genera_homonyms.txt`
: this list contains bacterial generic names that exist under ICZN or ICN codes.
`genera_auth_icn.txt`
: this list contains authors of genera under ICN codes.
## Creation of genera_auth_icn.txt
1. Get the latest IRMNG file.
2. Extract authors of ICN genera
3. Parse the authors and keep only the "basionym" authors (this makes the list 500 authors shorter).
4. Break the authors into words; collect the words that are capitalized, have no periods, and are longer than 2 characters.
5. Strip spaces, commas, and parentheses from the authors.
6. Create list of all genera (canonical form)
7. Remove all genera names from the authors list.
\ No newline at end of file
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -8,6 +8,7 @@ import (
"gitlab.com/gogna/gnparser/preprocess"
"github.com/gnames/uuid5"
"gitlab.com/gogna/gnparser/dict"
"gitlab.com/gogna/gnparser/str"
)
......@@ -123,16 +124,13 @@ func (p *Engine) newHybridFormulaNode(n *node32) *hybridFormulaNode {
case ruleSpeciesEpithet:
p.AddWarn(HybridFormulaIncompleteWarn)
var g *wordNode
switch firstName.(type) {
switch node := firstName.(type) {
case *speciesNode:
sp := firstName.(*speciesNode)
g = sp.Genus
g = node.Genus
case *uninomialNode:
u := firstName.(*uninomialNode)
g = u.Word
g = node.Word
case *comparisonNode:
cn := firstName.(*comparisonNode)
g = cn.Genus
g = node.Genus
}
spe := p.newSpeciesEpithetNode(n)
g = &wordNode{Value: g.Value, NormValue: g.NormValue}
......@@ -258,6 +256,44 @@ func (p *Engine) newNamedSpeciesHybridNode(n *node32) *namedSpeciesHybridNode {
return nhl
}
// botanicalUninomial reports whether the word found under the second child
// of n is a key of dict.Dict.AuthorICN.
//
// It returns false when the first child of n is a ruleUninomial node, or
// when the node under the second child is not a ruleUninomialWord.
// NOTE(review): presumably the second child is the parenthesized "subgenus"
// of a UninomialCombo — confirm against the grammar.
func (p *Engine) botanicalUninomial(n *node32) bool {
	first := n.up
	if first.token32.pegRule == ruleUninomial {
		return false
	}

	inner := first.next.up
	if inner.token32.pegRule != ruleUninomialWord {
		return false
	}

	word := p.newWordNode(inner, UnknownType)
	_, isAuthor := dict.Dict.AuthorICN[word.NormValue]
	return isAuthor
}
// newBotanicalUninomialNode builds a uninomialNode in which the word parsed
// as a subgenus is stored as a parenthesized original author.
//
// The first child of n becomes the uninomial word. The word under the second
// child becomes a one-author team with Parens set to true. When a third child
// exists, the authors group built from it is stored as CombinationAuthors;
// otherwise CombinationAuthors stays nil. BotanyAuthorNotSubgenWarn is always
// added.
func (p *Engine) newBotanicalUninomialNode(n *node32) *uninomialNode {
	genusNode := n.up
	genus := p.newWordNode(genusNode, UninomialType)

	// The grammar labels this node as a subgenus; it is read as an author.
	parenNode := genusNode.next
	authorWord := p.newWordNode(parenNode.up, AuthorWordType)
	author := &authorNode{Value: authorWord.NormValue, Words: []*wordNode{authorWord}}
	original := &authorsGroupNode{
		Team1:  &authorsTeamNode{Authors: []*authorNode{author}},
		Parens: true,
	}

	var combination *authorsGroupNode
	if rest := parenNode.next; rest != nil {
		// The grammar labels rest.up as an original authorship; its child is
		// used as the combination authors group.
		combination = p.newAuthorsGroupNode(rest.up.up)
	}

	p.AddWarn(BotanyAuthorNotSubgenWarn)
	return &uninomialNode{
		Word:       genus,
		Authorship: &authorshipNode{OriginalAuthors: original, CombinationAuthors: combination},
	}
}
func (p *Engine) newSingleName(n *node32) Name {
var name Name
n = n.up
......@@ -275,6 +311,9 @@ func (p *Engine) newSingleName(n *node32) Name {
case ruleUninomial:
name = p.newUninomialNode(n)
case ruleUninomialCombo:
if p.botanicalUninomial(n) {
return p.newBotanicalUninomialNode(n)
}
p.AddWarn(UninomialComboWarn)
name = p.newUninomialComboNode(n)
}
......@@ -373,7 +412,12 @@ func (p *Engine) newSpeciesNode(n *node32) *speciesNode {
for n != nil {
switch n.token32.pegRule {
case ruleSubGenus:
sg = p.newWordNode(n.up, SubGenusType)
w := p.newWordNode(n.up, SubGenusType)
if _, ok := dict.Dict.AuthorICN[w.NormValue]; ok {
p.AddWarn(BotanyAuthorNotSubgenWarn)
} else {
sg = w
}
case ruleSubGenusOrSuperspecies:
p.AddWarn(SuperSpeciesWarn)
case ruleSpeciesEpithet:
......@@ -901,7 +945,7 @@ func (p *Engine) newWordNode(n *node32, wt WordType) *wordNode {
switch {
case v == '-':
afterDash = true
case afterDash == true:
case afterDash:
v = unicode.ToLower(v)
afterDash = false
}
......
......@@ -71,13 +71,9 @@ SubGenusOrSuperspecies <- '(' _? NameLowerChar+ _? ')'
SubGenus <- '(' _? UninomialWord _? ')'
UninomialCombo <- Uninomial _ RankUninomial _ Uninomial
# TODO: Find out how to use the botanical author dictionary to treat zoological
# names of this sort correctly
# UninomialCombo <- UninomialCombo1 / UninomialCombo2
# UninomialCombo1 <- UninomialWord _? SubGenus (_? Authorship)?
# UninomialCombo2 <- Uninomial _ RankUninomial _ Uninomial
UninomialCombo <- UninomialCombo1 / UninomialCombo2
UninomialCombo1 <- UninomialWord _? SubGenus (_? Authorship)?
UninomialCombo2 <- Uninomial _ RankUninomial _ Uninomial
RankUninomial <- RankUninomialPlain / RankUninomialNotho
......
This diff is collapsed.
......@@ -17,6 +17,7 @@ const (
AuthUnknownWarn
AuthUpperCaseWarn
BacteriaMaybeWarn
BotanyAuthorNotSubgenWarn
CanonicalApostropheWarn
CapWordQuestionWarn
CharBadWarn
......
package output
const Version = "v0.7.5-1-g41acd42"
const Build = "2019-04-01_01:04:26UTC"
const Version = "v0.8.0"
const Build = "2019-04-09_19:48:26UTC"
......@@ -69,6 +69,10 @@ var warningMap = map[grm.Warning]Warning{
Quality: 1,
Message: "The genus is a homonym of a bacterial genus",
},
grm.BotanyAuthorNotSubgenWarn: Warning{
Quality: 2,
Message: "Possible ICN author instead of subgenus",
},
grm.CanonicalApostropheWarn: Warning{
Quality: 3,
Message: "Apostrophe is not allowed in canonical",
......
......@@ -261,12 +261,10 @@ Aconitum ser. Tangutica W.T. Wang
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Combination of two uninomials"]],"verbatim":"Aconitum ser. Tangutica W.T. Wang","normalized":"Aconitum ser. Tangutica W. T. Wang","canonicalName":{"simple":"Tangutica","full":"Aconitum ser. Tangutica"},"details":[{"uninomial":{"value":"Tangutica","rank":"ser.","parent":"Aconitum","authorship":{"value":"W. T. Wang","basionymAuthorship":{"authors":["W. T. Wang"]}}}}],"positions":[["uninomial",0,8],["rank",9,13],["uninomial",14,23],["authorWord",24,26],["authorWord",26,28],["authorWord",29,33]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"8f5d7bd0-90a1-556d-a8ef-1a440b157c34","parserVersion":"test_version"}
8f5d7bd0-90a1-556d-a8ef-1a440b157c34|Aconitum ser. Tangutica W.T. Wang|Tangutica|Aconitum ser. Tangutica|W. T. Wang||2
# This is not correct, but allows not to generate fake subgenera
# from botanical names
Calathus (Lindrothius) KURNAKOV 1961
Calathus (Lindrothius) KURNAKOV 1961
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Author in upper case"]],"verbatim":"Calathus (Lindrothius) KURNAKOV 1961","normalized":"Calathus (Lindrothius) Kurnakov 1961","canonicalName":{"simple":"Calathus","full":"Calathus"},"details":[{"uninomial":{"value":"Calathus","authorship":{"value":"(Lindrothius) Kurnakov 1961","basionymAuthorship":{"authors":["Lindrothius"]},"combinationAuthorship":{"authors":["Kurnakov"],"year":{"value":"1961"}}}}}],"positions":[["uninomial",0,8],["authorWord",10,21],["authorWord",23,31],["year",32,36]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"aa113505-61a1-58fe-92f3-8fd511dcfd61","parserVersion":"test_version"}
aa113505-61a1-58fe-92f3-8fd511dcfd61|Calathus (Lindrothius) KURNAKOV 1961|Calathus|Calathus|(Lindrothius) Kurnakov 1961||2
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Author in upper case"],[2,"Combination of two uninomials"]],"verbatim":"Calathus (Lindrothius) KURNAKOV 1961","normalized":"Calathus subgen. Lindrothius Kurnakov 1961","canonicalName":{"simple":"Lindrothius","full":"Calathus subgen. Lindrothius"},"details":[{"uninomial":{"value":"Lindrothius","rank":"subgen.","parent":"Calathus","authorship":{"value":"Kurnakov 1961","basionymAuthorship":{"authors":["Kurnakov"],"year":{"value":"1961"}}}}}],"positions":[["uninomial",0,8],["uninomial",10,21],["authorWord",23,31],["year",32,36]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"aa113505-61a1-58fe-92f3-8fd511dcfd61","parserVersion":"test_version"}
aa113505-61a1-58fe-92f3-8fd511dcfd61|Calathus (Lindrothius) KURNAKOV 1961|Lindrothius|Calathus subgen. Lindrothius|Kurnakov 1961|1961|2
Eucalyptus subser. Regulares Brooker
Eucalyptus subser. Regulares Brooker
......@@ -275,8 +273,31 @@ Eucalyptus subser. Regulares Brooker
Aaleniella (Danocythere)
Aaleniella (Danocythere)
{"parsed":true,"quality":1,"verbatim":"Aaleniella (Danocythere)","normalized":"Aaleniella (Danocythere)","canonicalName":{"simple":"Aaleniella","full":"Aaleniella"},"details":[{"uninomial":{"value":"Aaleniella","authorship":{"value":"(Danocythere)","basionymAuthorship":{"authors":["Danocythere"]}}}}],"positions":[["uninomial",0,10],["authorWord",12,23]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"8b7eddb1-b9a4-5cca-8fa8-25527e25d8df","parserVersion":"test_version"}
8b7eddb1-b9a4-5cca-8fa8-25527e25d8df|Aaleniella (Danocythere)|Aaleniella|Aaleniella|(Danocythere)||1
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Combination of two uninomials"]],"verbatim":"Aaleniella (Danocythere)","normalized":"Aaleniella subgen. Danocythere","canonicalName":{"simple":"Danocythere","full":"Aaleniella subgen. Danocythere"},"details":[{"uninomial":{"value":"Danocythere","rank":"subgen.","parent":"Aaleniella"}}],"positions":[["uninomial",0,10],["uninomial",12,23]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"8b7eddb1-b9a4-5cca-8fa8-25527e25d8df","parserVersion":"test_version"}
8b7eddb1-b9a4-5cca-8fa8-25527e25d8df|Aaleniella (Danocythere)|Danocythere|Aaleniella subgen. Danocythere|||2
#>
#SECTION: ICN names that look like combined uninomials for ICZN
Clathrotropis (Bentham) Harms in Dalla Torre & Harms, 1901
Clathrotropis (Bentham) Harms in Dalla Torre & Harms, 1901
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Ex authors are not required"],[2,"Possible ICN author instead of subgenus"]],"verbatim":"Clathrotropis (Bentham) Harms in Dalla Torre \u0026 Harms, 1901","normalized":"Clathrotropis (Bentham) Harms ex Dalla Torre \u0026 Harms 1901","canonicalName":{"simple":"Clathrotropis","full":"Clathrotropis"},"details":[{"uninomial":{"value":"Clathrotropis","authorship":{"value":"(Bentham) Harms ex Dalla Torre \u0026 Harms 1901","basionymAuthorship":{"authors":["Bentham"]},"combinationAuthorship":{"authors":["Harms"],"exAuthors":{"authors":["Dalla Torre","Harms"],"year":{"value":"1901"}}}}}}],"positions":[["uninomial",0,13],["authorWord",15,22],["authorWord",24,29],["authorWord",33,38],["authorWord",39,44],["authorWord",47,52],["year",54,58]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"6b730cea-e81b-53ba-a511-caaa233b9b84","parserVersion":"test_version"}
6b730cea-e81b-53ba-a511-caaa233b9b84|Clathrotropis (Bentham) Harms in Dalla Torre & Harms, 1901|Clathrotropis|Clathrotropis|(Bentham) Harms ex Dalla Torre & Harms 1901||2
Humiriastrum (Urban) Cuatrecasas, 1961
Humiriastrum (Urban) Cuatrecasas, 1961
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Possible ICN author instead of subgenus"]],"verbatim":"Humiriastrum (Urban) Cuatrecasas, 1961","normalized":"Humiriastrum (Urban) Cuatrecasas 1961","canonicalName":{"simple":"Humiriastrum","full":"Humiriastrum"},"details":[{"uninomial":{"value":"Humiriastrum","authorship":{"value":"(Urban) Cuatrecasas 1961","basionymAuthorship":{"authors":["Urban"]},"combinationAuthorship":{"authors":["Cuatrecasas"],"year":{"value":"1961"}}}}}],"positions":[["uninomial",0,12],["authorWord",14,19],["authorWord",21,32],["year",34,38]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"98f8aa31-1cc3-59c2-a4f2-ebf18e0929ab","parserVersion":"test_version"}
98f8aa31-1cc3-59c2-a4f2-ebf18e0929ab|Humiriastrum (Urban) Cuatrecasas, 1961|Humiriastrum|Humiriastrum|(Urban) Cuatrecasas 1961||2
Pampocactus (Doweld) Doweld
Pampocactus (Doweld) Doweld
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Possible ICN author instead of subgenus"]],"verbatim":"Pampocactus (Doweld) Doweld","normalized":"Pampocactus (Doweld) Doweld","canonicalName":{"simple":"Pampocactus","full":"Pampocactus"},"details":[{"uninomial":{"value":"Pampocactus","authorship":{"value":"(Doweld) Doweld","basionymAuthorship":{"authors":["Doweld"]},"combinationAuthorship":{"authors":["Doweld"]}}}}],"positions":[["uninomial",0,11],["authorWord",13,19],["authorWord",21,27]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"82494c70-6400-51a3-b786-2a8a747f8305","parserVersion":"test_version"}
82494c70-6400-51a3-b786-2a8a747f8305|Pampocactus (Doweld) Doweld|Pampocactus|Pampocactus|(Doweld) Doweld||2
Pampocactus (Doweld)
Pampocactus (Doweld)
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Possible ICN author instead of subgenus"]],"verbatim":"Pampocactus (Doweld)","normalized":"Pampocactus (Doweld)","canonicalName":{"simple":"Pampocactus","full":"Pampocactus"},"details":[{"uninomial":{"value":"Pampocactus","authorship":{"value":"(Doweld)","basionymAuthorship":{"authors":["Doweld"]}}}}],"positions":[["uninomial",0,11],["authorWord",13,19]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"3ed64c9a-ec8a-52c9-a913-eae09b6c71b9","parserVersion":"test_version"}
3ed64c9a-ec8a-52c9-a913-eae09b6c71b9|Pampocactus (Doweld)|Pampocactus|Pampocactus|(Doweld)||2
#>
### Binomials
......@@ -1169,6 +1190,12 @@ Acanthoderes (acanthoderes) satanas Aurivillius, 1923
Acanthoderes (acanthoderes) satanas Aurivillius, 1923
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Ambiguity: subgenus or superspecies found"]],"verbatim":"Acanthoderes (acanthoderes) satanas Aurivillius, 1923","normalized":"Acanthoderes satanas Aurivillius 1923","canonicalName":{"simple":"Acanthoderes satanas","full":"Acanthoderes satanas"},"details":[{"genus":{"value":"Acanthoderes"},"specificEpithet":{"value":"satanas","authorship":{"value":"Aurivillius 1923","basionymAuthorship":{"authors":["Aurivillius"],"year":{"value":"1923"}}}}}],"positions":[["genus",0,12],["specificEpithet",28,35],["authorWord",36,47],["year",49,53]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"f1082b19-d13f-54a2-95a9-6e342f2a9e6b","parserVersion":"test_version"}
f1082b19-d13f-54a2-95a9-6e342f2a9e6b|Acanthoderes (acanthoderes) satanas Aurivillius, 1923|Acanthoderes satanas|Acanthoderes satanas|Aurivillius 1923|1923|2
#fake name to illustrate botanical author instead of subgenus
Acanthoderes (Abramov) satanas Aurivillius
Acanthoderes (Abramov) satanas Aurivillius
{"parsed":true,"quality":2,"qualityWarnings":[[2,"Possible ICN author instead of subgenus"]],"verbatim":"Acanthoderes (Abramov) satanas Aurivillius","normalized":"Acanthoderes satanas Aurivillius","canonicalName":{"simple":"Acanthoderes satanas","full":"Acanthoderes satanas"},"details":[{"genus":{"value":"Acanthoderes"},"specificEpithet":{"value":"satanas","authorship":{"value":"Aurivillius","basionymAuthorship":{"authors":["Aurivillius"]}}}}],"positions":[["genus",0,12],["specificEpithet",23,30],["authorWord",31,42]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"8eb2a9be-eb11-537e-8488-eacdb6e2b9e7","parserVersion":"test_version"}
8eb2a9be-eb11-537e-8488-eacdb6e2b9e7|Acanthoderes (Abramov) satanas Aurivillius|Acanthoderes satanas|Acanthoderes satanas|Aurivillius||2
#>
#SECTION: Names with multiple dashes in specific epithet<
......@@ -2364,8 +2391,8 @@ Acanthochiton
#SECTION: Names with unparsed_tail at the end<
Morea (Morea) Burt 2342343242 23424322342 23424234
Morea (Morea) Burt
{"parsed":true,"quality":3,"qualityWarnings":[[3,"Unparsed tail"]],"verbatim":"Morea (Morea) Burt 2342343242 23424322342 23424234","normalized":"Morea (Morea) Burt","canonicalName":{"simple":"Morea","full":"Morea"},"details":[{"uninomial":{"value":"Morea","authorship":{"value":"(Morea) Burt","basionymAuthorship":{"authors":["Morea"]},"combinationAuthorship":{"authors":["Burt"]}}}}],"positions":[["uninomial",0,5],["authorWord",7,12],["authorWord",14,18]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"unparsedTail":" 2342343242 23424322342 23424234","nameStringId":"ca23679f-f3d8-5194-a406-048f970c4020","parserVersion":"test_version"}
ca23679f-f3d8-5194-a406-048f970c4020|Morea (Morea) Burt 2342343242 23424322342 23424234|Morea|Morea|(Morea) Burt||3
{"parsed":true,"quality":3,"qualityWarnings":[[3,"Unparsed tail"],[2,"Combination of two uninomials"]],"verbatim":"Morea (Morea) Burt 2342343242 23424322342 23424234","normalized":"Morea subgen. Morea Burt","canonicalName":{"simple":"Morea","full":"Morea subgen. Morea"},"details":[{"uninomial":{"value":"Morea","rank":"subgen.","parent":"Morea","authorship":{"value":"Burt","basionymAuthorship":{"authors":["Burt"]}}}}],"positions":[["uninomial",0,5],["uninomial",7,12],["authorWord",14,18]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"unparsedTail":" 2342343242 23424322342 23424234","nameStringId":"ca23679f-f3d8-5194-a406-048f970c4020","parserVersion":"test_version"}
ca23679f-f3d8-5194-a406-048f970c4020|Morea (Morea) Burt 2342343242 23424322342 23424234|Morea|Morea subgen. Morea|Burt||3
Nautilus asterizans von
Nautilus asterizans
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment