Commit 5d8f4f81 authored by Dmitry Mozzherin's avatar Dmitry Mozzherin
Browse files

Close #78 handle virus-like epithets

parent 3314b726
Pipeline #132417457 passed with stages
in 4 minutes and 11 seconds
......@@ -4,6 +4,7 @@
## [v0.13.1]
- Add [#78]: Take into account non-virus names that look like virus names.
- Fix [#77]: Memory leak when used as clib
- Fix [#76]: Non ASCII apostrophe does not show up in canonical.
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -12,6 +12,7 @@ require (
github.com/onsi/gomega v1.9.0
github.com/rendon/testcli v0.0.0-20161027181003-6283090d169f
github.com/shurcooL/httpfs v0.0.0-20181222201310-74dc9339e414
github.com/shurcool/vfsgen v0.0.0-20181202132449-6a9ea43bcacd // indirect
github.com/spf13/cobra v0.0.5
github.com/stretchr/testify v1.3.0 // indirect
golang.org/x/net v0.0.0-20190620200207-3b0461eec859
......
......@@ -4,6 +4,7 @@ import (
"bytes"
"io"
"regexp"
"strings"
"unicode"
)
......@@ -54,7 +55,10 @@ func Preprocess(bs []byte) *Preprocessor {
return pr
}
i := len(bs)
pr.Virus = IsVirus(bs[0:i])
name := string(bs)
if !VirusLikeName(name) {
pr.Virus = IsVirus(bs[0:i])
}
if pr.Virus {
pr.NoParse = true
return pr
......@@ -80,6 +84,58 @@ func Preprocess(bs []byte) *Preprocessor {
return pr
}
// virusLikeEpithets maps a genus to the specific epithet that makes a
// legitimate species name of that genus look like a virus, vector or prion
// name. It is built once at package initialization and only ever read.
var virusLikeEpithets = map[string]string{
	"Aspilota":      "vector",
	"Ceylonesmus":   "vector",
	"Cryptops":      "vector",
	"Culex":         "vector",
	"Dasyproctus":   "cevirus",
	"Desmoxytes":    "vector",
	"Dicathais":     "vector",
	"Euragallia":    "prion",
	"Exochus":       "virus",
	"Hilara":        "vector",
	"Microgoneplax": "prion",
	"Neoaemula":     "vector",
	"Ophion":        "virus",
	"Psenulus":      "trevirus",
	"Tidabius":      "vector",
}

// VirusLikeName takes a string and checks it against known species that can
// easily be misparsed as viruses. If the string belongs to one of such species
// it returns true.
//
// The name is split on whitespace. The first word must be exactly one of the
// known genera, and at least one of the following words must start with the
// epithet registered for that genus. The epithet is matched as a prefix, not
// as a whole word, so an epithet immediately followed by other characters
// still matches. Strings with fewer than two words never match.
//
// The following names are covered:
//
//	Aspilota vector Belokobylskij, 2007
//	Ceylonesmus vector Chamberlin, 1941
//	Cryptops (Cryptops) vector Chamberlin, 1939
//	Culex vector Dyar & Knab, 1906
//	Dasyproctus cevirus Leclercq, 1963
//	Desmoxytes vector (Chamberlin, 1941)
//	Dicathais vector Thornley, 1952
//	Euragallia prion Kramer, 1976
//	Exochus virus Gauld & Sithole, 2002
//	Hilara vector Miller, 1923
//	Microgoneplax prion Castro, 2007
//	Neoaemula vector Mackinnon, Hiller, Long & Marshall, 2008
//	Ophion virus Gauld & Mitchell, 1981
//	Psenulus trevirus Leclercq, 1961
//	Tidabius vector Chamberlin, 1931
func VirusLikeName(name string) bool {
	words := strings.Fields(name)
	if len(words) < 2 {
		return false
	}
	epithet, ok := virusLikeEpithets[words[0]]
	if !ok {
		return false
	}
	// Scan every word after the genus rather than only the second one, so
	// that names carrying a subgenus, e.g. "Cryptops (Cryptops) vector",
	// are still recognized.
	for _, w := range words[1:] {
		if strings.HasPrefix(w, epithet) {
			return true
		}
	}
	return false
}
// NormalizeHybridChar substitutes hybrid chars 'X' or 'x' with
// the multiplication sign char.
func NormalizeHybridChar(bs []byte) []byte {
......
......@@ -104,6 +104,29 @@ var _ = Describe("Preprocess", func() {
),
)
DescribeTable("VirusLikeName",
func(s string, expected bool) {
Expect(VirusLikeName(s)).To(Equal(expected))
},
Entry("name1", "Aspilota vector Belokobylskij, 2007", true),
Entry("name2", "Ceylonesmus vector Chamberlin, 1941", true),
Entry("name3", "Cryptops (Cryptops) vector Chamberlin, 1939", true),
Entry("name4", "Culex vector Dyar & Knab, 1906", true),
Entry("name5", "Dasyproctus cevirus Leclercq, 1963", true),
Entry("name6", "Desmoxytes vector (Chamberlin, 1941)", true),
Entry("name7", "Dicathais vector Thornley, 1952", true),
Entry("name8", "Euragallia prion Kramer, 1976", true),
Entry("name9", "Exochus virus Gauld & Sithole, 2002", true),
Entry("name10", "Hilara vector Miller, 1923", true),
Entry("name11", "Microgoneplax prion Castro, 2007", true),
Entry("name12", "Neoaemula vector Mackinnon, Hiller, Long & Marshall, 2008", true),
Entry("name13", "Ophion virus Gauld & Mitchell, 1981", true),
Entry("name14", "Psenulus trevirus Leclercq, 1961", true),
Entry("name15", "Tidabius vector Chamberlin, 1931", true),
Entry("name16", "Ceylonesmus prion", false),
Entry("name17", "Homo sapiens coronavirus", false),
)
DescribeTable("IsVirus",
func(s string, itIs bool) {
res := IsVirus([]byte(s))
......
......@@ -1935,6 +1935,12 @@ Coleoptera
### Viruses etc.
#SECTION Parse virus-like "normal" names
Ceylonesmus vector Chamberlin, 1941
Ceylonesmus vector Chamberlin, 1941
{"parsed":true,"quality":1,"verbatim":"Ceylonesmus vector Chamberlin, 1941","normalized":"Ceylonesmus vector Chamberlin 1941","canonicalName":{"full":"Ceylonesmus vector","simple":"Ceylonesmus vector","stem":"Ceylonesmus uector"},"authorship":"Chamberlin 1941","details":[{"genus":{"value":"Ceylonesmus"},"specificEpithet":{"value":"vector","authorship":{"value":"Chamberlin 1941","basionymAuthorship":{"authors":["Chamberlin"],"year":{"value":"1941"}}}}}],"positions":[["genus",0,11],["specificEpithet",12,18],["authorWord",19,29],["year",31,35]],"surrogate":false,"virus":false,"hybrid":false,"bacteria":false,"nameStringId":"00b874b9-c9ac-5b8a-9821-0a641ca26ca0","parserVersion":"test_version"}
00b874b9-c9ac-5b8a-9821-0a641ca26ca0,"Ceylonesmus vector Chamberlin, 1941",Ceylonesmus vector,Ceylonesmus vector,Ceylonesmus uector,Chamberlin 1941,1941,1
#SECTION: No parsing -- viruses, plasmids, prions etc.<
Arv1virus
noparse
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment