Commit 39fb1117 authored by Dmitry Mozzherin's avatar Dmitry Mozzherin
Browse files

Close #73 ragel's FSM instead of regex

parent cc5cf4f4
Pipeline #117411214 passed with stages
in 4 minutes and 37 seconds
......@@ -5,6 +5,7 @@
## [v0.13.0]
- Add [#74]: Simple format output is now in CSV format.
- Add [#73]: Improve speed by using ragel's FSM instead of regex.
- Fix [#75]: Normalize subspecies to `subsp.` instead of `ssp.`.
- Fix [#72]: Surrogate detection by `gnparser.ParseToObject` method.
......@@ -129,6 +130,11 @@ This document follows [changelog guidelines]
[v0.6.0]: https://gitlab.com/gogna/gnparser/compare/v0.5.1...v0.6.0
[v0.5.1]: https://gitlab.com/gogna/gnparser/tree/v0.5.1
[#77]: https://gitlab.com/gogna/gnparser/issues/77
[#76]: https://gitlab.com/gogna/gnparser/issues/76
[#75]: https://gitlab.com/gogna/gnparser/issues/75
[#74]: https://gitlab.com/gogna/gnparser/issues/74
[#73]: https://gitlab.com/gogna/gnparser/issues/73
[#72]: https://gitlab.com/gogna/gnparser/issues/72
[#71]: https://gitlab.com/gogna/gnparser/issues/71
[#70]: https://gitlab.com/gogna/gnparser/issues/70
......
......@@ -37,6 +37,11 @@ peg:
peg grammar.peg; \
goimports -w grammar.peg.go; \
ragel:
cd preprocess; \
ragel -Z -G2 virus.rl; \
ragel -Z -G2 noparse.rl
asset:
cd fs; \
$(FLAGS_SHARED) go run -tags=dev assets_gen.go
......
......@@ -144,9 +144,9 @@ from the specific and infraspecific epithets.
If you only care about canonical form of a name you can use ``--format simple``
flag with command line tool.
Simple format has the following fields separated by a pipe character(`|`):
Simple format is CSV-compatible and has the following fields:
`ID|Verbatim|CanonicalFull|CanonicalSimple|CanonicalStem|Authors|Year|Quality`
`Id,Verbatim,CanonicalFull,CanonicalSimple,CanonicalStem,Authorship,Year,Quality`
### Normalizing name-strings
......
......@@ -5,12 +5,12 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"testing"
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
"gitlab.com/gogna/gnparser/output"
"gitlab.com/gogna/gnparser/pb"
"gitlab.com/gogna/gnparser/preprocess"
)
......@@ -85,7 +85,7 @@ func outputEntries() []TableEntry {
json := string(res)
gnp.Parse(v.NameString)
simple := strings.Join(gnp.ToSlice(), "|")
simple := output.ToCSV(gnp.ToSlice())
testName := fmt.Sprintf("%000d: |%s|", i+1, v.NameString)
te := Entry(testName, json, v.Compact, simple, v.Simple)
entries = append(entries, te)
......
......@@ -60,7 +60,7 @@ func CSVHeader() string {
"Id",
"Verbatim",
"CanonicalFull",
"Canonical",
"CanonicalSimple",
"CanonicalStem",
"Authorship",
"Year",
......
package preprocess
import (
)
// AnnotationRL reports whether data contains an annotation-like token,
// as recognised by the Ragel state machine "annot" defined below.
//
// The machine looks for one of the following alternatives, preceded by
// one or more whitespace characters or by a comma (optionally followed by
// a whitespace character), and followed either by a whitespace/punctuation
// character or by the end of the input:
//   - notes: "species", "group", "authors" (case-insensitive);
//   - tc1: "sensu", "auct", "sec" (case-insensitive), "near" or "str",
//     each optionally followed by ".";
//   - tc2: an optional "(", then "s.", an optional whitespace character,
//     one of "s", "l", "str", "lat", then ".";
//   - tc3: "pro parte", or "p." and "p." with an optional whitespace
//     character between them (case-insensitive);
//   - tc4: an optional "(", then "nomen" or "nom." (case-insensitive),
//     or "comb.".
//
// NOTE(review): the setPos action refers to a variable pos that is not
// declared in this function, and the action is not attached to any
// transition in the machine; it looks like a leftover -- confirm before
// wiring it in.
func AnnotationRL(data []byte) bool {
%%{
machine annot;
write data;
}%%
// Variables the Ragel-generated code operates on: cs is the current
// state, p the read position, pe the end of the data, and eof the
// position that is treated as end of input.
cs, p, pe, eof := 0, 0, len(data), len(data)
// Blank assignments mark these identifiers as used.
_ = eof
_ = annot_en_main
_ = annot_error
_ = annot_first_final
// match is set to true by the setMatch action.
var match bool
%%{
action setMatch {match = true}
action setPos {pos = append(pos,p)}
notes = ("species"i | "group"i | "authors"i);
tc1 = ("sensu"i | "auct"i | "sec"i | "near" | "str") "."?;
tc2 = "("? "s." space? ([sl] | "str" | "lat") ".";
tc3 = "pro parte"i | "p."i space? "p."i;
tc4 = "("? ("nomen"i | "nom."i | "comb.");
main := any* ((space+ | "," space?)
(notes | tc1 |tc2 | tc3 | tc4)) %/setMatch
((space | punct) >setMatch);
write init;
write exec;
}%%
return match
}
\ No newline at end of file
This diff is collapsed.
package preprocess
// NoParse reports whether data matches one of the patterns of the Ragel
// state machine "noparse" defined below.
//
// A pattern counts as matched when it is followed by a whitespace or
// punctuation character, or by the end of the input:
//   - noparse1 (anchored at the start of data): "Not", "None", "Unnamed",
//     "Unamed" or "Unidentified";
//   - noparse2 (anywhere in data): "Inc." or "Incertae" (first letter in
//     either case), optional whitespace, then "Sed." or "Sedis" (first
//     letter in either case);
//   - noparse3 (anywhere in data): "phytoplasma", "plasmid", "plasmids",
//     or "RNA" preceded by one byte outside A-Z and followed by any
//     number of bytes outside A-Z.
func NoParse(data []byte) bool {
%%{
machine noparse;
write data;
}%%
// Variables the Ragel-generated code operates on: cs is the current
// state, p the read position, pe the end of the data, and eof the
// position that is treated as end of input.
cs, p, pe, eof := 0, 0, len(data), len(data)
// Blank assignments mark these identifiers as used.
_ = eof
_ = noparse_first_final
_ = noparse_error
_ = noparse_en_main
// match is set to true by the setMatch action.
var match bool
%%{
action setMatch {match = true}
noparse1 = ("Not" | "None" | "Un" ("n"? "amed" | "identified"));
noparse2 = any* [Ii] "nc" ("." | "ertae") space* [Ss] "ed" ("." | "is");
noparse3 = any* ("phytoplasma" | "plasmid" "s"? | [^A-Z] "RNA" [^A-Z]*);
main := (noparse1 | noparse2 | noparse3) %/setMatch
((space | punct) >setMatch);
write init;
write exec;
}%%
return match
}
......@@ -10,18 +10,6 @@ import (
// hybridCharRe1 matches an "X" or "x" at the very start of the input that
// is immediately followed by an uppercase letter (captured in group 2).
var hybridCharRe1 = regexp.MustCompile(`(^)[Xx](\p{Lu})`)
// hybridCharRe2 matches a standalone "X" or "x" that is delimited on each
// side by whitespace or by the start/end of the input.
var hybridCharRe2 = regexp.MustCompile(`(\s|^)[Xx](\s|$)`)
// virusRe matches, case-insensitively, a virus-related word ("ictv",
// anything ending in "virus"/"viruses" or "npv", "particle(s)",
// "vector(s)", "phage(s)" with optional "bacterio"/"viro" prefix,
// "viroid(s)", "prion(s)", "satellite(s)" with optional "alpha"/"beta"
// prefix) that starts at a word boundary or after a digit and ends at a
// word boundary.
var virusRe = regexp.MustCompile(
`(?i)(\b|\d)` +
`(ictv|[a-z]*virus(es)?|` +
`particles?|vectors?|` +
`(bacterio|viro)?phages?|` +
`viroids?|prions?|[a-z]*npv|` +
`(alpha|beta)?satellites?)\b`,
)
// noParseRe matches inputs that start with "Not", "None", "Unnamed",
// "Unamed" or "Unidentified" followed by a non-word character or
// underscore, or that contain "incertae sedis" / "inc. sed." (first
// letters in either case), "phytoplasma", "plasmid(s)", or "RNA" preceded
// by a character outside A-Z.
var noParseRe = regexp.MustCompile(
`(^(Not|None|Un(n?amed|identified))[\W_].*|.*[Ii]ncertae\s+[Ss]edis.*` +
`|[Ii]nc\.\s*[Ss]ed\.|phytoplasma\b|plasmids?\b|[^A-Z]RNA[^A-Z]*)`,
)
// notesRe matches, case-insensitively, whitespace followed by "species
// group", "species complex", "group" or "author" ending at a word
// boundary, together with everything up to the end of the input.
var notesRe = regexp.MustCompile(
`(?i)\s+(species\s+group|species\s+complex|group|author)\b.*$`,
)
......@@ -101,16 +89,6 @@ func NormalizeHybridChar(bs []byte) []byte {
return res
}
// IsVirus reports whether bs contains a match for virusRe, i.e. whether
// the name-string looks like a virus name.
func IsVirus(bs []byte) bool {
return virusRe.Match(bs)
}
// NoParse reports whether bs contains a match for noParseRe. A true
// result presumably means the name-string should be skipped rather than
// parsed -- confirm against the caller.
func NoParse(bs []byte) bool {
return noParseRe.Match(bs)
}
// Annotation returns the index where the unparsed part starts. If the
// full string can be parsed, it returns the index of the end of the
// input.
......
This diff is collapsed.
package preprocess
// IsVirus reports whether data contains a virus-related word, as
// recognised by the Ragel state machine "virus" defined below.
//
// The word (vir_str) must start at the beginning of data or right after a
// whitespace/punctuation character, and must be followed by a
// whitespace/punctuation character or by the end of the input. It is one
// of:
//   - any run of alphanumerics ending in "virus" or "viruses"
//     (case-insensitive);
//   - "ICTV" or "Ictv" (exactly these two spellings);
//   - "phage" or "phages", optionally prefixed with "bacterio" or "viro"
//     (case-insensitive);
//   - "vector", "viroid", "particle" or "prion", optionally followed by
//     "s" (case-insensitive);
//   - any run of alphanumerics ending in "npv" (case-insensitive);
//   - "satellite" or "satellites", optionally prefixed with "alpha" or
//     "beta" (case-insensitive).
func IsVirus(data []byte) bool {
%%{
machine virus;
write data;
}%%
// Variables the Ragel-generated code operates on: cs is the current
// state, p the read position, pe the end of the data, and eof the
// position that is treated as end of input.
cs, p, pe, eof := 0, 0, len(data), len(data)
// Blank assignments mark these identifiers as used.
_ = eof
_ = virus_en_main
_ = virus_error
_ = virus_first_final
// match is set to true by the setMatch action.
var match bool
%%{
action setMatch {match = true}
vir_str = (alnum* "virus"i "es"i?) |
'ICTV' | 'Ictv' |
("bacterio"i | "viro"i)? "phage"i "s"i? |
("vector"i | "viroid"i | "particle"i | "prion"i) "s"i? |
alnum* "npv"i |
("alpha"i | "beta"i)? "satellite"i "s"i?;
main := ('' | any* (space | punct))
vir_str %/setMatch
((space | punct) >setMatch);
write init;
write exec;
}%%
return match
}
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -10,9 +10,9 @@ import (
"os"
"path/filepath"
"regexp"
"strings"
"gitlab.com/gogna/gnparser"
"gitlab.com/gogna/gnparser/output"
)
func genTestData() error {
......@@ -56,7 +56,7 @@ func genTestData() error {
w.Write(bs)
w.Write([]byte("\n"))
sl := gnp.ToSlice()
res = strings.Join(sl, "|") + "\n"
res = output.ToCSV(sl) + "\n"
w.Write([]byte(res))
case 4:
count = 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment