Commit 320020e7 authored by Dmitry Mozzherin

Close #74 CSV output for simple format

parent 662af5fd
Pipeline #117349884 failed with stages in 2 minutes and 35 seconds
......@@ -2,6 +2,10 @@
## Unreleased
## [v0.13.0]
- Add [#74]: Simple format output is now in CSV format.
- Fix [#75]: Normalize subspecies to `subsp.` instead of `ssp.`.
- Fix [#72]: Surrogate detection by `gnparser.ParseToObject` method.
## [v0.12.0]
......@@ -110,6 +114,7 @@ array of names instead of a stream.
This document follows [changelog guidelines]
[v0.13.0]: https://gitlab.com/gogna/gnparser/compare/v0.12.0...v0.13.0
[v0.12.0]: https://gitlab.com/gogna/gnparser/compare/v0.11.0...v0.12.0
[v0.11.0]: https://gitlab.com/gogna/gnparser/compare/v0.10.0...v0.11.0
[v0.10.0]: https://gitlab.com/gogna/gnparser/compare/v0.9.0...v0.10.0
......
......@@ -266,9 +266,7 @@ Relevant flags:
: output format. Can be ``compact``, ``pretty``, ``simple``, or ``debug``.
Default is ``compact``.
Simple format has the following fields separated by a pipe character(`|`):
`ID|Verbatim|CanonicalFull|CanonicalSimple|CanonicalStem|Authors|Year|Quality`
Simple format returns a header row followed by the CSV-compatible parsed result.
``--jobs -j``
: number of jobs running concurrently.
......@@ -287,7 +285,7 @@ gnparser "Parus major Linnaeus, 1788"
# pretty format
gnparser -f pretty "Parus major Linnaeus, 1788"
# simple pipe-delimited flat format
# simple CSV-compatible flat format
gnparser -f simple "Parus major Linnaeus, 1788"
# to parse a name from standard input
......@@ -530,4 +528,4 @@ Released under [MIT license]
[gnparser.proto]: https://gitlab.com/gogna/gnparser/blob/master/pb/gnparser.proto
[Schinke R et al (1996)]: https://caio.ueberalles.net/a_stemming_algorithm_for_latin_text_databases-schinke_et_al.pdf
[ruby_ffi_go_usage]: https://stackoverflow.com/questions/58866962/how-to-pass-an-array-of-strings-and-get-an-array-of-strings-in-ruby-using-go-sha
[export file]: https://gitlab.com/gogna/gnparser/blob/master/binding/main.go
\ No newline at end of file
[export file]: https://gitlab.com/gogna/gnparser/blob/master/binding/main.go
......@@ -5,11 +5,11 @@ import (
"log"
)
type format int
type Format int
const (
// Compact is a JSON format without new lines and spaces.
Compact format = iota
Compact Format = iota
// Pretty is a nested, easy-to-read JSON format.
Pretty
// Simple is a flat format with only a few of the most 'popular' fields.
......@@ -20,27 +20,27 @@ const (
var formats = []string{"compact", "pretty", "simple", "debug"}
func (of format) String() string {
func (of Format) String() string {
return formats[of]
}
func newFormat(f string) format {
func newFormat(f string) Format {
gnp := NewGNparser()
for i, v := range formats {
if v == f {
return format(i)
return Format(i)
}
}
err := fmt.Errorf("unknown format '%s', using default '%s' format",
f, gnp.format.String())
f, gnp.Format.String())
log.Println(err)
return gnp.format
return gnp.Format
}
// OutputFormat returns string representation of the current output format
// for GNparser
func (gnp *GNparser) OutputFormat() string {
return gnp.format.String()
return gnp.Format.String()
}
// AvailableFormats function returns a string representation of supported
......
This source diff could not be displayed because it is too large.
......@@ -4,7 +4,6 @@ import (
"bytes"
"fmt"
"runtime"
"strings"
"gitlab.com/gogna/gnparser/pb"
"gitlab.com/gogna/gnparser/preprocess"
......@@ -15,10 +14,10 @@ import (
// GNparser is responsible for parsing operations.
type GNparser struct {
// Format defines the output format of the parser.
Format
// workersNum defines the number of goroutines running parser in parallel.
workersNum int
// format defines the output format of the parser.
format
// removeHTML indicates that HTML tags have to be removed.
removeHTML bool
// nameString keeps parsed string
......@@ -35,32 +34,32 @@ type GNparser struct {
// Option is a function that creates a new option for GNparser.
type Option func(*GNparser)
// WorkersNum Option sets the quantity of workers to run parsing jobs.
func WorkersNum(wn int) Option {
// OptWorkersNum Option sets the quantity of workers to run parsing jobs.
func OptWorkersNum(wn int) Option {
return func(gnp *GNparser) {
gnp.workersNum = wn
}
}
// Format Option sets the output format to return/display parsing results.
func Format(f string) Option {
// OptFormat Option sets the output format to return/display parsing results.
func OptFormat(f string) Option {
return func(gnp *GNparser) {
fo := newFormat(f)
gnp.format = fo
gnp.Format = fo
}
}
// IsTest Option to substitute real version of the parser with 'test_version'
// OptIsTest Option to substitute real version of the parser with 'test_version'
// string.
func IsTest() Option {
func OptIsTest() Option {
return func(gnp *GNparser) {
gnp.isTest = true
}
}
// RemoveHTML Option is true of false. When true, the preprocess removes
// OptRemoveHTML Option is true or false. When true, the preprocessor removes
// HTML tags from name-strings.
func RemoveHTML(r bool) Option {
func OptRemoveHTML(r bool) Option {
return func(gnp *GNparser) {
gnp.removeHTML = r
}
......@@ -69,7 +68,7 @@ func RemoveHTML(r bool) Option {
// NewGNparser constructor function takes options and returns
// configured GNparser.
func NewGNparser(opts ...Option) GNparser {
gnp := GNparser{workersNum: runtime.NumCPU(), format: Compact, removeHTML: true}
gnp := GNparser{workersNum: runtime.NumCPU(), Format: Compact, removeHTML: true}
for _, opt := range opts {
opt(&gnp)
}
......@@ -131,13 +130,13 @@ func (gnp *GNparser) Parse(s string) {
// to format setting of GNparser.
func (gnp *GNparser) ParseAndFormat(s string) (string, error) {
var err error
if gnp.format == Debug {
if gnp.Format == Debug {
bs := gnp.Debug(s)
return string(bs), nil
}
gnp.Parse(s)
var bs []byte
switch gnp.format {
switch gnp.Format {
case Compact:
bs, err = gnp.ToJSON()
if err != nil {
......@@ -151,7 +150,7 @@ func (gnp *GNparser) ParseAndFormat(s string) (string, error) {
}
s = string(bs)
case Simple:
s = strings.Join(gnp.ToSlice(), "|")
s = output.ToCSV(gnp.ToSlice())
}
return s, nil
}
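For orientation, a minimal sketch of how the renamed Opt* option constructors combine with the new CSV simple output (illustrative only; it assumes the package paths used in this repository, and printing the header is the caller's choice):

package main

import (
	"fmt"
	"log"

	"gitlab.com/gogna/gnparser"
	"gitlab.com/gogna/gnparser/output"
)

func main() {
	// Configure the parser for the simple (CSV) output format.
	gnp := gnparser.NewGNparser(gnparser.OptFormat("simple"))
	res, err := gnp.ParseAndFormat("Parus major Linnaeus, 1788")
	if err != nil {
		log.Fatal(err)
	}
	// For the simple format ParseAndFormat now returns one CSV-encoded row;
	// print the header once before the rows, as the CLI does.
	fmt.Println(output.CSVHeader())
	fmt.Println(res)
}

The command-line tool follows the same pattern below: when the simple format is selected it prints output.CSVHeader() once and then streams the CSV rows.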
......
......@@ -32,6 +32,7 @@ import (
"github.com/spf13/cobra"
"gitlab.com/gogna/gnparser"
"gitlab.com/gogna/gnparser/output"
"gitlab.com/gogna/gnparser/rpc"
"gitlab.com/gogna/gnparser/web"
)
......@@ -96,9 +97,9 @@ gnparser -j 5 -g 8080
}
f := formatFlag(cmd)
opts := []gnparser.Option{
gnparser.WorkersNum(wn),
gnparser.Format(f),
gnparser.RemoveHTML(!nocleanup),
gnparser.OptWorkersNum(wn),
gnparser.OptFormat(f),
gnparser.OptRemoveHTML(!nocleanup),
}
if len(args) == 0 {
processStdin(cmd, wn, opts)
......@@ -201,10 +202,11 @@ func workersNumFlag(cmd *cobra.Command) int {
func processStdin(cmd *cobra.Command, jobs int, opts []gnparser.Option) {
if !checkStdin() {
cmd.Help()
_ = cmd.Help()
return
}
parseFile(os.Stdin, jobs, opts)
gnp := gnparser.NewGNparser(opts...)
parseFile(gnp, os.Stdin, jobs, opts)
}
func checkStdin() bool {
......@@ -222,7 +224,7 @@ func getInput(cmd *cobra.Command, args []string) string {
case 1:
data = args[0]
default:
cmd.Help()
_ = cmd.Help()
os.Exit(0)
}
return data
......@@ -238,7 +240,7 @@ func parse(data string, jobs int, opts []gnparser.Option) {
log.Fatal(err)
os.Exit(1)
}
parseFile(f, jobs, opts)
parseFile(gnp, f, jobs, opts)
f.Close()
} else {
parseString(gnp, data)
......@@ -254,14 +256,15 @@ func fileExists(path string) bool {
return false
}
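// parseFile streams name-strings from f through gnparser.ParseStream and
// hands the results to processResults for printing.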
func parseFile(f io.Reader, jobs int, opts []gnparser.Option) {
func parseFile(gnp gnparser.GNparser, f io.Reader, jobs int,
opts []gnparser.Option) {
in := make(chan string)
out := make(chan *gnparser.ParseResult)
var wg sync.WaitGroup
wg.Add(1)
go gnparser.ParseStream(jobs, in, out, opts...)
go processResults(out, &wg)
go processResults(gnp, out, &wg)
sc := bufio.NewScanner(f)
count := 0
for sc.Scan() {
......@@ -276,8 +279,12 @@ func parseFile(f io.Reader, jobs int, opts []gnparser.Option) {
wg.Wait()
}
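// processResults prints parsed results as they arrive; for the simple
// format it prints the CSV header row first.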
func processResults(out <-chan *gnparser.ParseResult, wg *sync.WaitGroup) {
func processResults(gnp gnparser.GNparser, out <-chan *gnparser.ParseResult,
wg *sync.WaitGroup) {
defer wg.Done()
if gnp.Format == gnparser.Simple {
fmt.Println(output.CSVHeader())
}
for r := range out {
if r.Error != nil {
log.Println(r.Error)
......@@ -292,5 +299,8 @@ func parseString(gnp gnparser.GNparser, data string) {
log.Fatal(err)
os.Exit(1)
}
if gnp.Format == gnparser.Simple {
fmt.Println(output.CSVHeader())
}
fmt.Println(res)
}
......@@ -34,13 +34,13 @@ var _ = Describe("Main", func() {
c := testcli.Command("gnparser", "Homo sapiens", "-f", "simple")
c.Run()
Expect(c.Success()).To(BeTrue())
Expect(c.Stdout()).To(ContainSubstring("|Homo sapiens|"))
Expect(c.Stdout()).To(ContainSubstring(",Homo sapiens,"))
})
It("is ignored with --version", func() {
c := testcli.Command("gnparser", "Homo sapiens", "-f", "simple", "--version")
c.Run()
Expect(c.Success()).To(BeTrue())
Expect(c.Stdout()).ToNot(ContainSubstring("|Homo sapiens|"))
Expect(c.Stdout()).ToNot(ContainSubstring(",Homo sapiens,"))
Expect(c.Stdout()).To(ContainSubstring("version:"))
})
It("is set to default format if -f value is unknown",
......@@ -58,15 +58,15 @@ var _ = Describe("Main", func() {
c.SetStdin(strings.NewReader("Homo sapiens"))
c.Run()
Expect(c.Success()).To(BeTrue())
Expect(c.Stdout()).To(ContainSubstring("|Homo sapiens|"))
Expect(c.Stdout()).To(ContainSubstring(",Homo sapiens,"))
})
It("takes multiple names from Stdin", func() {
c := testcli.Command("gnparser", "-f", "simple")
c.SetStdin(strings.NewReader("Plantago\nBubo L.\n"))
c.Run()
Expect(c.Success()).To(BeTrue())
Expect(c.Stdout()).To(ContainSubstring("|Plantago|"))
Expect(c.Stdout()).To(ContainSubstring("|Bubo|"))
Expect(c.Stdout()).To(ContainSubstring(",Plantago,"))
Expect(c.Stdout()).To(ContainSubstring(",Bubo,"))
})
})
})
......@@ -74,7 +74,7 @@ func outputEntries() []TableEntry {
if err != nil {
panic(err)
}
gnp := NewGNparser(IsTest())
gnp := NewGNparser(OptIsTest())
for i, v := range tests {
gnp.Parse(v.NameString)
res, err := gnp.ToJSON()
......@@ -126,7 +126,7 @@ func BenchmarkParse(b *testing.B) {
count := 1000
test := make([]string, count)
gnp := NewGNparser()
ops := []Option{Format("simple")}
ops := []Option{OptFormat("simple")}
gnpSimple := NewGNparser(ops...)
f, err := os.Open(path)
......
package grammar
// Code generated by peg grammar.peg DO NOT EDIT.
//go:generate peg grammar.peg
import (
"fmt"
"io"
"math"
"os"
"sort"
"strconv"
......@@ -362,12 +363,16 @@ func (t *tokens32) PrettyPrintSyntaxTree(buffer string) {
}
func (t *tokens32) Add(rule pegRule, begin, end, index uint32) {
tree, i := t.tree, int(index)
if i >= len(tree) {
t.tree = append(tree, token32{pegRule: rule, begin: begin, end: end})
return
if tree := t.tree; int(index) >= len(tree) {
expanded := make([]token32, 2*len(tree))
copy(expanded, tree)
t.tree = expanded
}
t.tree[index] = token32{
pegRule: rule,
begin: begin,
end: end,
}
tree[i] = token32{pegRule: rule, begin: begin, end: end}
}
func (t *tokens32) Tokens() []token32 {
......@@ -431,7 +436,7 @@ type parseError struct {
}
func (e *parseError) Error() string {
tokens, err := []token32{e.max}, "\n"
tokens, error := []token32{e.max}, "\n"
positions, p := make([]int, 2*len(tokens)), 0
for _, token := range tokens {
positions[p], p = int(token.begin), p+1
......@@ -444,14 +449,14 @@ func (e *parseError) Error() string {
}
for _, token := range tokens {
begin, end := int(token.begin), int(token.end)
err += fmt.Sprintf(format,
error += fmt.Sprintf(format,
rul3s[token.pegRule],
translations[begin].line, translations[begin].symbol,
translations[end].line, translations[end].symbol,
strconv.Quote(string(e.p.buffer[begin:end])))
}
return err
return error
}
func (p *Engine) PrintSyntaxTree() {
......@@ -479,31 +484,12 @@ func (p *Engine) Execute() {
_, _, _, _, _ = buffer, _buffer, text, begin, end
}
func Pretty(pretty bool) func(*Engine) error {
return func(p *Engine) error {
p.Pretty = pretty
return nil
}
}
func Size(size int) func(*Engine) error {
return func(p *Engine) error {
p.tokens32 = tokens32{tree: make([]token32, 0, size)}
return nil
}
}
func (p *Engine) Init(options ...func(*Engine) error) error {
func (p *Engine) Init() {
var (
max token32
position, tokenIndex uint32
buffer []rune
)
for _, option := range options {
err := option(p)
if err != nil {
return err
}
}
p.reset = func() {
max = token32{}
position, tokenIndex = 0, 0
......@@ -517,7 +503,7 @@ func (p *Engine) Init(options ...func(*Engine) error) error {
p.reset()
_rules := p.rules
tree := p.tokens32
tree := tokens32{tree: make([]token32, math.MaxInt16)}
p.parse = func(rule ...int) error {
r := 1
if len(rule) > 0 {
......@@ -8858,5 +8844,4 @@ func (p *Engine) Init(options ...func(*Engine) error) error {
},
}
p.rules = _rules
return nil
}
package output
import (
"bytes"
"runtime"
"strings"
"unicode"
"unicode/utf8"
)
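// fieldNeedsQuotes reports whether a field must be quoted in CSV output:
// it contains a comma, a double quote or a line break, starts with white
// space, or is the literal `\.` sequence.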
func fieldNeedsQuotes(field string) bool {
if field == "" {
return false
}
if field == `\.` || strings.ContainsRune(field, ',') || strings.ContainsAny(field, "\"\r\n") {
return true
}
r1, _ := utf8.DecodeRuneInString(field)
return unicode.IsSpace(r1)
}
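// ToCSV encodes a record as a single CSV row, quoting fields when needed
// and normalizing line breaks inside quoted fields (CRLF on Windows, LF
// elsewhere). No trailing newline is added.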
func ToCSV(record []string) string {
var b bytes.Buffer
useCRLF := runtime.GOOS == "windows"
for i, field := range record {
if i > 0 {
b.WriteRune(',')
}
if !fieldNeedsQuotes(field) {
b.WriteString(field)
continue
}
b.WriteByte('"')
for len(field) > 0 {
// Search for special characters.
ii := strings.IndexAny(field, "\"\r\n")
if ii < 0 {
ii = len(field)
}
// Copy verbatim everything before the special character.
b.WriteString(field[:ii])
field = field[ii:]
// Encode the special character.
if len(field) > 0 {
switch field[0] {
case '"':
b.WriteString(`""`)
case '\r':
if !useCRLF {
b.WriteByte('\r')
}
case '\n':
if useCRLF {
b.WriteString("\r\n")
} else {
b.WriteByte('\n')
}
}
field = field[1:]
}
}
b.WriteByte('"')
}
return b.String()
}
......@@ -2,12 +2,39 @@ package output
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
grm "gitlab.com/gogna/gnparser/grammar"
)
var _ = Describe("Output", func() {
DescribeTable("ToCSV", func(r []string, expected string) {
Expect(ToCSV(r)).To(Equal(expected))
},
Entry("simple case", []string{"abc"}, "abc"),
Entry("", []string{`"abc"`}, `"""abc"""`),
Entry("", []string{`a"b`}, `"a""b"`),
Entry("", []string{`"a"b"`}, `"""a""b"""`),
Entry("", []string{" abc"}, `" abc"`),
Entry("", []string{"abc,def"}, `"abc,def"`),
Entry("", []string{"abc", "def"}, "abc,def"),
Entry("", []string{"abc\ndef"}, "\"abc\ndef\""),
Entry("", []string{"abc\ndef"}, "\"abc\ndef\""),
Entry("", []string{"abc\rdef"}, "\"abc\rdef\""),
Entry("", []string{""}, ""),
Entry("", []string{"", ""}, ","),
Entry("", []string{"", "", ""}, ",,"),
Entry("", []string{"", "", "a"}, ",,a"),
Entry("", []string{"", "a", ""}, ",a,"),
Entry("", []string{"", "a", "a"}, ",a,a"),
Entry("", []string{"a", "", ""}, "a,,"),
Entry("", []string{"a", "", "a"}, "a,,a"),
Entry("", []string{"a", "a", ""}, "a,a,"),
Entry("", []string{"a", "a", "a"}, "a,a,a"),
Entry("", []string{`\.`}, "\"\\.\""),
Entry("", []string{"x09\x41\xb4\x1c", "aktau"}, "x09\x41\xb4\x1c,aktau"),
Entry("", []string{",x09\x41\xb4\x1c", "aktau"}, "\",x09\x41\xb4\x1c\",aktau"),
)
})
var _ = Describe("Private Functions", func() {
......
......@@ -3,6 +3,7 @@ package output
import (
"fmt"
"strconv"
"strings"
"gitlab.com/gogna/gnparser/grammar"
"gitlab.com/gogna/gnparser/stemmer"
......@@ -54,6 +55,20 @@ func NewSimpleOutput(sn *grammar.ScientificNameNode) *simple {
return &so
}
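// CSVHeader returns the header row for the simple CSV output format.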
func CSVHeader() string {
header := []string{
"Id",
"Verbatim",
"CanonicalFull",
"Canonical",
"CanonicalStem",
"Authorship",
"Year",
"Quality",
}
return strings.Join(header, ",")
}
func (so *simple) ToSlice() []string {
yr := so.Year
if yr == "0" {
......
package output
var (
Version = "v0.11.0-dev"
Version = "v0.13.0-dev"
Build string
)
......@@ -220,8 +220,7 @@ func (w *Warning) MarshalJSON() ([]byte, error) {
func (w *Warning) UnmarshalJSON(bs []byte) error {
arr := []interface{}{}
jsoniter.Unmarshal(bs, &arr)
// TODO: add error handling here.
_ = jsoniter.Unmarshal(bs, &arr)
w.Quality = int(arr[0].(float64))
w.Message = arr[1].(string)
return nil
......
......@@ -109,7 +109,7 @@ func (gnps gnparserServer) parseArray(ia *pb.InputArray) []*pb.Parsed {
func parseWorker(inCh <-chan string, outCh chan<- *parseArrayOutput,
skipClean bool, wg *sync.WaitGroup) {
defer wg.Done()
opts := []gnparser.Option{gnparser.RemoveHTML(!skipClean)}