Commit 13529645 authored by Dmitry Mozzherin's avatar Dmitry Mozzherin
Browse files

Close #69 shared C library

parent d01f507d
Pipeline #96755383 passed with stages
in 4 minutes and 14 seconds
......@@ -7,4 +7,6 @@ test_data.new.txt
.idea
gnparser/gnparser
bench*.txt
binding/*.h
binding/*.so
......@@ -2,6 +2,9 @@
## Unreleased
## [v0.12.0]
- Add [#69]: gnparser as a shared C library.
- Make dynamic version using ldflags.
## [v0.11.0]
......
......@@ -3,7 +3,8 @@ VER = $(shell git describe --tags --abbrev=0)
DATE = $(shell date -u '+%Y-%m-%d_%H:%M:%S%Z')
FLAG_MODULE = GO111MODULE=on
FLAGS_SHARED = $(FLAG_MODULE) CGO_ENABLED=0 GOARCH=amd64
FLAGS_SHARED = $(FLAG_MODULE) GOARCH=amd64
NO_C = CGO_ENABLED=0
FLAGS_LINUX = $(FLAGS_SHARED) GOOS=linux
FLAGS_MAC = $(FLAGS_SHARED) GOOS=darwin
FLAGS_WIN = $(FLAGS_SHARED) GOOS=windows
......@@ -43,23 +44,23 @@ asset:
build: peg pb asset
cd gnparser; \
$(GOCLEAN); \
$(FLAGS_SHARED) $(GOBUILD)
$(FLAGS_SHARED) $(NO_C) $(GOBUILD)
install: peg pb asset
cd gnparser; \
$(GOCLEAN); \
$(FLAGS_SHARED) $(GOINSTALL)
$(FLAGS_SHARED) $(NO_C) $(GOINSTALL)
release: peg pb asset dockerhub
cd gnparser; \
$(GOCLEAN); \
$(FLAGS_LINUX) $(GOBUILD); \
$(FLAGS_LINUX) $(NO_C) $(GOBUILD); \
tar zcf /tmp/gnparser-$(VER)-linux.tar.gz gnparser; \
$(GOCLEAN); \
$(FLAGS_MAC) $(GOBUILD); \
$(FLAGS_MAC) $(NO_C) $(GOBUILD); \
tar zcf /tmp/gnparser-$(VER)-mac.tar.gz gnparser; \
$(GOCLEAN); \
$(FLAGS_WIN) $(GOBUILD); \
$(FLAGS_WIN) $(NO_C) $(GOBUILD); \
zip -9 /tmp/gnparser-$(VER)-win-64.zip gnparser.exe; \
$(GOCLEAN);
......@@ -76,3 +77,7 @@ docker: build
dockerhub: docker
docker push gnames/gognparser; \
docker push gnames/gognparser:$(VERSION)
# clib builds the code in binding/ as a C shared library named
# libgnparser.so (go build -buildmode=c-shared). Unlike the build, install and
# release targets it does not pass $(NO_C) (CGO_ENABLED=0): the binding
# imports "C", so cgo has to stay enabled.
clib:
cd binding; \
$(GOBUILD) -buildmode=c-shared -o libgnparser.so;
\ No newline at end of file
......@@ -6,11 +6,11 @@ Try `gnparser` [online][parser-web].
associated meta information. For example, ``"Homo sapiens Linnaeus"`` is
parsed into:
| Element | Meaning | Position
| -------- | ---------------- | --------
| Homo | genus | (0,4)
| sapiens | specificEpithet | (5,12)
| Linnaeus | author | (13,21)
| Element | Meaning | Position |
| -------- | --------------- | -------- |
| Homo | genus | (0,4) |
| sapiens | specificEpithet | (5,12) |
| Linnaeus | author | (13,21) |
This parser, written in Go, is the 3rd iteration of the project. The first,
[biodiversity] had been written in Ruby, the second, [also
......@@ -55,6 +55,7 @@ gnparser -h
- [Usage as a REST API Interface](#usage-as-a-rest-api-interface)
- [Use as a Docker image](#use-as-a-docker-image)
- [Use as a library in Go](#use-as-a-library-in-go)
- [Use as a shared C library](#use-as-a-shared-c-library)
- [Parsing ambiguities](#parsing-ambiguities)
- [Names with `filius` (ICN code)](#names-with-filius-icn-code)
- [Names with subgenus (ICZN code) and genus author (ICN code)](#names-with-subgenus-iczn-code-and-genus-author-icn-code)
......@@ -87,14 +88,14 @@ its input and output.
Number of names parsed per hour on an i7-8750H CPU
(6 cores, 12 threads, at 2.20 GHz), parser v0.5.1
| Threads | names/hr
| -------- | ------------
| 1 | 48,000,000
| 2 | 63,000,000
| 4 | 128,000,000
| 8 | 202,000,000
| 16 | 248,000,000
| 100 | 293,000,000
| Threads | names/hr |
| ------- | ----------- |
| 1 | 48,000,000 |
| 2 | 63,000,000 |
| 4 | 128,000,000 |
| 8 | 202,000,000 |
| 16 | 248,000,000 |
| 100 | 293,000,000 |
For simplest output Go ``gnparser`` is roughly 2 times faster than Scala
``gnparser`` and about 100 times faster than Ruby ``biodiversity`` parser. For
......@@ -451,6 +452,25 @@ case *pb.Parsed_Uninomial:
}
```
### Use as a shared C library
It is possible to bind `gnparser` functionality to languages that can use the
C Application Binary Interface (ABI). Such languages include, for example,
Python, Ruby, Rust, C, C++, and Java (via JNI).
To compile the `gnparser` shared library for your platform or operating system
of choice, you need GNU `make` and the GNU `gcc` compiler installed:
```bash
make clib
cd binding
cp libgnparser* /path/to/some/project
```
For an example of how to use the shared library, see this
[StackOverflow question][ruby_ffi_go_usage]. The exported functions are
defined in the [export file].
## Parsing ambiguities
Some name-strings cannot be parsed unambiguously without some additional data.
......@@ -508,4 +528,6 @@ Released under [MIT license]
[IRMNG]: http://www.irmng.org
[CONTRIBUTING]: https://gitlab.com/gogna/gnparser/blob/master/CONTRIBUTING.md
[gnparser.proto]: https://gitlab.com/gogna/gnparser/blob/master/pb/gnparser.proto
[Schinke R et al (1996)]: https://caio.ueberalles.net/a_stemming_algorithm_for_latin_text_databases-schinke_et_al.pdf
\ No newline at end of file
[Schinke R et al (1996)]: https://caio.ueberalles.net/a_stemming_algorithm_for_latin_text_databases-schinke_et_al.pdf
[ruby_ffi_go_usage]: https://stackoverflow.com/questions/58866962/how-to-pass-an-array-of-strings-and-get-an-array-of-strings-in-ruby-using-go-sha
[export file]: https://gitlab.com/gogna/gnparser/blob/master/binding/main.go
\ No newline at end of file
package main
/*
#include "stdlib.h"
*/
import "C"
import (
"fmt"
"runtime"
"log"
"sync"
"unsafe"
"gitlab.com/gogna/gnparser"
)
// ParseToString function takes a name-string, desired format, and parses
// the name-string to either JSON, or pipe-separated values, depending on
// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
//export ParseToString
func ParseToString(name *C.char, format *C.char) *C.char {
goname := C.GoString(name)
opts := []gnparser.Option{gnparser.Format(C.GoString(format))}
gnp := gnparser.NewGNparser(opts...)
parsed, err := gnp.ParseAndFormat(goname)
if err != nil {
fmt.Println(err)
return C.CString("")
}
return C.CString(parsed)
}
// ParseAryToStrings function takes an array of names, parsing format and a
// reference to an output: an empty array of strings to return the the data
// back. It populates the output array with raw strings of either JSON or
// pipe-separated parsed values (depending on a given format). Format can take
// values of 'simple', 'compact', or 'pretty'.
//export ParseAryToStrings
func ParseAryToStrings(in **C.char, length C.int, format *C.char, out ***C.char) {
names := make([]string, int(length))
inCh := make(chan string)
outCh := make(chan *gnparser.ParseResult)
resMap := make(map[string]string)
var wg sync.WaitGroup
wg.Add(1)
opts := []gnparser.Option{
gnparser.Format(C.GoString(format)),
}
jobs := runtime.NumCPU()
go gnparser.ParseStream(jobs, inCh, outCh, opts...)
go func() {
defer wg.Done()
for parsed := range outCh {
resMap[parsed.Input] = parsed.Output
}
}()
start := unsafe.Pointer(in)
pointerSize := unsafe.Sizeof(in)
for i := 0; i < int(length); i++ {
// Copy each input string into a Go string and add it to the slice.
pointer := (**C.char)(unsafe.Pointer(uintptr(start) + uintptr(i)*pointerSize))
name := C.GoString(*pointer)
inCh <- name
names[i] = name
}
close(inCh)
wg.Wait()
outArray := (C.malloc(C.ulong(length) * C.ulong(pointerSize)))
*out = (**C.char)(outArray)
for i := 0; i < int(length); i++ {
pointer := (**C.char)(unsafe.Pointer(uintptr(outArray) + uintptr(i)*pointerSize))
if parsed, ok := resMap[names[i]]; ok {
*pointer = C.CString(parsed)
} else {
log.Printf("Cannot find result for %s", names[i])
*pointer = C.CString("[]")
}
}
}
// main is intentionally empty. This package is compiled as a C shared library
// (the `clib` make target uses -buildmode=c-shared), so the exported functions
// above are its entry points; a main function is still needed because this is
// package main.
func main() {}
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -9,54 +9,55 @@ import (
// ParseResult structure contains parsing output and/or error generated
// by the parser.
type ParseResult struct {
Input string
Output string
Error error
}
// ParseStream function takes input/output channels to do concurrent
// parsing jobs. Output is pushed as ParseResult objects.
func (gnp *GNparser) ParseStream(in <-chan string, out chan<- *ParseResult,
func ParseStream(jobs int, in <-chan string, out chan<- *ParseResult,
opts ...Option) {
var wg sync.WaitGroup
wg.Add(gnp.workersNum)
for i := 0; i < gnp.workersNum; i++ {
go gnp.parserWorker(i, in, out, &wg, opts...)
wg.Add(jobs)
for i := 0; i < jobs; i++ {
go parserWorker(i, in, out, &wg, opts...)
}
wg.Wait()
close(out)
}
func (gnp *GNparser) parserWorker(i int, in <-chan string, out chan<- *ParseResult,
func parserWorker(i int, in <-chan string, out chan<- *ParseResult,
wg *sync.WaitGroup, opts ...Option) {
gnp1 := NewGNparser(opts...)
gnp := NewGNparser(opts...)
defer wg.Done()
for s := range in {
res, err := gnp1.ParseAndFormat(s)
res, err := gnp.ParseAndFormat(s)
if err != nil {
out <- &ParseResult{Output: "", Error: err}
out <- &ParseResult{Input: s, Output: "", Error: err}
}
out <- &ParseResult{Output: res, Error: nil}
out <- &ParseResult{Input: s, Output: res, Error: nil}
}
}
// ParseStreamToObjects function takes input/output channels to do concurrent
// parsing to object jobs. Output is pushed as ParseObjectResult objects.
func (gnp *GNparser) ParseStreamToObjects(in <-chan string,
func ParseStreamToObjects(jobs int, in <-chan string,
out chan<- *pb.Parsed, opts ...Option) {
var wg sync.WaitGroup
wg.Add(gnp.workersNum)
for i := 0; i < gnp.workersNum; i++ {
go gnp.parserObjectWorker(i, in, out, &wg, opts...)
wg.Add(jobs)
for i := 0; i < jobs; i++ {
go parserObjectWorker(i, in, out, &wg, opts...)
}
wg.Wait()
close(out)
}
func (gnp *GNparser) parserObjectWorker(i int, in <-chan string,
func parserObjectWorker(i int, in <-chan string,
out chan<- *pb.Parsed, wg *sync.WaitGroup, opts ...Option) {
gnp1 := NewGNparser(opts...)
gnp := NewGNparser(opts...)
defer wg.Done()
for s := range in {
out <- gnp1.ParseToObject(s)
out <- gnp.ParseToObject(s)
}
}
......@@ -26,6 +26,7 @@ import (
"io"
"log"
"os"
"runtime"
"strings"
"sync"
......@@ -100,11 +101,11 @@ gnparser -j 5 -g 8080
gnparser.RemoveHTML(!nocleanup),
}
if len(args) == 0 {
processStdin(cmd, opts)
processStdin(cmd, wn, opts)
os.Exit(0)
}
data := getInput(cmd, args)
parse(data, opts)
parse(data, wn, opts)
},
}
......@@ -127,8 +128,7 @@ func init() {
formatHelp := fmt.Sprintf("sets output format. Can be one of:\n %s.", formats)
rootCmd.Flags().StringP("format", "f", df, formatHelp)
dj := gnp.WorkersNum()
rootCmd.Flags().IntP("jobs", "j", dj,
rootCmd.Flags().IntP("jobs", "j", runtime.NumCPU(),
"nubmer of threads to run. CPU's threads number is the default.")
rootCmd.Flags().BoolP("nocleanup", "n", false, "keep HTML entities and tags when parsing.")
......@@ -199,12 +199,12 @@ func workersNumFlag(cmd *cobra.Command) int {
return i
}
func processStdin(cmd *cobra.Command, opts []gnparser.Option) {
func processStdin(cmd *cobra.Command, jobs int, opts []gnparser.Option) {
if !checkStdin() {
cmd.Help()
return
}
parseFile(os.Stdin, opts)
parseFile(os.Stdin, jobs, opts)
}
func checkStdin() bool {
......@@ -228,7 +228,7 @@ func getInput(cmd *cobra.Command, args []string) string {
return data
}
func parse(data string, opts []gnparser.Option) {
func parse(data string, jobs int, opts []gnparser.Option) {
gnp := gnparser.NewGNparser(opts...)
path := string(data)
......@@ -238,7 +238,7 @@ func parse(data string, opts []gnparser.Option) {
log.Fatal(err)
os.Exit(1)
}
parseFile(f, opts)
parseFile(f, jobs, opts)
f.Close()
} else {
parseString(gnp, data)
......@@ -254,14 +254,13 @@ func fileExists(path string) bool {
return false
}
func parseFile(f io.Reader, opts []gnparser.Option) {
func parseFile(f io.Reader, jobs int, opts []gnparser.Option) {
in := make(chan string)
out := make(chan *gnparser.ParseResult)
gnp := gnparser.NewGNparser(opts...)
var wg sync.WaitGroup
wg.Add(1)
go gnp.ParseStream(in, out, opts...)
go gnparser.ParseStream(jobs, in, out, opts...)
go processResults(out, &wg)
sc := bufio.NewScanner(f)
count := 0
......
......@@ -41,10 +41,9 @@ func apiPostParse(w http.ResponseWriter, r *http.Request) {
func parseSlice(w http.ResponseWriter, ns []string) {
in := make(chan string)
out := make(chan *gnparser.ParseResult)
gnp := gnparser.NewGNparser()
var wg sync.WaitGroup
wg.Add(1)
go gnp.ParseStream(in, out)
go gnparser.ParseStream(8, in, out)
go processResults(w, out, &wg)
for _, v := range ns {
in <- v
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment