Commit c2356835 authored by gerd's avatar gerd

Implement PXP's own version of Ulexing (now as part of

Pxp_reader) to eliminate the need for one character buffer.


git-svn-id: https://godirepo.camlcity.org/svn/lib-pxp/trunk@680 dbe99aee-44db-0310-b2b3-d33182c8eb97
parent 3c2b66df
......@@ -30,19 +30,19 @@ toploop.lex-utf8:
top=`dirname $$PWD` && \
OCAMLPATH="$$top/gensrc:$$top/src:$$OCAMLPATH" && \
export OCAMLPATH && \
ocamlfind ocamlmktop -o toploop.lex-utf8 -linkpkg -package threads,pxp,pxp-lex-utf8,findlib,str -thread
ocamlfind ocamlmktop -o toploop.lex-utf8 -linkpkg -package threads,pxp-engine,pxp-lex-iso88591,pxp-lex-utf8,findlib,str -thread
toploop.wlex-utf8:
top=`dirname $$PWD` && \
OCAMLPATH="$$top/gensrc:$$top/src:$$OCAMLPATH" && \
export OCAMLPATH && \
ocamlfind ocamlmktop -o toploop.wlex-utf8 -linkpkg -package threads,pxp,pxp-wlex-utf8,str -thread
ocamlfind ocamlmktop -o toploop.wlex-utf8 -linkpkg -package threads,pxp-engine,pxp-lex-iso88591,pxp-wlex-utf8,str -thread
toploop.ulex-utf8:
top=`dirname $$PWD` && \
OCAMLPATH="$$top/gensrc:$$top/src:$$OCAMLPATH" && \
export OCAMLPATH && \
ocamlfind ocamlmktop -o toploop.ulex-utf8 -linkpkg -package threads,pxp,pxp-ulex-utf8,str -thread
ocamlfind ocamlmktop -o toploop.ulex-utf8 -linkpkg -package threads,pxp-engine,pxp-lex-iso88591,pxp-ulex-utf8,str -thread
exit.cmo:
ocamlc -c exit.ml
......
......@@ -299,6 +299,106 @@ module ULB = struct
end
module Ulexing = struct
type lexbuf =
{ ulb : ULB.unicode_lexbuf;
mutable offset : int;
mutable pos : int;
mutable start : int;
mutable marked_pos : int;
mutable marked_val : int;
}
exception Error
let from_ulb_lexbuf ulb =
{ ulb = ulb;
offset = 0;
pos = 0;
start = 0;
marked_pos = 0;
marked_val = 0;
}
let lexeme_start lexbuf = lexbuf.start + lexbuf.offset
let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset
let lexeme_length lexbuf = lexbuf.pos - lexbuf.start
let lexeme lexbuf =
let buf = lexbuf.ulb.ULB.ulb_chars in
Array.sub buf lexbuf.start (lexbuf.pos - lexbuf.start)
let sub_lexeme lexbuf pos len =
let buf = lexbuf.ulb.ULB.ulb_chars in
Array.sub buf (lexbuf.start + pos) len
let lexeme_char lexbuf pos =
let buf = lexbuf.ulb.ULB.ulb_chars in
buf.(lexbuf.start + pos)
let utf8_lexeme lexbuf =
ULB.utf8_sub_string lexbuf.start (lexbuf.pos - lexbuf.start) lexbuf.ulb
let utf8_sub_lexeme lexbuf pos len =
ULB.utf8_sub_string (lexbuf.start + pos) len lexbuf.ulb
let utf8_sub_lexeme_length lexbuf pos len =
ULB.utf8_sub_string_length (lexbuf.start + pos) len lexbuf.ulb
(* "Internal" interface *)
let start lexbuf =
lexbuf.start <- lexbuf.pos;
lexbuf.marked_pos <- lexbuf.pos;
lexbuf.marked_val <- (-1)
let mark lexbuf i =
lexbuf.marked_pos <- lexbuf.pos;
lexbuf.marked_val <- i
let backtrack lexbuf =
lexbuf.pos <- lexbuf.marked_pos;
lexbuf.marked_val
let rollback lexbuf =
lexbuf.pos <- lexbuf.start
let eof = (-1)
let refill lexbuf =
try
(* Delete all characters in ulexbuf before the current lexeme: *)
if lexbuf.start > 0 then (
let n = lexbuf.start in
ULB.delete n lexbuf.ulb;
lexbuf.offset <- lexbuf.offset + n;
lexbuf.pos <- lexbuf.pos - n;
lexbuf.marked_pos <- lexbuf.marked_pos - n;
lexbuf.start <- 0;
);
ULB.refill lexbuf.ulb;
(* raises either End_of_file, or ensures there is one char in ulb *)
lexbuf.ulb.ULB.ulb_chars.(lexbuf.pos)
with
End_of_file ->
(* We cannot modify the buffer as the original Ulexing implementation
*)
eof
let next lexbuf =
let ulb = lexbuf.ulb in
let i =
if lexbuf.pos = ulb.ULB.ulb_chars_len then
refill lexbuf
else
ulb.ULB.ulb_chars.(lexbuf.pos)
in
if i <> eof then lexbuf.pos <- lexbuf.pos + 1;
i
end
exception Not_competent;;
exception Not_resolvable of exn;;
......
......@@ -193,6 +193,28 @@ module ULB : sig
end (* module ULB *)
module Ulexing : sig
type lexbuf
exception Error
val from_ulb_lexbuf : ULB.unicode_lexbuf -> lexbuf
val lexeme_start: lexbuf -> int
val lexeme_end: lexbuf -> int
val lexeme_length: lexbuf -> int
val lexeme: lexbuf -> int array
val lexeme_char: lexbuf -> int -> int
val sub_lexeme: lexbuf -> int -> int -> int array
val utf8_lexeme: lexbuf -> string
val utf8_sub_lexeme: lexbuf -> int -> int -> string
val utf8_sub_lexeme_length: lexbuf -> int -> int -> int
(* "Internal" interface *)
val start: lexbuf -> unit
val next: lexbuf -> int
val mark: lexbuf -> int -> unit
val backtrack: lexbuf -> int
end
(* One must only use either [lsrc_lexbuf], or [lsrc_unicode_lexbuf] ! *)
type lexer_source =
......
......@@ -132,6 +132,8 @@ let lexeme_char lo lb k = Char.code(Lexing.lexeme_char lb k)
(* The specific header for the "ulex" output format *)
module Ulexing = Pxp_reader.Ulexing
let lexeme_len lo = Ulexing.lexeme_length
let sub_lexeme lo lb p n = lo # sub_lexeme p n
let lexeme lo lb = lo # lexeme
......
......@@ -26,7 +26,7 @@ let _ =
assert("${encoding}" = "utf8");;
(* The rest of this module assumes that the internal encoding is UTF-8 *)
(*
let from_function ulb ulb_pos ulb_copy lexbuf_ref ubuf upos ulen =
(* ulb_pos: Absolute position of the first char in ulb
* ulb_copy: Relative position of the next char to copy
......@@ -68,7 +68,7 @@ let from_function ulb ulb_pos ulb_copy lexbuf_ref ubuf upos ulen =
End_of_file ->
0
;;
*)
class lfactory : lexer_factory =
object(self)
......@@ -91,72 +91,39 @@ object(self)
end
and lobj factory (_ulb : ULB.unicode_lexbuf) : lexer_obj =
let ulb_pos = ref 0
and ulb_copy = ref 0
and lexbuf_ref = ref None in
object(self)
val mutable ulb = _ulb
val mutable lexbuf = Ulexing.create
(from_function _ulb ulb_pos ulb_copy lexbuf_ref)
initializer
lexbuf_ref := Some lexbuf
val mutable lexbuf = Ulexing.from_ulb_lexbuf _ulb
method factory = factory
method encoding = `Enc_utf8
method open_source src =
lexbuf_ref := None;
ulb <- Lazy.force src.lsrc_unicode_lexbuf;
lexbuf <- Ulexing.create (from_function ulb ulb_pos ulb_copy lexbuf_ref);
lexbuf_ref := Some lexbuf;
ulb_pos := 0;
ulb_copy := 0;
let ulb = Lazy.force src.lsrc_unicode_lexbuf in
lexbuf <- Ulexing.from_ulb_lexbuf ulb
method open_string s =
lexbuf_ref := None;
ulb <- ULB.from_string `Enc_utf8 s;
lexbuf <- Ulexing.create (from_function ulb ulb_pos ulb_copy lexbuf_ref);
lexbuf_ref := Some lexbuf;
ulb_pos := 0;
ulb_copy := 0;
let ulb = ULB.from_string `Enc_utf8 s in
lexbuf <- Ulexing.from_ulb_lexbuf ulb
method open_string_inplace s =
lexbuf_ref := None;
ulb <- ULB.from_string_inplace `Enc_utf8 s;
lexbuf <- Ulexing.create (from_function ulb ulb_pos ulb_copy lexbuf_ref);
lexbuf_ref := Some lexbuf;
ulb_pos := 0;
ulb_copy := 0;
let ulb = ULB.from_string_inplace `Enc_utf8 s in
lexbuf <- Ulexing.from_ulb_lexbuf ulb
method lexeme_length =
Ulexing.lexeme_length lexbuf
method lexeme_char pos =
let ls = Ulexing.lexeme_start lexbuf - !ulb_pos in
ulb.ULB.ulb_chars.( ls + pos )
Ulexing.lexeme_char lexbuf pos
method lexeme =
let ls = Ulexing.lexeme_start lexbuf - !ulb_pos in
let le = Ulexing.lexeme_end lexbuf - !ulb_pos in
assert(ls >= 0);
ULB.utf8_sub_string ls (le-ls) ulb
(* Ulexing.utf8_lexeme lexbuf *)
Ulexing.utf8_lexeme lexbuf
method lexeme_strlen =
let ls = Ulexing.lexeme_start lexbuf - !ulb_pos in
let le = Ulexing.lexeme_end lexbuf - !ulb_pos in
assert(ls >= 0);
ULB.utf8_sub_string_length ls (le-ls) ulb
(* String.length (Ulexing.utf8_lexeme lexbuf) *)
Ulexing.utf8_sub_lexeme_length lexbuf 0 (Ulexing.lexeme_length lexbuf)
method sub_lexeme pos len =
let ls = Ulexing.lexeme_start lexbuf - !ulb_pos in
let le = Ulexing.lexeme_end lexbuf - !ulb_pos in
assert(ls >= 0);
ULB.utf8_sub_string (ls+pos) len ulb
(* Ulexing.utf8_sub_lexeme lexbuf pos len *)
Ulexing.utf8_sub_lexeme lexbuf pos len
method scan_document =
fun () ->
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment