Commit de43bd6c authored by gerd's avatar gerd

Moved ULB and Ulexing to ocamlnet.


git-svn-id: https://godirepo.camlcity.org/svn/lib-pxp/trunk@681 dbe99aee-44db-0310-b2b3-d33182c8eb97
parent c2356835
......@@ -35,6 +35,10 @@ gen_done: $(INPUT)/*.def $(INPUT)/*.src
-outlinkprefix "pxp_ulex_link_$(ENC)"
touch gen_done
.PHONY: expand
expand:
camlp4 -I `ocamlfind query ulex` pa_o.cmo pa_op.cmo pa_ulex.cma pr_o.cmo pxp_ulex_utf8_01.ml
clean:
rm -f $(CLEAN_LIST) *.ml *.mll gen_done
......
open Pxp_reader;;
open Pxp_types;;
open Netulex;;
open Minilex;; (* defines [nextchar] *)
let lex_next_iso88591 lsrc =
......
This diff is collapsed.
......@@ -33,193 +33,10 @@ exception Not_resolvable of exn;;
* Not_resolvable(Not_found) serves as indicator for an unknown reason.
*)
(* Unicode lexing buffers. A [unicode_lexbuf] keeps an encoded byte string
 * ([ulb_rawbuf]) together with the already analysed part of it as an array
 * of Unicode code points ([ulb_chars]). New input is obtained from a refill
 * function.
 *)
module ULB : sig
type unicode_lexbuf =
private
{ mutable ulb_encoding : encoding;
(* The character encoding of [ulb_rawbuf] *)
mutable ulb_encoding_start : int;
(* The first character position to which [ulb_encoding]
 * applies
 *)
mutable ulb_rawbuf : string;
(* The encoded string to analyse *)
mutable ulb_rawbuf_len : int;
(* The filled part of [ulb_rawbuf] *)
mutable ulb_rawbuf_end : int;
(* The analysed part of [ulb_rawbuf]. It always holds that
 * [ulb_rawbuf_end <= ulb_rawbuf_len]. The analysed part
 * may be shorter than the filled part because there is
 * not enough space in [ulb_chars], or because the filled
 * part ends with an incomplete multi-byte character.
 *)
mutable ulb_rawbuf_const : bool;
(* Whether [ulb_rawbuf] is considered as a constant. If
 * [true], it is never blitted.
 *)
mutable ulb_chars : int array;
(* The analysed part of [ulb_rawbuf] as array of Unicode
 * code points. Only the positions 0 to [ulb_chars_len-1]
 * of the array are filled.
 *)
mutable ulb_chars_pos : int array;
(* For every analysed character this array stores the
 * byte position where the character begins in [ulb_rawbuf].
 * In addition, the array contains at index [ulb_chars_len] the
 * value of [ulb_rawbuf_end].
 *
 * This array is one element longer than [ulb_chars].
 *)
mutable ulb_chars_len : int;
(* The filled part of [ulb_chars] *)
mutable ulb_eof : bool;
(* Whether EOF has been seen *)
mutable ulb_refill : string -> int -> int -> int;
(* The refill function; see [from_function] for the meaning of its
 * arguments and of its result
 *)
mutable ulb_enc_change_hook : unicode_lexbuf -> unit
(* This function is called when the encoding changes *)
}
val from_function :
?raw_size:int ->
?char_size:int ->
?enc_change_hook:(unicode_lexbuf -> unit) ->
refill:(string -> int -> int -> int) ->
encoding ->
unicode_lexbuf
(** Creates a [unicode_lexbuf] to analyse strings of the
 * passed [encoding] coming from the [refill] function.
 *
 * @param raw_size The initial size for [ulb_rawbuf]. Defaults to 512
 * @param char_size The initial size for [ulb_chars]. Defaults to 256
 * @param enc_change_hook This function is called when the encoding
 * is changed, either by this module, or by the user
 * @param refill This function is called with arguments [ulb_rawbuf],
 * [ulb_rawbuf_len], and [l], where
 * [l = String.length ulb_rawbuf - ulb_rawbuf_len] is the free
 * space in the buffer. The function should fill new bytes into
 * this substring, and return the number of added bytes. The
 * return value 0 signals EOF.
 *)
val from_string :
?enc_change_hook:(unicode_lexbuf -> unit) ->
encoding -> string -> unicode_lexbuf
(** Creates a [unicode_lexbuf] analysing the passed string encoded in
 * the passed encoding. This function copies the input string.
 *
 * @param enc_change_hook This function is called when the encoding
 * is changed, either by this module, or by the user
 *)
val from_string_inplace :
?enc_change_hook:(unicode_lexbuf -> unit) ->
encoding -> string -> unicode_lexbuf
(** Creates a [unicode_lexbuf] analysing the passed string encoded in
 * the passed encoding. This function does not copy the input string,
 * but uses it directly as [ulb_rawbuf]. The string is not modified by ULB,
 * but the caller must ensure that other program parts do not
 * modify it either.
 *
 * @param enc_change_hook This function is called when the encoding
 * is changed, either by this module, or by the user
 *)
(*
val append :
src:(int array) -> ?pos:int -> ?len:int -> unicode_lexbuf -> unit
-- better we have something like from_unicode that refills directly into
the ulb_chars array.
*)
(* Documentation of the disabled [append] above (kept as an ordinary
 * comment because [append] is not part of the signature):
 *
 * Appends a sub array of [src] to the [unicode_lexbuf].
 * When the buffer is already at EOF, the function fails.
 *
 * @param pos Where the sub array begins, by default 0
 * @param len The length of the sub array, by default [Array.length - pos]
 *)
val delete :
int -> unicode_lexbuf -> unit
(** Deletes the given number [n] of characters from the [unicode_lexbuf].
 * These characters
 * are removed from the beginning of the buffer, i.e.
 * [ulb_chars.(n)] becomes the new first character of the
 * buffer. All three buffers [ulb_rawbuf], [ulb_chars], and
 * [ulb_chars_pos] are blitted as necessary.
 *
 * When the buffer is already at EOF, the function fails.
 *
 * For efficiency, [delete] should be called as seldom as
 * possible. Its running time is linear in the number of characters
 * to move.
 *)
val refill :
unicode_lexbuf ->
unit
(** Tries to add characters to the [unicode_lexbuf] by calling the
 * [ulb_refill] function. When the buffer is already at EOF, the
 * exception [End_of_file] is raised, and the buffer is not modified.
 * Otherwise, the [ulb_refill] function is called to
 * add new characters. If necessary, [ulb_rawbuf], [ulb_chars], and
 * [ulb_chars_pos] are enlarged such that it is ensured that either
 * at least one new character is added, or that EOF is found.
 * In the latter case, [ulb_eof] is set to [true] (and the next call
 * of [refill] will raise [End_of_file]).
 *)
val set_encoding :
encoding -> unicode_lexbuf -> unit
(** Sets the [encoding] to the passed value. This only affects future
 * [refill] calls.
 *)
val close :
unicode_lexbuf -> unit
(** Sets [ulb_eof] of the [unicode_lexbuf]. The rest of the buffer
 * is not modified.
 *)
val utf8_sub_string : int -> int -> unicode_lexbuf -> string
(** The two [int] arguments are the position and length of a sub
 * string of the lexbuf that is returned as UTF-8 string. Position
 * and length are counted in characters, not in bytes.
 *)
val utf8_sub_string_length : int -> int -> unicode_lexbuf -> int
(** Returns [String.length(utf8_sub_string args)]. Tries not to
 * allocate the UTF-8 string.
 *)
end (* module ULB *)
(* Lexer buffer interface built on top of [ULB.unicode_lexbuf].
 * NOTE(review): the value names look like the runtime interface that
 * lexers generated by ulex call into -- confirm against the ulex
 * documentation. The comments below only restate what the types show;
 * everything else is marked as an assumption.
 *)
module Ulexing : sig
type lexbuf
(* The abstract lexer buffer *)
exception Error
(* NOTE(review): the conditions under which this is raised are not visible
 * in this signature
 *)
val from_ulb_lexbuf : ULB.unicode_lexbuf -> lexbuf
(* Creates a [lexbuf] from a [ULB.unicode_lexbuf] *)
val lexeme_start: lexbuf -> int
val lexeme_end: lexbuf -> int
val lexeme_length: lexbuf -> int
(* Start, end, and length of the current lexeme. NOTE(review): presumably
 * counted in characters, not bytes -- confirm
 *)
val lexeme: lexbuf -> int array
(* The current lexeme as [int array]. NOTE(review): presumably Unicode code
 * points, as in [ulb_chars] -- confirm
 *)
val lexeme_char: lexbuf -> int -> int
(* A single character of the current lexeme, selected by the [int]
 * argument
 *)
val sub_lexeme: lexbuf -> int -> int -> int array
(* A part of the current lexeme. NOTE(review): the two [int] arguments are
 * presumably position and length -- confirm
 *)
val utf8_lexeme: lexbuf -> string
val utf8_sub_lexeme: lexbuf -> int -> int -> string
(* String variants of [lexeme] and [sub_lexeme]; going by the names the
 * result is UTF-8 encoded
 *)
val utf8_sub_lexeme_length: lexbuf -> int -> int -> int
(* NOTE(review): presumably the length of the string [utf8_sub_lexeme]
 * would return for the same arguments -- confirm
 *)
(* "Internal" interface *)
val start: lexbuf -> unit
val next: lexbuf -> int
val mark: lexbuf -> int -> unit
val backtrack: lexbuf -> int
end
(* Use exactly one of [lsrc_lexbuf] and [lsrc_unicode_lexbuf], never both! *)
type lexer_source =
{ lsrc_lexbuf : Lexing.lexbuf Lazy.t;
lsrc_unicode_lexbuf : ULB.unicode_lexbuf Lazy.t;
lsrc_unicode_lexbuf : Netulex.ULB.unicode_lexbuf Lazy.t;
}
......
......@@ -132,7 +132,7 @@ let lexeme_char lo lb k = Char.code(Lexing.lexeme_char lb k)
(* The specific header for the "ulex" output format *)
module Ulexing = Pxp_reader.Ulexing
module Ulexing = Netulex.Ulexing
(* [lexeme_len lo] ignores [lo] and evaluates to [Ulexing.lexeme_length],
 * which is then applied to the lexbuf argument.
 *)
let lexeme_len lo = Ulexing.lexeme_length
(* [sub_lexeme lo lb p n] ignores [lb] and delegates to the [sub_lexeme]
 * method of the object [lo], passing [p] and [n].
 *)
let sub_lexeme lo lb p n = lo # sub_lexeme p n
......
......@@ -21,6 +21,7 @@
open Pxp_types
open Pxp_lexer_types
open Pxp_reader
open Netulex
let _ =
assert("${encoding}" = "utf8");;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment