Commit c02aa698 authored by gerd's avatar gerd

Added Lazystring, Lazystring_from.

	Added replace_matched_substring function.
	Changed the structure of 'variable'. 'sref' is either an arbitrary
string, or it is the input string of the matching function. 'from' and
'len' are always used.


git-svn-id: https://godirepo.camlcity.org/svn/lib-xstr/[email protected] e4cd5726-45db-0310-8eb3-84e3bb775810
parent e271b3bd
(* $Id: xstr_match.ml,v 1.1 1999/06/27 23:03:37 gerd Exp $
(* $Id: xstr_match.ml,v 1.2 1999/07/04 20:02:07 gerd Exp $
* ----------------------------------------------------------------------
* String matching
*)
......@@ -20,8 +20,10 @@ type charset = int array;;
type matcher =
Literal of string
| Anystring
| Lazystring
| Anychar
| Anystring_from of charset
| Lazystring_from of charset
| Anychar_from of charset
| Nullstring
| Alternative of matcher list list
......@@ -147,28 +149,10 @@ let set_as_string set =
let match_string ml s =
let match_string_at ml s k =
let len = String.length s in
let rec reset ml =
match ml with
[] -> ()
| Alternative alts :: ml' ->
List.iter reset alts;
reset ml'
| Optional opt :: ml' ->
reset opt;
reset ml'
| Record (v,r) :: ml' ->
v.found <- false;
v.begun <- false;
reset ml'
| _ :: ml' ->
reset ml'
in
let rec run k ml recs =
(* returns the number of matched characters or raises Not_found.
* 'k': position in s
......@@ -200,6 +184,18 @@ let match_string ml s =
raise Not_found
in
find (len-k)
| Lazystring :: ml' ->
let rec find max n =
if n <= max then
try
let n', recs' = run (k+n) ml' recs in
n + n', recs'
with
Not_found -> find max (n+1)
else
raise Not_found
in
find (len-k) 0
| Anystring_from set :: ml' ->
let rec find n =
if n >= 0 then
......@@ -222,6 +218,28 @@ let match_string ml s =
n
in
find (region 0)
| Lazystring_from set :: ml' ->
let rec find max n =
if n <= max then
try
let n', recs' = run (k+n) ml' recs in
n + n', recs'
with
Not_found -> find max (n+1)
else
raise Not_found
in
let rec region n =
if k+n < len then
let c = Char.code (s.[k+n]) in
if member_of_set c set then
region (n+1)
else
n
else
n
in
find (region 0) 0
| Anychar :: ml' ->
if k < len then
let n',recs' = run (k+1) ml' recs in
......@@ -283,40 +301,233 @@ let match_string ml s =
in
try
reset ml;
let _, recs = run 0 ml [] in
let _, recs = run k ml [] in
List.iter
(fun r ->
if r.found then
r.sref <- String.sub s r.from r.len)
r.sref <- s)
recs;
true
with
Not_found ->
false
;;
let match_string ml s =
let rec reset ml =
match ml with
[] -> ()
| Alternative alts :: ml' ->
List.iter reset alts;
reset ml'
| Optional opt :: ml' ->
reset opt;
reset ml'
| Record (v,r) :: ml' ->
v.found <- false;
v.begun <- false;
reset r;
reset ml'
| _ :: ml' ->
reset ml'
in
reset ml;
match_string_at ml s 0
;;
let var s =
{ sref = s; found = false; begun = false; from = 0; len = 0 }
{ sref = s; found = false; begun = false; from = 0; len = String.length s }
;;
type replacer =
ReplaceLiteral of string
| ReplaceVar of variable
| ReplaceFunction of (unit -> string)
;;
type rflag =
Anchored
| Limit of int
;;
type repl =
RLit of string
| RRegion of (int * int)
;;
exception Limit_exceeded;;
let replace_matched_substrings ml rl fl s =
let anchored = List.mem Anchored fl in
let all = var "" in
let ml' = [ Record(all, ml)] @ (if anchored then [] else [ Anystring ]) in
let rec resetlist ml =
match ml with
[] -> []
| Alternative alts :: ml' ->
List.flatten (List.map resetlist alts) @
resetlist ml'
| Optional opt :: ml' ->
resetlist opt @
resetlist ml'
| Record (v,r) :: ml' ->
v :: (resetlist r @ resetlist ml')
| _ :: ml' ->
resetlist ml'
in
let resl = resetlist ml' in
let limit =
List.fold_left
(fun m f ->
match f with
Limit n ->
if n < 0 then failwith "replace_matched_substrings";
if m >= 0 then min m n else n
| _ -> m)
(-1)
fl in
let n_repl = ref 0 in
let replace_at k =
if limit >= 0 & !n_repl >= limit then
raise Limit_exceeded;
List.iter
(fun v ->
v.found <- false;
v.begun <- false)
resl;
if match_string_at ml' s k then begin
(* interpret rl *)
let repltext =
List.map
(fun r ->
match r with
ReplaceLiteral s -> RLit s
| ReplaceVar v ->
if v.found then
RRegion (v.from, v.len)
else
RLit ""
| ReplaceFunction f ->
begin try
RLit (f ())
with
Not_found ->
raise Not_found
| Match_failure (_,_,_) ->
raise Not_found
end)
rl
in
let amount = all.len in
incr n_repl;
repltext, amount
end
else raise Not_found
in
let l = String.length s in
let rec left_to_right trans k_gapstart k =
if k <= l then begin
(* Note k<=l: this criterion could be much better *)
let trans', k_gapstart', k' =
try
let repltext, amount = replace_at k in
repltext @ [RRegion(k_gapstart, k-k_gapstart)] @ trans,
k + amount,
(if amount=0 then k+1 else k+amount)
with
Not_found ->
trans, k_gapstart, k+1
| Limit_exceeded ->
trans, k_gapstart, l+1
in
left_to_right trans' k_gapstart' k'
end
else
RRegion(k_gapstart, k-k_gapstart-1) :: trans
in
let with_anchors () =
try
let repltext, amount = replace_at 0 in
repltext
with
Not_found ->
[ RRegion(0, l) ]
| Limit_exceeded ->
[ RRegion(0, l) ]
in
let rec total_length n trans =
match trans with
RLit s :: trans' ->
total_length (n+String.length s) trans'
| RRegion (_,len) :: trans' ->
total_length (n+len) trans'
| [] ->
n
in
let rec form_replacement_ltor target trans j =
match trans with
RLit t :: trans' ->
let ls = String.length t in
let j' = j - ls in
if ls > 0 then String.blit t 0 target j' ls;
form_replacement_ltor target trans' j'
| RRegion (from,len) :: trans' ->
let j' = j - len in
if len > 0 then String.blit s from target j' len;
form_replacement_ltor target trans' j'
| [] -> ()
in
(* TODO: interpret rtol,
* what's with initialization of variables?
*)
let transformer =
if anchored then
with_anchors()
else
left_to_right [] 0 0
in
let length = total_length 0 transformer in
let target = String.create length in
form_replacement_ltor target transformer length;
target, !n_repl
;;
let var_matched v =
v.found
;;
let string_of_var v =
v.sref
String.sub v.sref v.from v.len
;;
let found_string_of_var v =
if v.found then v.sref else raise Not_found
if v.found then String.sub v.sref v.from v.len else raise Not_found
;;
......@@ -386,6 +597,13 @@ let mknegset s =
* History:
*
* $Log: xstr_match.ml,v $
* Revision 1.2 1999/07/04 20:02:07 gerd
* Added Lazystring, Lazystring_from.
* Added replace_matched_substring function.
* Changed the structure of 'variable'. 'sref' is either an arbitrary
* string, or it is the input string of the matching function. 'from' and
* 'len' are always used.
*
* Revision 1.1 1999/06/27 23:03:37 gerd
* Initial revision.
*
......
(* $Id: xstr_match.mli,v 1.1 1999/06/27 23:03:38 gerd Exp $
(* $Id: xstr_match.mli,v 1.2 1999/07/04 20:02:08 gerd Exp $
* ----------------------------------------------------------------------
* Matching strings
*)
(* Copyright 1999 by Gerd Stolpmann *)
type variable
(* A 'variable' can record matched regions *)
......@@ -12,8 +14,10 @@ type charset
type matcher =
Literal of string
| Anystring
| Lazystring
| Anychar
| Anystring_from of charset
| Lazystring_from of charset
| Anychar_from of charset
| Nullstring
| Alternative of matcher list list
......@@ -22,20 +26,22 @@ type matcher =
| Scanner of (string -> int)
;;
(* Literal s: matches literally s and nothing else
* Anystring: matches a string of arbitrary length with arbitrary
* contents
* Anychar: matches an arbitrary character
* Anystring_from s: matches a string of arbitrary length with characters
* from charset s
* Anychar_from s: matches a character from charset s
* Nullstring: matches the empty string
(* Literal s: matches literally s and nothing else
* Anystring/Lazystring matches a string of arbitrary length with arbitrary
* contents
* Anystring_from s/
* Lazystring_from s matches a string of arbitrary length with characters
* from charset s
* Anychar: matches an arbitrary character
* Anychar_from s: matches a character from charset s
* Nullstring: matches the empty string
* Alternative
* [ ml1; ml2; ... ]
* first tries the sequence ml1, then ml2, and so on
* until one of the sequences leads to a match of the
* whole string
* Optional ml: first tries the sequence ml, them the empty string.
* = Alternative [ml; [Nullstring]]
* Record (v, ml): matches the same as ml, but the region of the string
* is recorded in v
* Scanner f: f s is called where s is the rest to match. The function
......@@ -57,15 +63,60 @@ val match_string : matcher list -> string -> bool
* Notes:
* - Anystring and Anystring_from are "greedy"; they try to match as much
* as possible.
* - In contrast to this, Lazystring and Lazystring_from are "lazy"; they
* try to match as few as possible.
* - Alternatives are tested from left to right.
* - Options are first tested with argument, then with the empty string.
* - Options are first tested with argument, then with the empty string
* (i.e. "greedy")
*)
type replacer =
ReplaceLiteral of string
| ReplaceVar of variable
| ReplaceFunction of (unit -> string)
;;
type rflag =
Anchored
| Limit of int
(* | RightToLeft *)
;;
val replace_matched_substrings : matcher list -> replacer list -> rflag list
-> string -> (string * int)
(* replace_matched_substrings ml rl fl s:
*
* All substrings of 's' are matched against 'ml' in turn, and all
* non-overlapping matchings are replaced according 'rl'. The standard
* behaviour is to test from left to right, and to replace all occurences
* of substrings.
* This can be modified by 'fl':
* - Anchored: Not the substrings of 's', but only 's' itself is
* matched against 'ml'.
* - Limit n: At most 'n' replacements will be done.
* - RightToLeft: Begin with the rightmost matching; proceed with more
* left matchings (NOT YET IMPLEMENTED!!!!)
* The meaning of 'rl': Every matching is replaced by the sequence of
* the elements of 'rl'.
* - ReplaceLiteral t: Replace the string t
* - ReplaceVar v: Replace the contents of 'v' or the empty string,
* if v has no matching
* - ReplaceFunction f: Replace f(). You may raise Not_found or
* Match_failure to skip to the next matching.
* 'replace_matched_substrings' returns the number of replacements.
*)
val var : string -> variable
(* var s: creates new variable with initial value s. If this variable
* is used in a subsequent matching, and a value is found, the value
* is overwritten; otherwise the old value persists.
* - Initial vales are stored as references to strings
* - Matched values are stored as triples (s,from,len) where 's' is the
* input string of the matching function
*
* [Note thread-safety: variables must not be shared by multiple threads.]
*)
......@@ -102,11 +153,79 @@ val mknegset : string -> charset
*
* let v = var "" in
* let _ = match_string [ Literal "("; Record (v, [Anystring]); Literal ")" ]
* s in
* found_string_of_var v
* s
* in found_string_of_var v
*
* - if s is "(abc)" returns "abc"
* - if the parantheses are missing, raises Not_found
*
* VARIANT I:
*
* let v = var "" in
* let _ = match_string [ Lazystring;
* Literal "("; Record (v, [Lazystring]); Literal ")";
* Anystring ]
* s
* in found_string_of_var v
*
* - finds the first substring with parantheses, e.g.
* s = "abc(def)ghi(jkl)mno" returns "def"
*
* To get the last substring, swap Lazystring and Anystring at the beginning
* resp. end.
*
* VARIANT II:
*
* let v = var "" in
* let _ = match_string [ Lazystring;
* Literal "("; Record (v, [Anystring]); Literal ")";
* Anystring ]
* s
* in found_string_of_var v
*
* - for s = "abc(def)ghi(jkl)mno" it is returned "def)ghi(jkl"
*)
(* ---------------------------------------------------------------------- *)
(* EXAMPLE:
*
* let v = var "" in
* let digits = mkset "0-9" in
* let digits_re = [ Record(v, [ Anychar_from digits; Anystring_from digits])]
* in
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ] [] "ab012cd456fg"
*
* yields: ("abDcdDfg", 2)
*
* VARIANT I:
*
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ]
* [ Limit 1 ] "ab012cd456fg"
*
* yields: ("abDcd456fg", 1)
*
* VARIANT II:
*
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ]
* [ Anchored ] "ab012cd456fg"
*
* yields: ("ab012cd456fg", 0)
*
* VARIANT III:
*
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ]
* [ Anchored ] "012"
*
* yields: ("D", 1)
*
* VARIANT IV:
*
* let f() = string_of_int(1+int_of_string(string_of_var v)) in
* replace_matched_substrings digits_re [ ReplaceFunction f ]
* [] "ab012cd456fg"
*
* yields: ("ab13cd457fg", 2)
*)
......@@ -114,6 +233,13 @@ val mknegset : string -> charset
* History:
*
* $Log: xstr_match.mli,v $
* Revision 1.2 1999/07/04 20:02:08 gerd
* Added Lazystring, Lazystring_from.
* Added replace_matched_substring function.
* Changed the structure of 'variable'. 'sref' is either an arbitrary
* string, or it is the input string of the matching function. 'from' and
* 'len' are always used.
*
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment