Commit f09a2adb authored by gerd's avatar gerd

Initial revision.


git-svn-id: https://godirepo.camlcity.org/svn/lib-xstr/[email protected] e4cd5726-45db-0310-8eb3-84e3bb775810
parents
(* $Id: xstr_match.ml,v 1.1 1999/06/27 23:03:37 gerd Exp $
* ----------------------------------------------------------------------
* String matching
*)
type variable =
{ mutable sref : string;
mutable found : bool;
mutable begun : bool;
mutable from : int;
mutable len : int
}
;;
type charset = int array;;
type matcher =
Literal of string
| Anystring
| Anychar
| Anystring_from of charset
| Anychar_from of charset
| Nullstring
| Alternative of matcher list list
| Optional of matcher list
| Record of (variable * matcher list)
| Scanner of (string -> int)
;;
(**********************************************************************)
(* operations on sets *)
(* copied from the JavaCaml regexp implementation *)
let the_full_set = Array.create 16 0xffff;;
let the_empty_set = Array.create 16 0;;
let dup_set s =
Array.copy s
;;
let empty_set () =
the_empty_set
;;
let full_set () =
the_full_set
;;
let ( +! ) a b =
(* union *)
let r = Array.create 16 0 in
for i=0 to 15 do
r.(i) <- a.(i) lor b.(i)
done;
r
;;
let ( *! ) a b =
(* intersection *)
let r = Array.create 16 0 in
for i=0 to 15 do
r.(i) <- a.(i) land b.(i)
done;
r
;;
let ( !! ) a =
(* negation *)
let r = Array.create 16 0 in
for i=0 to 15 do
r.(i) <- a.(i) lxor 0xffff
done;
r
;;
let ( ?! ) a =
(* not null? *)
let n = ref 0 in
for i=0 to 15 do
n := !n lor a.(i)
done;
!n <> 0
;;
let set_include a n =
(* include in set -- this is in-place modification! *)
a.( n lsr 4 ) <- a.( n lsr 4 ) lor (1 lsl (n land 15))
;;
let set_exclude a n =
(* exclude from set -- this is in-place modification! *)
a.( n lsr 4 ) <- a.( n lsr 4 ) land ((1 lsl (n land 15)) lxor 0xffff)
;;
let member_of_set n a =
(a.( n lsr 4 ) land (1 lsl (n land 15))) <> 0
;;
let word_set() =
let a = dup_set (empty_set()) in
List.iter
(fun c ->
set_include a (Char.code c))
[ 'a'; 'b'; 'c'; 'd'; 'e'; 'f'; 'g'; 'h'; 'i'; 'j'; 'k'; 'l'; 'm';
'n'; 'o'; 'p'; 'q'; 'r'; 's'; 't'; 'u'; 'v'; 'w'; 'x'; 'y'; 'z';
'A'; 'B'; 'C'; 'D'; 'E'; 'F'; 'G'; 'H'; 'I'; 'J'; 'K'; 'L'; 'M';
'N'; 'O'; 'P'; 'Q'; 'R'; 'S'; 'T'; 'U'; 'V'; 'W'; 'X'; 'Y'; 'Z';
'0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9';
'_' ];
a
;;
let noword_set() =
let a = word_set() in
!! a
;;
let set_as_string set =
let s = String.make 32 ' ' in
for i = 0 to 15 do
s.[i+i] <- Char.chr (set.(i) land 0xff);
s.[i+i+1] <- Char.chr (set.(i) lsr 8);
done;
s
;;
(**********************************************************************)
let match_string ml s =
let len = String.length s in
let rec reset ml =
match ml with
[] -> ()
| Alternative alts :: ml' ->
List.iter reset alts;
reset ml'
| Optional opt :: ml' ->
reset opt;
reset ml'
| Record (v,r) :: ml' ->
v.found <- false;
v.begun <- false;
reset ml'
| _ :: ml' ->
reset ml'
in
let rec run k ml recs =
(* returns the number of matched characters or raises Not_found.
* 'k': position in s
* 'ml': matcher list to process
* 'recs': recorded sections
*)
match ml with
[] ->
if k = len then
0, recs
else
raise Not_found
| Literal x :: ml' ->
let xlen = String.length x in
if k + xlen <= len & String.sub s k xlen = x then
let n, recs' = run (k+xlen) ml' recs in
n + xlen, recs'
else
raise Not_found
| Anystring :: ml' ->
let rec find n =
if n >= 0 then
try
let n', recs' = run (k+n) ml' recs in
n + n', recs'
with
Not_found -> find (n-1)
else
raise Not_found
in
find (len-k)
| Anystring_from set :: ml' ->
let rec find n =
if n >= 0 then
try
let n', recs' = run (k+n) ml' recs in
n + n', recs'
with
Not_found -> find (n-1)
else
raise Not_found
in
let rec region n =
if k+n < len then
let c = Char.code (s.[k+n]) in
if member_of_set c set then
region (n+1)
else
n
else
n
in
find (region 0)
| Anychar :: ml' ->
if k < len then
let n',recs' = run (k+1) ml' recs in
1 + n', recs'
else
raise Not_found
| Anychar_from set :: ml' ->
if k < len then
let c = Char.code (s.[k]) in
if member_of_set c set then
let n',recs' = run (k+1) ml' recs in
1 + n', recs'
else
raise Not_found
else
raise Not_found
| Nullstring :: ml' ->
run k ml' recs
| Alternative alts :: ml' ->
let rec find alts =
match alts with
[] -> raise Not_found
| alt :: alts' ->
begin try
run k (alt @ ml') recs
with
Not_found -> find alts'
end
in
find alts
| Optional opt :: ml' ->
begin try
run k (opt @ ml') recs
with
Not_found ->
run k ml' recs
end
| Record (r, recorded) :: ml' ->
if r.found then
failwith "string_match: the same variable matches twice";
if r.begun then begin (* ==> recorded = [] *)
let n, rec' = run k ml' recs in
r.found <- true;
r.len <- k - r.from;
n, r :: rec'
end
else begin
r.begun <- true;
r.from <- k;
run k (recorded @ (Record(r,[]) :: ml')) recs
end
| Scanner f :: ml' ->
let n = f (String.sub s k (len-k)) in
if k+n > len then
failwith "match";
let n', recs' = run (k+n) ml' recs in
n+n',recs'
in
try
reset ml;
let _, recs = run 0 ml [] in
List.iter
(fun r ->
if r.found then
r.sref <- String.sub s r.from r.len)
recs;
true
with
Not_found ->
false
;;
let var s =
{ sref = s; found = false; begun = false; from = 0; len = 0 }
;;
let var_matched v =
v.found
;;
let string_of_var v =
v.sref
;;
let found_string_of_var v =
if v.found then v.sref else raise Not_found
;;
let mkset s =
let l = String.length s in
let k = ref (-1) in
let c = ref ' ' in
let next_noesc() =
incr k;
if ( !k < l ) then begin
c := s.[ !k ];
end
in
let set = dup_set (empty_set()) in
let add_char c =
let code = Char.code c in
set_include set code
in
let add_range c1 c2 =
let code1 = Char.code c1 in
let code2 = Char.code c2 in
for i = code1 to code2 do
set_include set i
done
in
let continue = ref true in
let first = ref true in (* the character after [ or [^ ? *)
while !continue & !k < l do
match () with
() when !c = '[' & !k + 1 < l & s.[!k + 1] = ':' ->
failwith "mkset: Character classes such as [[:digit:]] not implemented";
(* TODO: check for predefined sets *)
| () when (!k + 2 < l) & (s.[!k + 1] = '-') & (s.[!k + 2] <> ']') ->
(* range *)
add_range !c (s.[!k + 2]);
next_noesc();
next_noesc();
next_noesc();
first := false;
| () ->
add_char !c;
next_noesc();
first := false;
done;
set
;;
let mknegset s =
!! (mkset s)
;;
(* ======================================================================
* History:
*
* $Log: xstr_match.ml,v $
* Revision 1.1 1999/06/27 23:03:37 gerd
* Initial revision.
*
*
*)
(* $Id: xstr_match.mli,v 1.1 1999/06/27 23:03:38 gerd Exp $
* ----------------------------------------------------------------------
* Matching strings
*)
type variable
(* A 'variable' can record matched regions *)
type charset
(* sets of characters *)
type matcher =
Literal of string
| Anystring
| Anychar
| Anystring_from of charset
| Anychar_from of charset
| Nullstring
| Alternative of matcher list list
| Optional of matcher list
| Record of (variable * matcher list)
| Scanner of (string -> int)
;;
(* Literal s: matches literally s and nothing else
* Anystring: matches a string of arbitrary length with arbitrary
* contents
* Anychar: matches an arbitrary character
* Anystring_from s: matches a string of arbitrary length with characters
* from charset s
* Anychar_from s: matches a character from charset s
* Nullstring: matches the empty string
* Alternative
* [ ml1; ml2; ... ]
* first tries the sequence ml1, then ml2, and so on
* until one of the sequences leads to a match of the
* whole string
* Optional ml: first tries the sequence ml, them the empty string.
* Record (v, ml): matches the same as ml, but the region of the string
* is recorded in v
* Scanner f: f s is called where s is the rest to match. The function
* should return the number of characters it can match,
* or raise Not_found
*)
val match_string : matcher list -> string -> bool
(* match_string ml s:
* Tries to match 'ml' against the string 's'; returns true on success, and
* false otherwise.
* As side-effect, the variables in 'ml' are set.
* Matching proceeds from left to right, and for some of the matchers there
* are particular matching orders. The first match that is found using
* this order is returned (i.e. the variables get their values from this
* match).
* Notes:
* - Anystring and Anystring_from are "greedy"; they try to match as much
* as possible.
* - Alternatives are tested from left to right.
* - Options are first tested with argument, then with the empty string.
*)
val var : string -> variable
(* var s: creates new variable with initial value s. If this variable
* is used in a subsequent matching, and a value is found, the value
* is overwritten; otherwise the old value persists.
*
* [Note thread-safety: variables must not be shared by multiple threads.]
*)
val var_matched : variable -> bool
(* returns true if the variable matched a value in the last match_string *)
val string_of_var : variable -> string
(* returns the current value of the variable *)
val found_string_of_var : variable -> string
(* returns the current value of the variable only if there was a match
* for this variable in the last match_string; otherwise raise Not_found
*)
val mkset : string -> charset
(* creates a set from readable description. The string simply enumerates
* the characters of the set, and the notation "x-y" is possible, too.
* To include '-' in the set, put it at the beginning or end.
*)
val mknegset : string -> charset
(* creates the complement that mkset would create *)
(* ---------------------------------------------------------------------- *)
(* EXAMPLE:
*
* let v = var "" in
* let _ = match_string [ Literal "("; Record (v, [Anystring]); Literal ")" ]
* s in
* found_string_of_var v
*
* - if s is "(abc)" returns "abc"
* - if the parantheses are missing, raises Not_found
*)
(* ======================================================================
* History:
*
* $Log: xstr_match.mli,v $
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
*
*)
(* $Id: xstr_search.ml,v 1.1 1999/06/27 23:03:38 gerd Exp $
* ----------------------------------------------------------------------
* Search & Replace
*)
exception Replace_phrase of (int * string);;
let index_of_substring_from s k_left substr =
let l = String.length s in
let lsub = String.length substr in
let k_right = l - lsub in
let c = if substr <> "" then substr.[0] else ' ' in
let rec search k =
if k <= k_right then begin
if String.sub s k lsub = substr then
k
else
let k_next = String.index_from s (k+1) c in
search k_next
end
else raise Not_found
in
if substr = "" then k_left else search k_left
;;
let rindex_of_substring_from s k_right substr =
let l = String.length s in
let lsub = String.length substr in
let c = if substr <> "" then substr.[0] else ' ' in
let rec search k =
if k >= 0 then begin
if String.sub s k lsub = substr then
k
else
let k_next = String.rindex_from s (k-1) c in
search k_next
end
else raise Not_found
in
if substr = "" then k_right else search k_right
;;
let index_of_substring s substr =
index_of_substring_from s 0 substr;;
let rindex_of_substring s substr =
rindex_of_substring_from s (String.length s - String.length substr) substr;;
let contains_substring s substr =
try
let _ = index_of_substring s substr in true
with
Not_found -> false
;;
let contains_substring_from s k_left substr =
try
let _ = index_of_substring_from s k_left substr in true
with
Not_found -> false
;;
let rcontains_substring_from s k_right substr =
try
let _ = rindex_of_substring_from s k_right substr in true
with
Not_found -> false
;;
let indexlist_of_substring s substr =
let rec enumerate k =
try
let pos = index_of_substring_from s k substr in
pos :: enumerate (pos+1)
with
Not_found -> []
in
enumerate 0
;;
let rev_concat sep sl =
(* = String.concat sep (List.rev sl), but more efficient *)
let lsep = String.length sep in
let rec get_len v sl =
match sl with
[] -> v
| s :: sl' ->
get_len (v + lsep + String.length s) sl'
in
let len =
if sl = [] then 0 else get_len 0 sl - lsep in
let t = String.create len in
let rec fill_in k sl =
match sl with
[] -> ()
| [ s ] ->
let s_len = String.length s in
String.blit s 0 t (k-s_len) s_len
| s :: sl' ->
let s_len = String.length s in
let k' = k - s_len in
let k'' = k' - lsep in
String.blit s 0 t k' s_len;
String.blit sep 0 t k'' lsep;
fill_in k'' sl'
in
fill_in len sl;
t
;;
let replace_char s rule =
let l = String.length s in
let rec replace coll k_last k =
if k < l then begin
let c = s.[k] in
try
let s' = rule c k in
raise (Replace_phrase (1,s'))
(* Alternatively, we could directly invoke 'replace' with some
* parameters. But this would be a true recursion, without the
* chance to be eliminated.
* Would lead to Stack_overflow for large strings.
*)
with
Match_failure(_,_,_) ->
replace coll k_last (k+1)
| Not_found ->
replace coll k_last (k+1)
| Replace_phrase (length, s') ->
replace (s' :: String.sub s k_last (k-k_last) :: coll) (k+length) (k+length)
end
else
String.sub s k_last (k-k_last) :: coll
in
rev_concat "" (replace [] 0 0)
;;
let replace_substring s substrlist rule =
let characters =
(List.map
(fun substr ->
if substr = "" then
failwith "replace_substring"
else
substr.[0])
substrlist) in
let l = String.length s in
let rec find k sl =
match sl with
[] -> raise Not_found
| sub :: sl' ->
let lsub = String.length sub in
if k <= l - lsub & String.sub s k lsub = sub then
let replacement = rule sub k in
raise (Replace_phrase(lsub, replacement))
else
raise Not_found
in
let rule' c k =
if List.mem c characters then
find k substrlist
else
raise Not_found
in
let rule'' c0 c k =
if c = c0 then find k substrlist else raise Not_found in
if List.length substrlist = 1 then
replace_char s (rule'' (List.hd substrlist).[0])
else
replace_char s rule'
;;
(* ======================================================================
* History:
*
* $Log: xstr_search.ml,v $
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
*
*)
(* $Id: xstr_search.mli,v 1.1 1999/06/27 23:03:38 gerd Exp $
* ----------------------------------------------------------------------
* Search & Replace
*)
exception Replace_phrase of (int * string);;
(* see 'replace_char' and 'replace_string' *)
val index_of_substring_from : string -> int -> string -> int
(* index_of_substring_from s k_left substr:
* finds the leftmost index >= k_left where 'substr' occurs within s
* or raises Not_found.
*)
val rindex_of_substring_from : string -> int -> string -> int
(* eindex_of_substring_from s k_right substr:
* finds the rightmost index <= k_right where 'substr' occurs within s
* or raises Not_found.
*)
val index_of_substring : string -> string -> int
(* index_of_substring s substr:
* finds the leftmost index where 'substr' occurs within s
* or raises Not_found.
*)
val rindex_of_substring : string -> string -> int
(* eindex_of_substring s substr:
* finds the rightmost index where 'substr' occurs within s
* or raises Not_found.
*)