Commit bcbe4ed2 authored by gerd's avatar gerd

Tried to optimize the function; but currently without success.

There should be a deeper analysis -- on the other hand, splitting seems
to be relatively fast compared with the Str splitting function.
Perhaps the improvements have an effect on machines with bigger caches.


git-svn-id: https://godirepo.camlcity.org/svn/lib-xstr/[email protected] e4cd5726-45db-0310-8eb3-84e3bb775810
parent ff63cd53
(* $Id: xstr_split.ml,v 1.1 1999/06/27 23:03:38 gerd Exp $
(* $Id: xstr_split.ml,v 1.2 1999/07/06 21:32:09 gerd Exp $
* ----------------------------------------------------------------------
*
*)
......@@ -60,87 +60,116 @@ let split_string ignoreset ignoreleft ignoreright separators (* s *) =
in
fun s ->
let l = String.length s in
let rec split i_wordbeg i_wordend i_current =
let rec split_over_word i_wordbeg i_wordend i_current =
(* i_wordbeg <= i_wordend: i_current has not yet reached the next
* separator. i_wordbeg is the position of
* the first CData character, i_wordend the
* position after the last CData Character of
* the word.
* i_wordbeg > i_wordend: i_current is just after the separator and
* searches the next word beginning
*)
if i_current < l then begin
if i_wordbeg <= i_wordend then begin
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData -> split i_wordbeg (i_current+1) (i_current+1)
| CIgnore -> split i_wordbeg i_wordend (i_current+1)
| _ ->
let rec find_sep sepl =
match sepl with
[] ->
if cl = CSeparator then (* just as CData *)
split i_wordbeg (i_current+1) (i_current+1)
else (* just as CIgnore *)
split i_wordbeg i_wordend (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep <= l &
String.sub s i_current lsep = sep then
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData -> (* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
| CIgnore -> split_over_word i_wordbeg i_wordend (i_current+1)
| _ ->
let rec find_sep sepl =
match sepl with
[] ->
if cl = CSeparator then (* just as CData *)
(* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
else (* just as CIgnore *)
split_over_word i_wordbeg i_wordend (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep <= l &
String.sub s i_current lsep = sep then
(* found separator *)
String.sub s i_wordbeg (i_wordend - i_wordbeg) ::
split (i_current + lsep) i_wordend (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else begin
(* i_wordbeg > i_wordend *)
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData ->
split i_current (i_current+1) (i_current+1)
| (CIgnore|CIgnoreOrSeparator) ->
split i_wordbeg i_wordend (i_current+1)
| CSeparator ->
let rec find_sep sepl =
match sepl with
[] ->
split i_wordbeg (i_current+1) (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep < l &
String.sub s i_current lsep = sep then
String.sub s i_wordbeg (i_wordend - i_wordbeg) ::
split_after_word (i_current + lsep) (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else
(* i_current >= l *)
if ignoreright then
[ String.sub s i_wordbeg (i_wordend - i_wordbeg) ]
else
[ String.sub s i_wordbeg (i_current - i_wordbeg) ]
and split_after_word i_wordbeg i_current =
(* i_wordbeg > i_wordend: i_current is just after the separator and
* searches the next word beginning
*)
if i_current < l then begin
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData ->
(* split i_current (i_current+1) (i_current+1) *)
fast_skip_word i_current (i_current+1)
| (CIgnore|CIgnoreOrSeparator) ->
split_after_word i_wordbeg (i_current+1)
| CSeparator ->
let rec find_sep sepl =
match sepl with
[] ->
(* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep < l &
String.sub s i_current lsep = sep then
(* found separator *)
"" ::
split (i_current + lsep) i_wordend (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
"" ::
split_after_word (i_current + lsep) (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else
(* i_current >= l *)
if i_wordbeg <= i_wordend then begin
if ignoreright then
[ String.sub s i_wordbeg (i_wordend - i_wordbeg) ]
else
[ String.sub s i_wordbeg (i_current - i_wordbeg) ]
end
if i_wordbeg = 0 then
[] (* not any word found *)
else
if i_wordend = -1 then
[] (* not any word found *)
else
[ "" ]
[ "" ]
(* Now some frequent special cases *)
and fast_skip_word i_wordbeg i_current =
(* i_wordbeg <= i_current = i_wordend *)
if i_current < l-1 then begin
let code1 = Char.code (s.[i_current]) in
let cl1 = character_classification.(code1) in
match cl1 with
CData ->
begin
let code2 = Char.code (s.[i_current+1]) in
let cl2 = character_classification.(code2) in
match cl2 with
CData -> fast_skip_word i_wordbeg (i_current+2)
| CIgnore -> split_over_word i_wordbeg (i_current+1) (i_current+2)
| _ -> (* continue with the general routine *)
split_over_word i_wordbeg (i_current+1) (i_current+1)
end
| CIgnore -> split_over_word i_wordbeg i_current (i_current+1)
| _ -> (* continue with the general routine *)
split_over_word i_wordbeg i_current i_current
end
else split_over_word i_wordbeg i_current i_current
in
if ignoreleft then
split 0 (-1) 0
split_after_word 0 0
else
split 0 0 0
split_over_word 0 0 0
;;
......@@ -150,6 +179,12 @@ let split_string ignoreset ignoreleft ignoreright separators (* s *) =
* History:
*
* $Log: xstr_split.ml,v $
* Revision 1.2 1999/07/06 21:32:09 gerd
* Tried to optimize the function; but currently without success.
* There should be a deeper analysis -- on the other hand, splitting seems
* to be relatively fast compared with the Str splitting function.
* Perhaps the improvements have an effect on machines with bigger caches.
*
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment