Commit a7c87f31 authored by gerd

consume less memory for sorting


git-svn-id: https://gps.dynxs.de/private/svn/app-plasma/[email protected] 55289a75-7b90-4627-9e07-ffb4263930b2
parent e5fba012
......@@ -112,50 +112,67 @@ let exec_sort_task me mj (t:sort_task) =
let reader = fragment_reader c t.sort_input in
Stack.push (fun () -> reader # close_in()) cleanup;
let l = ref [] in
let n = ref 0 in
let b_size = ref 1024 in
let b_end = ref 0 in
let b_lines = ref (Array.make !b_size "") in
let b_keys = ref (Array.make !b_size "") in
let b_hashes = ref (Array.make !b_size 0) in
(* [double a null new_size] replaces the array held in the reference [a]
   with a fresh array of [new_size] elements, all initialised to [null],
   and copies every element of the previous array to the front of the new
   one.

   The number of elements to copy is taken from the old array itself, so
   the helper does not rely on the outer [b_size] still holding the old
   capacity at the time of the call.

   Raises [Invalid_argument] (from [Array.blit]) if [new_size] is smaller
   than the length of the current array; in that case [a] is left
   unchanged. *)
let double a null new_size =
  let old_a = !a in
  let new_a = Array.make new_size null in
  Array.blit old_a 0 new_a 0 (Array.length old_a);
  a := new_a
in
( try
while true do
let line = reader # input_record() in
let key = mj # extract_key me line in
let key = mj # extract_key me line in
let hash = Hashtbl.hash key in
l := (hash,key,line) :: !l;
incr n
if !b_end = !b_size then (
let new_size = 2 * !b_size in
double b_lines "" new_size;
double b_keys "" new_size;
double b_hashes 0 new_size;
b_size := new_size
);
!b_lines. ( !b_end ) <- line;
!b_keys. ( !b_end ) <- key;
!b_hashes.( !b_end ) <- hash;
incr b_end
done
with
| End_of_file -> ()
);
let a = Array.make !n (0,"","") in
let k = ref (!n-1) in
while !l <> [] do
a.( !k ) <- List.hd !l;
l := List.tl !l;
decr k
done;
assert(!k = (-1));
let index_array = Array.init !b_end (fun k -> k) in
(* Note that stable_sort is a merge sort, i.e. needs another array
as workspace. Don't see this as a major problem - the storage for
the strings is shared with the original array.
as workspace
*)
Array.stable_sort
(fun (h1,k1,l1) (h2,k2,l2) ->
(fun i1 i2 ->
let h1 = Array.unsafe_get !b_hashes i1 in
let h2 = Array.unsafe_get !b_hashes i2 in
if h1 = h2 then
let k1 = Array.unsafe_get !b_keys i1 in
let k2 = Array.unsafe_get !b_keys i2 in
String.compare k1 k2
else
h1 - h2 (* h1, h2 are both positive *)
)
a;
index_array;
let writer =
Mapred_io.write_file c t.sort_output in
Stack.push (fun () -> writer # close_out()) cleanup;
Array.iter
(fun (_,_,l) ->
writer # output_record l
(fun i ->
let line = !b_lines.(i) in
writer # output_record line
)
a;
index_array;
writer # close_out();
Netlog.logf `Debug "Done sort %s"
(Mapred_tasks.string_of_task_id (`Sort t));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment