Commit 672b21e2 authored by gerd's avatar gerd

Renamed:

	Types -> Uni_types
	Parser -> Uni_parser
	Lexer -> Uni_lexer
	File -> Lexpp_file


git-svn-id: https://godirepo.camlcity.org/svn/lib-pxp/trunk@665 dbe99aee-44db-0310-b2b3-d33182c8eb97
parent f4f8293c
......@@ -3,25 +3,30 @@ include $(TOP_DIR)/Makefile.rules
all: lexpp$(EXEC_SUFFIX)
DEPOBJS = file.ml lexer.ml parser.ml parser.mli types.ml ucs2_to_utf8.ml \
main.ml
DEPOBJS = lexpp_file.ml uni_lexer.ml uni_parser.ml uni_parser.mli \
uni_types.ml ucs2_to_utf8.ml main.ml
OBJS = types.cmo lexer.cmo parser.cmo file.cmo ucs2_to_utf8.cmo main.cmo
OBJS = uni_types.cmo uni_lexer.cmo uni_parser.cmo lexpp_file.cmo \
mll_lexer.cmo \
ucs2_to_utf8.cmo main.cmo
PACKAGES=netstring
lexer.ml:
$(OCAMLLEX) lexer.mll
uni_lexer.ml:
$(OCAMLLEX) uni_lexer.mll
parser.ml:
$(OCAMLYACC) parser.mly
mll_lexer.ml:
$(OCAMLLEX) mll_lexer.mll
parser.mli:
$(OCAMLYACC) parser.mly
uni_parser.ml:
$(OCAMLYACC) uni_parser.mly
uni_parser.mli:
$(OCAMLYACC) uni_parser.mly
depend: $(DEPOBJS)
$(OCAMLDEP) $(DEPOBJS) > depend
lexpp$(EXEC_SUFFIX): $(OBJS)
$(OCAMLC) -o lexpp$(EXEC_SUFFIX) -linkpkg $(OBJS)
......@@ -29,7 +34,8 @@ lexpp.cma: $(OBJS)
$(OCAMLC) -o lexpp.cma -a $(OBJS)
clean:
rm -f $(CLEAN_LIST) lexer.ml parser.ml parser.mli \
rm -f $(CLEAN_LIST) uni_lexer.ml uni_parser.ml uni_parser.mli \
mll_lexer.ml \
lexpp$(EXEC_SUFFIX) depend
CLEAN: clean
......
(* $Id$
* ----------------------------------------------------------------------
*
*)
open Printf;;
let section_re =
Netstring_str.regexp "^[(][*][ \t]*\\[\\([A-Za-z0-9_-]+\\)\\][ \t]*[*][)]";;
let read_sections filename =
let f = open_in filename in
printf "[reading %s]\n" filename; flush stdout;
let current_section = ref None in
let current_data = Buffer.create 1000 in
let sections = ref [] in
let save_section() =
match !current_section with
None -> ()
| Some s ->
sections := (s, Buffer.contents current_data) :: !sections;
current_section := None;
in
try
while true do
let line = input_line f in
match Netstring_str.string_match section_re line 0 with
Some mtch ->
let section_name = Netstring_str.matched_group mtch 1 line in
(* save old section: *)
save_section();
(* begin new section: *)
current_section := Some section_name;
Buffer.clear current_data;
| None ->
Buffer.add_string current_data line;
Buffer.add_char current_data '\n';
done;
assert false
with
End_of_file ->
close_in f;
save_section();
List.rev !sections
;;
let parse_char_classes s =
Uni_parser.main Uni_lexer.token (Lexing.from_string s)
;;
(* The following printing functions have originally been written by Claudio
* Sacerdoti Coen.
*)
(* padded_string_of_int i returns the string representing the *)
(* integer i (i < 256) using exactly 3 digits (example: 13 -> "013") *)
let padded_string_of_int i =
if i < 10 then
"00" ^ string_of_int i
else if i < 100 then
"0" ^ string_of_int i
else
string_of_int i
;;
(* Two functions useful to print a definition *)
let rec print_disjunction ?(first = true) out =
function
[] ->
if first then output_string out " ['b'-'a' (*empty*) ] "
| he::tl ->
if not first then output_string out " | " ;
print_re out he ;
print_disjunction ~first:false out tl
and print_re out =
function
Uni_types.Char i -> output_string out ("'\\" ^ padded_string_of_int i ^ "'")
| Uni_types.Interval (l,u) ->
output_string out ("['\\" ^ padded_string_of_int l ^ "'-'\\" ^
padded_string_of_int u ^ "']")
| Uni_types.Identifier i -> output_string out i
| Uni_types.Concat rell ->
let foo rel =
if List.length rel > 1 then
(output_string out "(" ; print_disjunction out rel ;
output_string out ")")
else
print_disjunction out rel
in
List.iter foo rell
;;
(* print_definition prints a definition in the format expected by ocamllex *)
let print_definition out { Uni_types.id = id ; Uni_types.rel = rel } =
output_string out ("let " ^ id ^ " =\n ") ;
print_disjunction out rel ;
output_string out "\n\n"
;;
......@@ -4,7 +4,7 @@
*)
open Types
open Uni_types
open Ucs2_to_utf8
open Printf
......@@ -22,12 +22,12 @@ type config =
type char_classes =
CC_generic of (definition list * string)
(* regexp definitions, section LET *)
| CC_wlex of (string * string)
| CC_wlex of (string * string)
(* section CLASSES, section LETS *)
let get_char_classes cfg =
let filename = cfg.char_classes_file in
let cc_sections = File.read_sections filename in
let cc_sections = Lexpp_file.read_sections filename in
let type_generic = List.mem_assoc "TYPE_GENERIC" cc_sections in
let type_wlex = List.mem_assoc "TYPE_WLEX" cc_sections in
if not type_generic && not type_wlex then
......@@ -38,19 +38,19 @@ let get_char_classes cfg =
let let_section =
try List.assoc "LET" cc_sections
with Not_found -> failwith ("File " ^ filename ^ ": no LET section") in
if type_generic then begin
let let_unicode_section =
try List.assoc "LET_UNICODE" cc_sections
with Not_found -> failwith ("File " ^ filename ^ ": no LET_UNICODE section") in
CC_generic(File.parse_char_classes let_unicode_section, let_section);
CC_generic(Lexpp_file.parse_char_classes let_unicode_section, let_section);
end
else begin
assert(type_wlex);
let classes_section =
try List.assoc "CLASSES" cc_sections
with Not_found -> failwith ("File " ^ filename ^ ": no CLASSES section")
with Not_found -> failwith ("File " ^ filename ^ ": no CLASSES section")
in
let let_section =
try List.assoc "LET" cc_sections
......@@ -68,11 +68,11 @@ let recode_char_classes_8bit cfg cc =
try
let s = Netconversion.makechar out_enc n in (* or Not_found *)
if String.length s <> 1 then
failwith("Character " ^ string_of_int n ^
failwith("Character " ^ string_of_int n ^
" has a multibyte representation");
Some(Char.code s.[0])
with
Not_found ->
Not_found ->
None
in
......@@ -103,7 +103,7 @@ let recode_char_classes_8bit cfg cc =
let rec recode_regexp re =
(* recodes a regexp to a regexp list *)
match re with
Char n ->
Char n ->
( match recode_char n with
Some p -> [Char p]
| None -> []
......@@ -134,12 +134,12 @@ let recode_char_classes_8bit cfg cc =
let recode_char_classes cfg cc =
(* FUTURE: Use Netconversion.is_ascii_compatible and is_single_byte *)
printf
"[Recoding character classes to %s]\n"
printf
"[Recoding character classes to %s]\n"
(Netconversion.string_of_encoding cfg.encoding);
flush stdout;
match cfg.encoding with
`Enc_utf8 ->
`Enc_utf8 ->
( match cc with
CC_generic(defs, let_section) ->
CC_generic(List.map ucs2_to_utf8 defs, let_section)
......@@ -168,7 +168,7 @@ let name_of_rule rule_str =
"*)" :: l' -> l'
| _ :: l' -> skip_comments l'
| [] -> failwith ("Unfinished comment in: " ^ rule_str)
and find_first_word l =
and find_first_word l =
match l with
"(*" :: l' -> find_first_word (skip_comments l')
| w :: l' -> w
......@@ -200,22 +200,22 @@ let open_out_ann name =
printf "[writing %s]\n" name;
flush stdout;
let f = open_out name in
output_string f
output_string f
"(* THIS FILE IS GENERATED BY LEXPP. DO NOT EDIT MANUALLY! *)\n\n";
f
;;
let write_output_files cfg cc =
let lex_src = File.read_sections cfg.lex_src_file in
let link_src = File.read_sections cfg.link_src_file in
let lex_src = Lexpp_file.read_sections cfg.lex_src_file in
let link_src = Lexpp_file.read_sections cfg.link_src_file in
let write_header out =
match cc with
CC_generic(defs,let_str) ->
if List.mem_assoc "HEADER" lex_src then
output_string out (List.assoc "HEADER" lex_src);
List.iter (File.print_definition out) defs;
List.iter (Lexpp_file.print_definition out) defs;
output_string out let_str;
if List.mem_assoc "LET" lex_src then
output_string out (List.assoc "LET" lex_src);
......@@ -275,7 +275,7 @@ let write_output_files cfg cc =
module_of_rule;
*)
let link_str =
let link_str =
try List.assoc "LINK" link_src
with Not_found ->
failwith ("Section LINK is missing in " ^ cfg.link_src_file) in
......@@ -284,7 +284,7 @@ let write_output_files cfg cc =
if name = "encoding" then
cfg.encoding_name
else
let filename =
let filename =
try Hashtbl.find module_of_rule name
with Not_found ->
failwith ("No such rule: " ^ name)
......@@ -300,7 +300,7 @@ let write_output_files cfg cc =
let main() =
let cfg =
let cfg =
{ char_classes_file = "cc.def";
encoding = `Enc_iso88591;
encoding_name = "iso88591";
......@@ -319,7 +319,7 @@ let main() =
"<file> The name of the lex source input file";
"-linksrc", Arg.String (fun s -> cfg.link_src_file <- s),
"<file> The name of the link source input file";
"-encoding", Arg.String (fun s ->
"-encoding", Arg.String (fun s ->
cfg.encoding <-
Netconversion.encoding_of_string s;
cfg.encoding_name <- s;
......@@ -334,7 +334,7 @@ let main() =
]
(fun _ -> raise(Arg.Bad("Bad usage!")))
"usage: lexpp <options>";
let cc = get_char_classes cfg in
let cc' = recode_char_classes cfg cc in
write_output_files cfg cc'
......
......@@ -22,22 +22,22 @@ exception InvalidInterval of int * int;;
let char_ucs2_to_utf8 =
function
n when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs
| n when n <= 0x007F -> Types.Char n
| n when n <= 0x007F -> Uni_types.Char n
| n when n <= 0x07FF ->
Types.Concat
[[Types.Char (n lsr 6 land 0b00011111 lor 0b11000000)] ;
[Types.Char (n land 0b00111111 lor 0b10000000)]]
Uni_types.Concat
[[Uni_types.Char (n lsr 6 land 0b00011111 lor 0b11000000)] ;
[Uni_types.Char (n land 0b00111111 lor 0b10000000)]]
| n when n <= 0xffff ->
Types.Concat
[[Types.Char (n lsr 12 land 0b00001111 lor 0b11100000)] ;
[Types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ;
[Types.Char (n land 0b00111111 lor 0b10000000)]]
Uni_types.Concat
[[Uni_types.Char (n lsr 12 land 0b00001111 lor 0b11100000)] ;
[Uni_types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ;
[Uni_types.Char (n land 0b00111111 lor 0b10000000)]]
| n when n <= 0x1fffff ->
Types.Concat
[[Types.Char (n lsr 18 land 0b00000111 lor 0b11110000)] ;
[Types.Char (n lsr 12 land 0b00111111 lor 0b10000000)] ;
[Types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ;
[Types.Char (n land 0b00111111 lor 0b10000000)]]
Uni_types.Concat
[[Uni_types.Char (n lsr 18 land 0b00000111 lor 0b11110000)] ;
[Uni_types.Char (n lsr 12 land 0b00111111 lor 0b10000000)] ;
[Uni_types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ;
[Uni_types.Char (n land 0b00111111 lor 0b10000000)]]
| _ ->
failwith "Code point is outside the supported range 0..0x1fffff"
;;
......@@ -72,14 +72,14 @@ let rec mklist e =
;;
let sup =
let t = Types.Char 0b10111111 in
let t = Uni_types.Char 0b10111111 in
function
1 -> t
| n -> Types.Concat (mklist [t] n)
| n -> Uni_types.Concat (mklist [t] n)
;;
let rec inf =
let b = Types.Char 0b10000000 in
let b = Uni_types.Char 0b10000000 in
function
1 -> [[b]]
| n -> mklist [b] n
......@@ -87,13 +87,13 @@ let rec inf =
let mysucc =
function
[Types.Char n] -> n + 1
[Uni_types.Char n] -> n + 1
| _ -> assert false
;;
let mypred =
function
[Types.Char n] -> n - 1
[Uni_types.Char n] -> n - 1
| _ -> assert false
;;
......@@ -101,7 +101,7 @@ let mypred =
(* whose 'length' is the same, it returns the utf8 regular expression *)
(* matching all the characters in the interval *)
let rec same_length_ucs2_to_utf8 =
let module T = Types in
let module T = Uni_types in
function
(* Trivial cases: *)
......@@ -172,10 +172,10 @@ let rec seq_ucs2_to_utf8 =
;;
(* simplify: For example,
'\224'('\160'['\128'-'\191'] |
['\161'-'\190']['\128'-'\191'] |
'\191'['\128'-'\191']) |
(* simplify: For example,
'\224'('\160'['\128'-'\191'] |
['\161'-'\190']['\128'-'\191'] |
'\191'['\128'-'\191']) |
can be simplified to
'\224' ['\160'-'\191'] ['\128'-'\191']
*)
......@@ -183,44 +183,44 @@ let rec seq_ucs2_to_utf8 =
let rec simplify_disjunction =
function
Types.Char n1 :: Types.Interval(n2,n3) :: rest when n1+1 = n2 ->
simplify_disjunction(Types.Interval(n2,n3) :: rest)
| Types.Interval(n1,n2) :: Types.Interval(n3,n4) :: rest when n2+1 = n3 ->
simplify_disjunction(Types.Interval(n1,n4) :: rest)
| Types.Interval(n1,n2) :: Types.Char n3 :: rest when n2+1 = n3 ->
simplify_disjunction(Types.Interval(n1,n3) :: rest)
| Types.Concat( [Types.Char n1] :: tail1 ) ::
Types.Concat( [Types.Interval(n2,n3)] :: tail2 ) ::
Uni_types.Char n1 :: Uni_types.Interval(n2,n3) :: rest when n1+1 = n2 ->
simplify_disjunction(Uni_types.Interval(n2,n3) :: rest)
| Uni_types.Interval(n1,n2) :: Uni_types.Interval(n3,n4) :: rest when n2+1 = n3 ->
simplify_disjunction(Uni_types.Interval(n1,n4) :: rest)
| Uni_types.Interval(n1,n2) :: Uni_types.Char n3 :: rest when n2+1 = n3 ->
simplify_disjunction(Uni_types.Interval(n1,n3) :: rest)
| Uni_types.Concat( [Uni_types.Char n1] :: tail1 ) ::
Uni_types.Concat( [Uni_types.Interval(n2,n3)] :: tail2 ) ::
rest when n1+1 = n2 && tail1 = tail2 ->
simplify_disjunction(
Types.Concat( [Types.Interval(n1,n3)] :: tail1 ) :: rest)
| Types.Concat( [Types.Interval(n1,n2)] :: tail1 ) ::
Types.Concat( [Types.Interval(n3,n4)] :: tail2 ) ::
simplify_disjunction(
Uni_types.Concat( [Uni_types.Interval(n1,n3)] :: tail1 ) :: rest)
| Uni_types.Concat( [Uni_types.Interval(n1,n2)] :: tail1 ) ::
Uni_types.Concat( [Uni_types.Interval(n3,n4)] :: tail2 ) ::
rest when n2+1 = n3 && tail1 = tail2 ->
simplify_disjunction(
Types.Concat( [Types.Interval(n1,n4)] :: tail1 ) :: rest)
| Types.Concat( [Types.Interval(n1,n2)] :: tail1 ) ::
Types.Concat( [Types.Char n3] :: tail2 ) ::
simplify_disjunction(
Uni_types.Concat( [Uni_types.Interval(n1,n4)] :: tail1 ) :: rest)
| Uni_types.Concat( [Uni_types.Interval(n1,n2)] :: tail1 ) ::
Uni_types.Concat( [Uni_types.Char n3] :: tail2 ) ::
rest when n2+1 = n3 && tail1 = tail2 ->
simplify_disjunction(
Types.Concat( [Types.Interval(n1,n3)] :: tail1 ) :: rest)
simplify_disjunction(
Uni_types.Concat( [Uni_types.Interval(n1,n3)] :: tail1 ) :: rest)
| Types.Concat([[x]]) :: rest ->
| Uni_types.Concat([[x]]) :: rest ->
simplify_disjunction(x :: rest)
| Types.Concat([Types.Concat d] :: d') :: rest ->
| Uni_types.Concat([Uni_types.Concat d] :: d') :: rest ->
let d'' = List.map simplify_disjunction d' in
simplify_disjunction(Types.Concat(d @ d'') :: rest)
simplify_disjunction(Uni_types.Concat(d @ d'') :: rest)
(* there are probably missing cases!!! *)
| Types.Concat l :: rest ->
| Uni_types.Concat l :: rest ->
let l' = List.map simplify_disjunction l in
if l = l' then
Types.Concat l :: simplify_disjunction rest
Uni_types.Concat l :: simplify_disjunction rest
else
simplify_disjunction(Types.Concat l' :: simplify_disjunction rest)
simplify_disjunction(Uni_types.Concat l' :: simplify_disjunction rest)
| x :: rest -> x :: (simplify_disjunction rest)
| [] -> []
......@@ -235,17 +235,17 @@ let rec multi_simplify_disjunction l =
(* Given an ucs2 regual expression, returns *)
(* the corresponding utf8 regular expression *)
let ucs2_to_utf8 { Types.id = id ; Types.rel = rel } =
let ucs2_to_utf8 { Uni_types.id = id ; Uni_types.rel = rel } =
let rec aux re l2 =
match re with
Types.Char i -> char_ucs2_to_utf8 i :: l2
| Types.Interval (l,u) -> seq_ucs2_to_utf8 (l,u) @ l2
| Types.Identifier _ as i -> i :: l2
| Types.Concat rell ->
Uni_types.Char i -> char_ucs2_to_utf8 i :: l2
| Uni_types.Interval (l,u) -> seq_ucs2_to_utf8 (l,u) @ l2
| Uni_types.Identifier _ as i -> i :: l2
| Uni_types.Concat rell ->
let foo rel = List.fold_right aux rel [] in
Types.Concat (List.map foo rell) :: l2
Uni_types.Concat (List.map foo rell) :: l2
in
{ Types.id = id ;
Types.rel = multi_simplify_disjunction (List.fold_right aux rel []) }
{ Uni_types.id = id ;
Uni_types.rel = multi_simplify_disjunction (List.fold_right aux rel []) }
;;
......@@ -9,7 +9,7 @@
(* 14/05/2000 *)
(******************************************************)
open Parser
open Uni_parser
let comment_depth = ref 0;;
......
......@@ -19,7 +19,7 @@
%token RANGE
%token EOF
%start main
%type <Types.definition list> main
%type <Uni_types.definition list> main
%%
......@@ -30,7 +30,7 @@ main:
declaration:
LET IDENT EQ regexp END_OF_LET
{ { Types.id = $2 ; Types.rel = $4 } }
{ { Uni_types.id = $2 ; Uni_types.rel = $4 } }
;
regexp:
......@@ -39,7 +39,7 @@ regexp:
;
regexptoken:
CHAR { Types.Char $1 }
| LBRACKET CHAR RANGE CHAR RBRACKET { Types.Interval ($2,$4) }
| IDENT { Types.Identifier $1 }
CHAR { Uni_types.Char $1 }
| LBRACKET CHAR RANGE CHAR RBRACKET { Uni_types.Interval ($2,$4) }
| IDENT { Uni_types.Identifier $1 }
;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment