diff --git a/configure b/configure index 743d05bbd4117b4eb3c51a96c137c30d4381edd9..452d0d24632a1fc15c313f525c1ad2ac69cf6c6d 100755 --- a/configure +++ b/configure @@ -7,16 +7,18 @@ with_lex=1 with_wlex=1 with_wlex_compat=1 with_ulex=1 +with_pp=1 lexlist="utf8,iso88591,iso88592,iso88593,iso88594,iso88595,iso88596,iso88597,iso88598,iso88599,iso885910,iso885913,iso885914,iso885915,iso885916" -version="1.1.95test1" +version="1.1.95test2" exec_suffix="" help_lex="Enable/disable ocamllex-based lexical analyzer for the -lexlist encodings" help_wlex="Enable/disable wlex-based lexical analyzer for UTF-8" help_wlex_compat="Enable/disable wlex-style compatibility package for UTF-8 and ISO-8859-1" help_ulex="Enable/disable ulex-based lexical analyzer for UTF-8" +help_pp="Enable/disable the build of the preprocessor (pxp-pp)" -options="lex wlex wlex_compat ulex" +options="lex wlex wlex_compat ulex pp" lexlist_options="utf8 usascii iso88591 iso88592 iso88593 iso88594 iso88595 iso88596 iso88597 iso88598 iso88599 iso885910 iso885913 iso885914 iso885915 iso885916 koi8r windows1250 windows1251 windows1252 windows1253 windows1254 windows1255 windows1256 windows1257 windows1258 cp437 cp737 cp775 cp850 cp852 cp855 cp856 cp857 cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869 cp874 cp1006 macroman" print_options () { @@ -189,6 +191,7 @@ if [ $with_wlex -gt 0 ]; then echo "not found" echo "wlex support is disabled" with_wlex=0 + with_wlex_compat=0 fi fi @@ -206,6 +209,12 @@ if [ $with_ulex -gt 0 ]; then fi fi +# If ulex not found/disabled, also disable pxp-pp: + +if [ $with_ulex -eq 0 ]; then + with_pp=0 +fi + ###################################################################### # Check Lexing.lexbuf type @@ -229,6 +238,25 @@ fi rm -f tmp.* +###################################################################### +# Check type of camlp4 locations + +printf "%s" "Checking type of camlp4 location... 
" +cat <tmp.ml +open Stdpp;; +raise_with_loc (0,0) Not_found;; +EOF + +if ocamlc -c -I +camlp4 tmp.ml >/dev/null 2>/dev/null; then + echo "old style" + camlp4_loc="" +else + echo "new style" + camlp4_loc="-DOCAML_NEW_LOC" +fi + +rm -f tmp.* + ###################################################################### # Pregenerated wlex lexers @@ -286,7 +314,10 @@ print_options echo pkglist="pxp pxp-engine" -# Currently pkglist is constant + +if [ $with_pp -gt 0 ]; then + pkglist="$pkglist pxp-pp" +fi genpkglist="" # Generated packages @@ -405,6 +436,7 @@ ALLGENPKGLIST = $allgenpkglist EXEC_SUFFIX = $exec_suffix LEXBUF_307 = $lexbuf_307 LEX_OPT = $lex_opt +CAMLP4_LOC = $camlp4_loc _EOF_ ###################################################################### diff --git a/doc/EXTENSIONS.xml b/doc/EXTENSIONS.xml index 24756dd1b01da106a4c1135e92b9347a48fc29a5..6b97f8dd2723c6deaa18d145d78967fb917759d7 100644 --- a/doc/EXTENSIONS.xml +++ b/doc/EXTENSIONS.xml @@ -85,8 +85,8 @@ The following text is valid XML: The first element has the expanded name (namespace1,a) while the second element has the expanded name (namespace2,a); so the elements have different types. As -already pointed out, PXP does not support the expanded names directly (there is -some support for them in elements, but not in attributes). Alternatively, the +already pointed out, PXP does not support the expanded names directly. +Alternatively, the XML text is transformed while it is being parsed such that the prefixes become unique. In this example, the transformed text would read: @@ -124,6 +124,52 @@ because PXP normalizes any prefixes for namespace1 or namespace2 to the preferred prefixes "x" and "y".

+

Since PXP-1.1.95, the namespace support has been extended. In +addition to prefix normalization, the parser now also stores the +scoping structure of the namespaces (in the namespace_scope +objects). More or less, this means that the parser remembers +which elements have which "xmlns" attributes. There are two +important applications of this feature:

+ +

First, it is now possible to look up the namespace URI when +only the original, non-normalized namespace prefix is known. +A number of XML standards, e.g. XSchema, use namespace prefixes +within data nodes. Of course, these prefixes are not normalized +by PXP, but simply remain as they are when the XML text is +parsed. To get the URI of such a prefix p in the context of node +n, just call + + +n # namespace_scope # uri_of_display_prefix p + + +In PXP terminology, the non-normalized prefixes are now called +"display prefixes".

+ +

The other application is that it is now even possible to +retrieve the original "display" prefix of node names, e.g. + + +n # display_prefix + + +returns it. However, the display prefix is only guessed in the +sense that when there are several prefixes bound to the same +URI, one of the prefixes may be taken. For instance, in + + +]]> + +both "x" and "y" are bound to the same URI "sample", and +the display_prefix method now selects one of the prefixes +at random.

+ +

It is now even possible to output the parsed XML text +with original namespace structure: The "display" method +outputs XML text where the namespaces are declared as in the +original XML text.

+

Regarding the "xmlns" attributes, PXP treats them in a very special way. It is not only allowed not to declare them in the DTD, such declarations would be even not applied to the actual "xmlns" attributes. For example, diff --git a/doc/INSTALL.xml b/doc/INSTALL.xml index e83fdc3f7b1039df709f58f2398d84814adbb965..8a7c1a45b86c96d482be76a2e17b36298a35e3e6 100644 --- a/doc/INSTALL.xml +++ b/doc/INSTALL.xml @@ -58,6 +58,17 @@ the runtime part of wlex, and not the "wlex" command itself.

-with-wlex-compat

Creates a compatibility package pxp-wlex that includes lexers for UTF8 and ISO-8859-1 (may be required to build old software)

+ +
  • +

    -with-ulex

    +

    Enables the lexical analyzer that works for UTF-8 as internal encoding, and that is based on Alain Frisch's ulex tool. It +is relatively small, but a bit slower than the ocamllex-based lexers. +ulex will supersede wlex soon.

    +
  • +
  • +

    -with-pp

    +

    Enables the PXP preprocessor (installed as package pxp-pp). +See the file PREPROCESSOR for details. The preprocessor also requires ulex.

  • -lexlist <list-of-encodings>

    diff --git a/doc/Makefile b/doc/Makefile index 0422237e94a225f9011bdc1b16bd4dcac517696d..80a18f6feef7c30ba0512b6c0e663c2214a4f0c8 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -23,7 +23,7 @@ installrel = $$HOME/homepage/ocaml-programming.de/packages/documentation/pxp/ind .PHONY: all -all: README INSTALL ABOUT-FINDLIB SPEC EXTENSIONS +all: README INSTALL ABOUT-FINDLIB SPEC EXTENSIONS PREPROCESSOR README: README.xml common.xml config.xml readme.dtd $(readme) -text README.xml >README @@ -40,6 +40,9 @@ SPEC: SPEC.xml common.xml config.xml readme.dtd EXTENSIONS: EXTENSIONS.xml common.xml config.xml readme.dtd $(readme) -text EXTENSIONS.xml >EXTENSIONS +PREPROCESSOR: PREPROCESSOR.xml common.xml config.xml readme.dtd + $(readme) -text PREPROCESSOR.xml >PREPROCESSOR + DEV: DEV.xml common.xml config.xml readme.dtd $(readme) -text DEV.xml >DEV #$(readme) -html DEV.xml >$(installdev) diff --git a/doc/PREPROCESSOR.xml b/doc/PREPROCESSOR.xml new file mode 100644 index 0000000000000000000000000000000000000000..aab3bdf941b3034e90973949acd7470bdeb01724 --- /dev/null +++ b/doc/PREPROCESSOR.xml @@ -0,0 +1,745 @@ + + +%common; + +PXP"> + +]> + + + + The Preprocessor for PXP + +

    Since PXP-1.1.95, there is a preprocessor as part of the PXP +distribution. It allows you to compose XML trees and event lists +dynamically, which is very handy for writing XML transformations.

    + +

    To enable the preprocessor, compile your source files as in: + + +ocamlfind ocamlc -syntax camlp4o -package pxp-pp,... ... + + +The package pxp-pp contains the preprocessor. The -syntax option +enables camlp4, on which the preprocessor is based. It is also +possible to use it together with the revised syntax; use +"-syntax camlp4r" in this case.

    + +

    In the toploop, type + + +ocaml +# #use "topfind";; +# #camlp4o;; +# #require "pxp-pp";; +# #require "pxp";; + +

    + +

    The preprocessor defines the following new syntax notations, +explained below in detail: + +<code><![CDATA[ +<:pxp_charset< CHARSET_DECL >> +<:pxp_tree< EXPR >> +<:pxp_vtree< EXPR >> +<:pxp_evlist< EXPR >> +<:pxp_evpull< EXPR >> +<:pxp_text< TEXT >> +]]></code> + +The basic notation is "pxp_tree" which creates a tree of PXP document +nodes as described in EXPR. "pxp_vtree" is the variant where the tree +is immediately validated. "pxp_evlist" creates a list of PXP events +instead of nodes, useful together with the event-based parser. +"pxp_evpull" is a variation of the latter: Instead of an event list +an event generator is created that works like a pull parser.

    + +

    The "pxp_charset" notation only configures the character sets to +assume. Finally, "pxp_text" is a notation for string literals.

    + + + Creating constant XML + +

    The following examples are all written for "pxp_tree". You can +also use one of the other XML composers instead, but see the notes +below.

    + +

    In order to use "pxp_tree", you must define two variables in +the environment: "spec" and "dtd": + + +let spec = Pxp_tree_parser.default_spec;; +let dtd = Pxp_dtd.create_dtd `Enc_iso88591;; + + +These variables occur in the code generated by the preprocessor. +The "dtd" variable is the DTD object. Note that you need it even +in well-formedness mode (validation turned off). The "spec" variable +controls which classes are instantiated as node representation +(see PXP manual).

    + +

    Now you can create XML trees like in + + + [ [ "The Lord of The Rings" ] + <author>[ "J.R.R. Tolkien" ] + ] + >> +]]></code> + +As you can see, the syntax is somehow XML-related but not really XML. +(Many ideas are borrowed from CDUCE, by the way.) In particular, +there are start tags like <title> but no end tags. Instead, +we are using square brackets to denote the children of an XML +element. Furthermore, character data must be put into double +quotes.</p> + +<p>You may ask why the well-known XML syntax has been modified for +this preprocessor. There are many reasons, and they will become +clearer in the following explanations. For now, you can see the advantage +that the syntax is less verbose, as you need not to repeat the +element names in end tags. Furthermore, you can exactly control +which characters are part of the data nodes without having to make +compromises with indentation.</p> + +<p>Attributes are written as in XML: + +<code><![CDATA[ +let book = + <:pxp_tree< + <book id="BOOK_001"> + [ <title lang="en">[ "The Lord of The Rings" ] + <author>[ "J.R.R. Tolkien" ] + ] + >> +]]></code> +</p> + +<p>An element without children can be written + +<code><![CDATA[ +<element>[] +]]></code> + +or slightly shorter: + +<code><![CDATA[ +<element/> +]]></code> + </p> +<p>You can also create processing instructions and comment nodes: + +<code><![CDATA[ +let list = + <:pxp_tree< + <list> + [ <!>"Now the list of books follows!" + <?>"formatter_directive" "one book per page" + book + ] + >> +]]></code> + +The notation "<!>" creates a comment node with the following string +as contents. The notation "<?>" needs two strings, first the target, +then the value (here, this results in +"<?formatter_directive one book per page?>". </p> + +<p>Look again at the last example: The O'Caml variable "book" occurs, +and it inserts its tree into the list of books. Identifiers +without "decoration" just refer to O'Caml variables. 
We will see +more examples below.</p> + +<p>The preprocessor syntax knows a number of shortcuts and variations. +First, you can omit the square brackets when an element has exactly +one child: + +<code><![CDATA[ +<element><child>"Data inside child" +]]></code> + +This is the same as + +<code><![CDATA[ +<element>[ <child>[ "Data inside child" ] ] +]]></code> + +Second, you are already used to a common abbreviation: Strings are +automatically converted to data nodes. The "expanded" syntax is + +<code><![CDATA[ +<*>"Data string" +]]></code> + +where "<*>" denotes a data node, and the following string is +used as contents. Usually, you can omit "<*>". However, there +are a few occasions where this notation is still useful, see below.</p> + +<p>In strings, the usual entity references can be used: +"Double quotes: &quot;". For a newline character, +write &#10;.</p> + +<p>The preprocessor knows two operators: "^" concatenates strings, +and "@" concatenates lists. Examples: + +<code><![CDATA[ +<element>[ "Word1" ^ "Word2" ] +<element>([ <a/> ] @ [ <b/> ]) +]]></code></p> + +<p>Parentheses can be used to clarify precedence. For example: + +<code><![CDATA[ +<element>(l1 @ l2) +]]></code> + +Here, the concatenation operator "@" could also be parsed as + +<code><![CDATA[ +(<element> l1) @ l2 +]]></code> + +Parentheses may be used in every expression.</p> + + <p>Rarely used, there is also a notation for the +"super root" nodes (see the PXP manual for their meaning): + +<code><![CDATA[ +<^>[ <element> ... ] +]]></code> + </p> + + </sect2> + + + <sect2> + <title>Dynamic XML + +

    Let us begin with an example. The task is to convert +O'Caml values of type + + + +to XML trees like + + + 'title' + 'author' + +]]> + +(conventional syntax). When b is the book variable, the solution is + + + [ <*>title + <author><*>author + ] + >> +]]></code> + +First, we bind the simple O'Caml variables "title", "author", and +"isbn". The reason is that the preprocessor syntax does not allow +expressions like "b.title" directly in the XML tree (but see below +for a better workaround).</p> + +<p>The XML tree contains the O'Caml variables. The "id" attribute +is a concatenation of the fixed prefix "BOOK_" and the contents of +"isbn". The "title" and "author" elements contain a data node +whose contents are the O'Caml strings "title", and "author", +respectively.</p> + +<p>Why "<*>"? If we just wrote "<title>title", the +generated code would assume that the "title" variable is an XML node, +and not a string. From this point of view, "<*>" works like +a type annotation, as it specialises the type of the following +expression.</p> + +<p>Here is an alternate solution: + +<code><![CDATA[ +let book = + <:pxp_tree< + <book id=("BOOK_" ^ (: b.isbn :))> + [ <title><*>(: b.title :) + <author><*>(: b.author :) + ] + >> +]]></code> + +The notation "(: ... :)" allows you to include arbitrary O'Caml +expressions into the tree. In this solution it is no longer necessary +to create artificial O'Caml variables for the only purpose of +injecting values into trees. + </p> + + <p>It is possible to create XML elements with dynamic names: +Just put parentheses around the expression. Example: + +<code><![CDATA[ +let name = "book" in +<:pxp_tree< <(name)> ... >> +]]></code> + +With the same notation, one can also set attribute names dynamically: + +<code><![CDATA[ +let att_name = "id" in +<:pxp_tree< <book (att_name)=...> ... 
>> +]]></code> + +Finally, it is also possible to include complete attribute lists +dynamically: + +<code><![CDATA[ +let att_list = [ "id", ("BOOK_" ^ b.isbn) ] in +<:pxp_tree< <book (: att_list :) > ... >> +]]></code> + </p> + +<p>Typing: Depending on where a variable or O'Caml expression occurs, +different types are assumed. Compare the following examples: + +<code><![CDATA[ +<:pxp_tree< <element>x1 >> +<:pxp_tree< <element>[x2] >> +<:pxp_tree< <element><*>x3 >> +]]></code> + +As a rule of thumb, the most general type is assumed that would make +sense at a certain location. As x1 could be replaced by a list +of children, its type is assumed to be a node list. As x2 could +be replaced by a single node, its type is assumed to be a node. +And x3 is a string, we had this case already. +</p> + </sect2> + + <sect2> + <title>Character Encodings + +

    As the preprocessor generates code that builds XML trees, it +must know two character encodings:

    + +
      +
    • Which encoding is used in the source code (in the .ml file) +

    • +
    • Which encoding is used in the XML representation, i.e. +in the O'Caml values representing the XML trees

    • +
    + +

    Both encodings can be set independently. The syntax is: + +<code><![CDATA[ +<:pxp_charset< source="ENC" representation="ENC" >> +]]></code> + +The default is ISO-8859-1 for both encodings. For example, to set +the representation encoding to UTF-8, use: + +<code><![CDATA[ +<:pxp_charset< representation="UTF-8" >> +]]></code> + +The "pxp_charset" notation is a constant expression that always +evaluates to "()". (A requirement by camlp4 that looks artificial.) +

    + +

    When you set the representation encoding, it is required that the +encoding stored in the DTD object is the same. Remember that we +need a DTD object like + + +let dtd = Pxp_dtd.create_dtd `Enc_iso88591;; + + +Of course, we must change this to the representation encoding, too, +in our example: + + +let dtd = Pxp_dtd.create_dtd `Enc_utf8;; + + +The preprocessor cannot check this at compile time, and for performance +reasons, a runtime check is not generated. So it is up to the programmer +that the character encodings are used in a consistent way. +

    +
    + + + + Validated Trees + +

    In order to validate trees, you need a filled DTD object. +In principle, you can create this object by a number of methods. +For example, you can parse an external file: + + + +It is, however, often more convenient to include the DTD literally +into the program. This works by + + + +As the double quotes are often used inside DTDs, O'Caml string +literals are a bit impractical, as they are also delimited by +double quotes, and one needs to add backslashes as escape characters. +The "pxp_text" notation is often more readable here: +<:pxp_text<STRING>> is just another way of writing +"STRING". In our DTD, we have + + + + + + + >>;; +let config = default_config;; +let dtd = Pxp_dtd_parser.parse_dtd_entity config (from_string dtd_text);; +]]> + +Note that "pxp_text" is not restricted to DTDs, as it can be used +for any kind of string.

    + +

    After we have the DTD, we can validate the trees. One +option is to call the "validate" function: + + + [ [ "The Lord of The Rings" ] + <author>[ "J.R.R. Tolkien" ] + ] + >>;; +Pxp_document.validate book;; +]]></code> + +(This example is invalid, as the "id" attribute is missing.)</p> + +<p>Note that it is a misunderstanding that "pxp_tree" builds XML trees in +well-formed mode. You can create any tree with it, and the fact is that +"pxp_tree" just does not invoke the validator. So if the DTD enforces +validation, the tree is validated when the "validate" function is +called. If the DTD is in well-formedness mode, the tree is effectively +not validated, even when the "validate" function is invoked. Btw, +the following statements would create a DTD in well-formedness mode: + +<code> +let dtd = Pxp_dtd.create_dtd `Enc_iso88591;; +dtd # allow_arbitrary; +</code> + +As an alternative of calling the "validate" function, one can also +use "pxp_vtree" instead. It immediately validates every XML element it +creates. However, "injected" subtrees are not validated, i.e. validation +does not proceed recursively to subnodes as the "validate" function +does it.</p> + </sect2> + + + <sect2> + <title>Generating Events + +

    As PXP has also an event model to represent XML, the preprocessor +can also produce such events. In particular, there are two modes: The +"pxp_evlist" notation outputs lists of events (type "event list") +representing the XML expression. The "pxp_evpull" notation creates +an automaton from which one can "pull" events (like from a pull +parser).

    + +

    These two notations work very much like "pxp_tree". For example, + + + [ [ "The Lord of The Rings" ] + <author>[ "J.R.R. Tolkien" ] + ] + >> +]]></code> + +generates + +<code><![CDATA[ +[ E_start_tag ("book", [], None, <obj>); + E_start_tag ("title", [], None, <obj>); + E_char_data "The Lord of The Rings"; + E_end_tag ("title", <obj>); + E_start_tag ("author", [], None, <obj>); + E_char_data "J.R.R. Tolkien"; + E_end_tag ("author", <obj>); + E_end_tag ("book", <obj>) +] +]]></code> + +Note that you neither need a "dtd" variable nor a "spec" variable. +There is one important difference, however: Both nodes and lists +of nodes are represented by the same type, "event list". That +has the consequence that in the following example x1 and x2 +have the same type "event list": + +<code><![CDATA[ +<:pxp_evlist< <element>x1 >> +<:pxp_evlist< <element>[x2] >> +<:pxp_evlist< <element><*>x3 >> +]]></code> + +In principle, it could be checked at runtime whether x1 and x2 +have the right structure. However, this is not done because of +performance reasons.</p> + + <p>As mentioned, "pxp_evpull" works like a pull parser. +After defining + +<code><![CDATA[ +let book = + <:pxp_evpull< + <book> + [ <title>[ "The Lord of The Rings" ] + <author>[ "J.R.R. Tolkien" ] + ] + >> +]]></code> + +"book" is a function 'a->event. One can call it to get the events +one after the other: + +<code><![CDATA[ +let e1 = book();; (* = Some(E_start_tag ("book", [], None, <obj>)) *) +let e2 = book();; (* = Some(E_start_tag ("title", [], None, <obj>)) *) +... +]]></code> + +After the last event, "book" returns None to indicate the end of the +event stream.</p> + + <p>As for "pxp_evlist", it is not possible to distinguish between +nodes and node lists. 
In this example, both x1 and x2 are assumed +to have type 'a->event: + +<code><![CDATA[ +<:pxp_evpull< <element>x1 >> +<:pxp_evpull< <element>[x2] >> +<:pxp_evpull< <element><*>x3 >> +]]></code> + +Note that "<element>x1" actually means to build a new pull automaton +around the existing pull automaton x1: The children of "element" are +retrieved by pulling events from x1 until "None" is returned.</p> + + <p>A consequence of the pull semantics is that once an event +is obtained from an automaton, the state of the automaton is modified +such that it is not possible to get the same event again. If you need +an automaton that can be reset to the beginning, just wrap the +"pxp_evpull" notation into a functional abstraction: + +<code><![CDATA[ +let book_maker() = + <:pxp_evpull< <book ...> ... >>;; +let book1 = book_maker();; +let book2 = book_maker();; +]]></code> + +This way, "book1" and "book2" are independent event streams.</p> + + <p>There is another implication of the nature of the +automatons: Subexpressions are lazily evaluated. For example, +in + +<code><![CDATA[ +<:pxp_evpull< <element>[ <*> (: get_data_contents() :) ] >> +]]></code> + +the call of get_data_contents is performed just before the event +for the data node is constructed.</p> + </sect2> + + + <sect2> + <title>Namespaces + +

    By default, the preprocessor does not generate nodes or +events that support namespaces. It can, however, be configured +to create namespace-aware XML aggregations. +

    + +

    In any case, you need a namespace manager. This is an object +that tracks the usage of namespace prefixes in XML nodes. For example, +we can create a namespace manager that knows the "html" prefix: + + + +Here, we declare that we want to use the "html" prefix for the +internal representation of the XML nodes. This kind of prefix is +called normalized prefix, or normprefix for short. It is possible to configure +different prefixes for the external representation, i.e. when the +XML tree is printed to a file. This other kind of prefix is called +display prefix. We will have a look at them later.

    + +

    Next, we must tell the DTD object that we have a namespace manager: + + +

    + +

    For "pxp_evlist" and "pxp_evpull" we are now prepared (note that +we now need a "dtd" variable, as the DTD object knows the namespace +manager). For "pxp_tree" and "pxp_vtree", it is required to use +a namespace-aware specification: + + + +(Normal specifications do not work; you would get "Namespace method +not applicable" errors if you tried to use them.)

    + +

    The special notation "<:autoscope>" enables namespace mode in +this example: + + + + [ "Item1" + "Item2" + ] + >> +]]> + +In particular, "<:autoscope>" defines a new O'Caml variable for +its subexpression: "scope". This variable contains the namespace +scope object, which contains the namespace declarations for the +subexpression. "<:autoscope>" initialises this variable from the +namespace manager so that it now contains a declaration for the +"html" prefix.

    + +

    In general, the namespace scope object contains the prefixes to use for the +external representation. For this simple example, we have chosen +to use the same prefixes as for the internal representation, +and "<:autoscope>" performs the right initialisations for this.

    + +

    Print the tree by + + + +The point is to call the "display" method and not the "write" method. +The latter would not respect the display prefixes. +

    + +

    Alternatively, we can also create the "scope" variable manually: + + + + [ "Item1" + "Item2" + ] + >> +]]> + +Note that we now use "<:scope>". In this simple form, this +construct just enables namespace mode, and takes the "scope" +variable from the environment.

    + +

    Furthermore, the namespace scope now contains a different +namespace declaration: The display prefix "" is used for HTML. The +empty prefix just means to declare a default prefix +(by xmlns="URI"). The effect can be seen when the XML tree +is printed by calling the "display" method.

    + +

    Here is a third variant of the same example: + + + + [ "Item1" + "Item2" + ] + >> +]]> + +The "scope" is now initially empty. The "<:scope>" notation is +used to extend the scope for the time the subexpression is +evaluated.

    + +

    There is also a notation "<:emptyscope>" that creates +an empty scope object, so one could even write + + + <:scope ("")="http://www.w3.org/1999/xhtml"> + + [ "Item1" + "Item2" + ] + >> +]]> +

    + +

    It is recommended to create the "scope" variable manually with +a reasonable initial declaration, and to use "<:scope>" to +enable namespace processing, and to extend the scope when necessary. +The advantage of this approach is that the same scope object can be +shared by many XML nodes, so you need less memory.

    + +

    One tip: To get a namespace scope that is initialised with all +prefixes of the namespace manager (as <:autoscope> does it), define + + +let scope = create_namespace_scope ~decl: mng#as_declaration mng + +

    + +

    For event-based processing of XML, the namespace mode works in +the same way as described here; there is no difference.

    +
    + +
    +
    + + diff --git a/godi/godi-pxp/BUILDMSG b/godi/godi-pxp/BUILDMSG new file mode 100644 index 0000000000000000000000000000000000000000..aba1cab8ee5c4c77736babb53c27e3392df0b861 --- /dev/null +++ b/godi/godi-pxp/BUILDMSG @@ -0,0 +1,4 @@ +Available options for godi.conf: + +- GODI_PXP_WITH_WLEX: Whether to build the pxp-wlex + lexer (yes/no) diff --git a/godi/godi-pxp/CONFOPTS b/godi/godi-pxp/CONFOPTS new file mode 100644 index 0000000000000000000000000000000000000000..9345b398e16a9900a86707d3f2f07082ac26b3bc --- /dev/null +++ b/godi/godi-pxp/CONFOPTS @@ -0,0 +1 @@ +GODI_PXP_WITH_WLEX diff --git a/godi/godi-pxp/Makefile b/godi/godi-pxp/Makefile index 89a03ec77a5ad8be50fc4a186786a505cb2df411..b4f944272dc806e6016f7b65862691a1ae1ea2ad 100644 --- a/godi/godi-pxp/Makefile +++ b/godi/godi-pxp/Makefile @@ -3,7 +3,7 @@ GODI_PLIST= yes .include "../../mk/godi.pkg.mk" -VERSION= 1.1.95test1 +VERSION= 1.1.95test2 PKGNAME= godi-pxp-${VERSION} DISTNAME= trunk DISTFILES= @@ -13,15 +13,25 @@ MAINTAINER= gerd@gerd-stolpmann.de HOMEPAGE= http://www.ocaml-programing.de/ COMMENT= PXP is an advanced XML parser +GODI_PXP_WITH_WLEX?=yes + DEPENDS+= godi-ocaml>=3.07:../../godi/godi-ocaml DEPENDS+= godi-ocamlnet>0.97.1:../../godi/godi-ocamlnet -DEPENDS+= godi-wlex>=20021107:../../godi/godi-wlex DEPENDS+= godi-ulex>=0:../../godi/godi-ulex BUILD_DEPENDS+= godi-findlib>=1.0:../../godi/godi-findlib +.if ${GODI_PXP_WITH_WLEX} == "yes" +DEPENDS+= godi-wlex>=20021107:../../godi/godi-wlex +.endif + PATH:= ${LOCALBASE}/bin:${PATH} HAS_CONFIGURE= yes +.if ${GODI_PXP_WITH_WLEX} == "yes" +CONFIGURE_ARGS+= -with-wlex -with-wlex-compat +.else +CONFIGURE_ARGS+= -without-wlex -without-wlex-compat +.endif # ocamlfind must install into the pkg-lib directory, not into site-lib. 
# Use the build time configuration file: diff --git a/godi/godi-pxp/PLIST.godi b/godi/godi-pxp/PLIST.godi index 33d008445b4e3e0e1bb7cf94ae0704f9dee2ca8f..c70c14eff44a613afcdf37b737b92c8bc066da74 100644 --- a/godi/godi-pxp/PLIST.godi +++ b/godi/godi-pxp/PLIST.godi @@ -16,6 +16,8 @@ @findlib pxp-lex-iso885915 @findlib pxp-lex-iso885916 @findlib pxp-lex-utf8 -@findlib pxp-wlex -@findlib pxp-wlex-utf8 +@optional @findlib pxp-wlex +@optional @findlib pxp-wlex-utf8 +@findlib pxp-ulex +@findlib pxp-pp @deepdir doc/godi-pxp diff --git a/pxp.files b/pxp.files index 085bf0cbae1f33b85cb79908cb3676a884fc34e0..c9c0c84a4fea730da2b7d822cb6978aab1ce5c7c 100644 --- a/pxp.files +++ b/pxp.files @@ -37,6 +37,7 @@ f doc/INSTALL f doc/README f doc/DEV f doc/EXTENSIONS +f doc/PREPROCESSOR f doc/RELEASE-NOTES f doc/SPEC f doc/design.txt @@ -256,6 +257,12 @@ f gensrc-pre/pxp-wlex-utf8/pxp_wlex_utf8_01.mll f gensrc-pre/pxp-wlex-utf8/pxp_wlex_utf8_01.ml.306 f gensrc-pre/pxp-wlex-utf8/pxp_wlex_utf8_01.ml.307 +d src/pxp-pp +f src/pxp-pp/Makefile +f src/pxp-pp/META.in +f src/pxp-pp/PPSPEC +f src/pxp-pp/pxp_pp.ml + d rtests f rtests/Makefile f rtests/README diff --git a/rtests/Makefile b/rtests/Makefile index ac696689fdb12d559c214eb770c496ac56ce6683..ce90933abe0d9da6d4ebb45f5bed0c700319dc4c 100644 --- a/rtests/Makefile +++ b/rtests/Makefile @@ -1,4 +1,4 @@ -include ../Makefile.conf +-include ../Makefile.conf .PHONY: all all: toploops diff --git a/src/Makefile b/src/Makefile index abf17f55c2d36461956675170a3a119b658c5b2e..5fcff14cf5ea8dc85daad1b8f44dc9cbed3762dc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -6,11 +6,14 @@ clean: distclean: $(MAKE) -C pxp distclean $(MAKE) -C pxp-engine distclean + $(MAKE) -C pxp-pp distclean CLEAN: $(MAKE) -C pxp CLEAN $(MAKE) -C pxp-engine CLEAN + $(MAKE) -C pxp-pp CLEAN uninstall: $(MAKE) -C pxp uninstall $(MAKE) -C pxp-engine uninstall + $(MAKE) -C pxp-pp uninstall diff --git a/src/pxp-engine/Makefile b/src/pxp-engine/Makefile index 
e6a19c78c856172f931421cd19b2b438d14afc09..8d0a23721ae70ee8210f9d89f561003fb412545f 100644 --- a/src/pxp-engine/Makefile +++ b/src/pxp-engine/Makefile @@ -20,7 +20,7 @@ depend: $(OCAMLDEP) *.ml *.mli >depend clean: - rm -f $(CLEAN_LIST) pxp_core_parser.ml pxp_lib.ml + rm -f $(CLEAN_LIST) pxp_core_parser.ml pxp_lib.ml pxp_lexing.ml CLEAN: clean diff --git a/src/pxp-engine/pxp_dtd.ml b/src/pxp-engine/pxp_dtd.ml index 62a686be3c0d3e21131b945b3c224b8aaac1792c..8f655e70db6a4d877d19112127b0d1ebc3fbbe1f 100644 --- a/src/pxp-engine/pxp_dtd.ml +++ b/src/pxp-engine/pxp_dtd.ml @@ -1313,6 +1313,13 @@ module Entity = struct ~name ~xid ~resolver dtd | Entity(make,resolver) -> make dtd (* resolver ignored *) + + let entity_id ent = (ent :> < >) + + class fake = object end + + let create_entity_id () = new fake + end diff --git a/src/pxp-engine/pxp_dtd.mli b/src/pxp-engine/pxp_dtd.mli index 9ca83487f371ff880210314d7f28972ad7d2b631..c4d1c2ba89c3a3421a73ed7906cd33c44d27b4f8 100644 --- a/src/pxp-engine/pxp_dtd.mli +++ b/src/pxp-engine/pxp_dtd.mli @@ -751,6 +751,14 @@ module Entity : sig Pxp_entity.entity (* Creates an external entity that reads from the passed source *) + val entity_id : Pxp_entity.entity -> Pxp_lexer_types.entity_id + (* Returns the abstract entity ID *) + + val create_entity_id : unit -> Pxp_lexer_types.entity_id + (* Create a new abstract entity ID. This ID can be used whereever + * an entity_id is expected but no entity is available. + *) + end ;; diff --git a/src/pxp-pp/META.in b/src/pxp-pp/META.in new file mode 100644 index 0000000000000000000000000000000000000000..08e9c83f2ca64ecaf63a248da907b1027ec3d588 --- /dev/null +++ b/src/pxp-pp/META.in @@ -0,0 +1,17 @@ +description = "Preprocessor for PXP" +version = "@VERSION@" + +# At runtime, we need at least pxp-engine. 
+requires = "camlp4,pxp-engine" + +# At preprocess time, we need netstring and ulex: +requires(syntax) = "camlp4,netstring,ulex" + +# The toploop is the combination of both: +requires(syntax,toploop) = "camlp4,netstring,ulex,pxp-engine" + +# Specification of stand-alone preprocessor call: +archive(syntax,preprocessor) = "pxp_pp.cma" + +# Specification for the toploop: +archive(syntax,toploop) = "pxp_pp.cma" diff --git a/src/pxp-pp/Makefile b/src/pxp-pp/Makefile index b4106339600f9a4943685d94ee33b29a734670ce..03367de6b7bcc03f7aac182f62c42218f05798d9 100644 --- a/src/pxp-pp/Makefile +++ b/src/pxp-pp/Makefile @@ -1,2 +1,30 @@ -pxp_pp.cmo: pxp_pp.ml - ocamlfind ocamlc -c -package netstring,ulex,camlp4.quotations,camlp4.macro -syntax camlp4o pxp_pp.ml +TOP_DIR = ../.. + +include $(TOP_DIR)/Makefile.rules + +PACKAGES = netstring,ulex,camlp4.quotations,camlp4.macro + +OCAMLC_OPTIONS += -syntax camlp4o +OCAMLC_OPTIONS += -ppopt "$(CAMLP4_LOC)" + + +all: pxp_pp.cma + +opt: + +pxp_pp.cma: pxp_pp.cmo + $(OCAMLC) -a -o pxp_pp.cma pxp_pp.cmo + +clean: + rm -f $(CLEAN_LIST) + +CLEAN: clean + +distclean: clean + rm -f META depend + +install: + $(OCAMLFIND) install pxp-pp pxp_pp.cma META + +uninstall: + $(OCAMLFIND) remove pxp-pp diff --git a/src/pxp-pp/SPEC b/src/pxp-pp/PPSPEC similarity index 100% rename from src/pxp-pp/SPEC rename to src/pxp-pp/PPSPEC diff --git a/src/pxp-pp/pxp_pp.ml b/src/pxp-pp/pxp_pp.ml index 39c974d2cbb58151ae1256436dc9eebff2c1b77e..0aab32a3da5b827ab150a6b31e05508c15f92efd 100644 --- a/src/pxp-pp/pxp_pp.ml +++ b/src/pxp-pp/pxp_pp.ml @@ -642,7 +642,7 @@ let mkloc ((p1_line,p1_line_start,p1_pos) as p1) ;; -let raise_at p1 p2 exn = +let raise_at (p1:pos) (p2:pos) exn = (* let (p1_l,p1_s,p1_p) = p1 in Printf.eprintf "Raise_at %d %d %d\n" p1_l p1_s p1_p; @@ -869,6 +869,286 @@ let expand_tree_expr (valcheck:bool) (s:string) : MLast.expr = ;; +(**********************************************************************) +(* Code generator for event streams *) + + 
+type ann = [`Single|`Tree|`Forest];; + +let generate_event_generator + (generate_tree : (ann * MLast.expr) list -> MLast.expr) + (generate_forest : (ann * MLast.expr) list -> MLast.expr) + (s:string) + : MLast.expr = + (* Generates code to generate events. The input arguments + * [generate_tree] and [generate_forest] process an intermediate + * representation, the so-called annotated expression lists + * (type (ann * MLast.expr) list), and return the final code. + * + * Kinds of annotations: + * - `Single: The expression is a single event, i.e. an O'Caml value + * of type [event]. + * - `Tree: The expression represents a list of events corresponding + * to a node tree. It is left + * open how such lists are represented. The expression is either + * an O'Caml identifier or a subexpression from an antiquotation. + * - `Forest: The expression represents a list of events corresponding + * to a list of node trees. + * + * The argument [generate_tree] is a function that generates the + * final code for an annotated list of expressions. It can be expected + * that the input list for [generate_tree] represents a node tree. + * + * The argument [generate_forest] does the same for an annotated + * list of expressions that represents a list of node trees. 
+ *) + + let to_rep s = + Netconversion.convert + ~in_enc:`Enc_utf8 ~out_enc:(!current_decl.rep_enc) s in + + let to_src s = + Netconversion.convert + ~in_enc:`Enc_utf8 ~out_enc:(!current_decl.source_enc) s in + + let rec generate_for_any_expr loc : ast_any_node -> MLast.expr = + function + `Node n -> + let e = generate_tree (generate_for_node_expr false n) in + <:expr< let _eid = Pxp_dtd.Entity.create_entity_id() in $e$ >> + | `Nodelist nl -> + let e = generate_forest (generate_for_nodelist_expr false nl) in + <:expr< let _eid = Pxp_dtd.Entity.create_entity_id() in $e$ >> + + and generate_for_node_expr nsmode : ast_node -> (ann * MLast.expr) list = ( + (* nsmode: Whether there is a variable [scope] in the environment *) + function + (`Element(name,attrs,subnodes),p1,p2) -> + let loc = mkloc p1 p2 in + let name_expr = generate_for_string_expr name in + let attrs_expr_l = List.map generate_for_attr_expr attrs in + let attrs_expr = generate_ann_list loc attrs_expr_l in + let subnodes_expr = generate_for_nodelist_expr nsmode subnodes in + let scope_opt_expr = + if nsmode then <:expr< Some scope >> else <:expr< None >> in + + let start_tag = + <:expr< Pxp_types.E_start_tag($name_expr$, + $attrs_expr$, + $scope_opt_expr$, + _eid) >> in + let end_tag = + <:expr< Pxp_types.E_end_tag($name_expr$,_eid) >> in + + [`Single, start_tag] @ subnodes_expr @ [`Single, end_tag] + | (`Data text,p1,p2) -> + let text_expr = generate_for_string_expr text in + let loc = mkloc p1 p2 in + [ `Single, <:expr< Pxp_types.E_char_data($text_expr$) >> ] + | (`Comment text,p1,p2) -> + let text_expr = generate_for_string_expr text in + let loc = mkloc p1 p2 in + [ `Single, <:expr< Pxp_types.E_comment($text_expr$) >> ] + | (`PI(target,value),p1,p2) -> + let target_expr = generate_for_string_expr target in + let value_expr = generate_for_string_expr value in + let loc = mkloc p1 p2 in + [ `Single, <:expr< Pxp_types.E_pinstr($target_expr$,$value_expr$) >> ] + | (`Super subnodes,p1,p2) -> + let 
subnodes_expr = generate_for_nodelist_expr nsmode subnodes in + let loc = mkloc p1 p2 in + ( [ `Single, <:expr< Pxp_types.E_start_super >> ] @ + subnodes_expr @ + [ `Single, <:expr< Pxp_types.E_end_super >> ] ) + | (`Meta(name,attrs,subnode),p1,p2) -> + let loc = mkloc p1 p2 in + ( match name with + "scope" -> generate_scope loc attrs subnode + | "autoscope" -> generate_autoscope loc subnode + | "emptyscope" -> generate_emptyscope loc subnode + | _ -> assert false (* already caught above *) + ) + | (`Ident name,p1,p2) -> + let loc = mkloc p1 p2 in + [ `Tree, (generate_ident loc (to_src name)) ] + | (`Anti text,p1,p2) -> + let expr = + Grammar.Entry.parse Pcaml.expr_eoi (Stream.of_string (to_src text)) + in + [ `Tree, expr ] + | _ -> + (* `Literal and `Concat are impossible after type check *) + assert false ) + + and generate_for_nodelist_expr nsmode : + ast_node_list -> (ann * MLast.expr) list = ( + function + (`Nodes l, p1, p2) -> + let loc = mkloc p1 p2 in + let l' = List.map (generate_for_node_expr nsmode) l in + List.flatten l' + | (`Concat l, p1, p2) -> + let loc = mkloc p1 p2 in + let l' = List.map (generate_for_nodelist_expr nsmode) l in + List.flatten l' + | (`Ident name, p1, p2) -> + let loc = mkloc p1 p2 in + [ `Forest, (generate_ident loc (to_src name)) ] + | (`Anti text, p1, p2) -> + let expr = + Grammar.Entry.parse Pcaml.expr_eoi (Stream.of_string (to_src text)) + in + [ `Forest, expr ] + ) + + and generate_for_attr_expr : ast_attr -> [`Single|`List] * MLast.expr = ( + function + (`Attr(n,v), p1, p2) -> + let loc = mkloc p1 p2 in + let n_expr = generate_for_string_expr n in + let v_expr = generate_for_string_expr v in + `Single, <:expr< ($n_expr$, $v_expr$) >> + | (`Anti text, p1, p2) -> + `List, + Grammar.Entry.parse Pcaml.expr_eoi (Stream.of_string (to_src text)) + ) + + and generate_scope loc attrs subnode : (ann * MLast.expr) list = ( + let subexpr = generate_for_node_expr true subnode in + if attrs = [] then + subexpr + else + let decl_expr_l = 
List.map generate_for_attr_expr attrs in + let decl_expr = generate_ann_list loc decl_expr_l in + let old_scope_expr = <:expr< Some scope >> in + let scope_expr = + <:expr< new Pxp_dtd.namespace_scope_impl + (dtd # namespace_manager) + $old_scope_expr$ + $decl_expr$>> in + let compiled_subexpr = generate_tree subexpr in + [ `Tree, ( <:expr< let scope = $scope_expr$ in $compiled_subexpr$ >> ) ] + ) + + and generate_autoscope loc subnode : (ann * MLast.expr) list = ( + let subexpr = generate_for_node_expr true subnode in + let compiled_subexpr = generate_tree subexpr in + let scope_expr = + <:expr< ( let mng = dtd # namespace_manager in + new Pxp_dtd.namespace_scope_impl + mng None mng#as_declaration ) >> in + [ `Tree, ( <:expr< let scope = $scope_expr$ in $compiled_subexpr$ >> ) ] + ) + + and generate_emptyscope loc subnode : (ann * MLast.expr) list = ( + let subexpr = generate_for_node_expr true subnode in + let compiled_subexpr = generate_tree subexpr in + let scope_expr = + <:expr< ( let mng = dtd # namespace_manager in + new Pxp_dtd.namespace_scope_impl + mng None [] ) >> in + [ `Tree, ( <:expr< let scope = $scope_expr$ in $compiled_subexpr$ >> ) ] + ) + + and generate_for_string_expr : ast_string -> MLast.expr = ( + function + (`Literal s, p1, p2) -> + let loc = mkloc p1 p2 in + let s' = to_rep s in + <:expr< $str:s'$ >> + | (`Concat l, p1, p2) -> + let loc = mkloc p1 p2 in + let l' = List.map generate_for_string_expr l in + let l'' = generate_list loc l' in + <:expr< String.concat "" $l''$ >> + | (`Ident name, p1, p2) -> + let loc = mkloc p1 p2 in + generate_ident loc (to_src name) + | (`Anti text, p1, p2) -> + Grammar.Entry.parse Pcaml.expr_eoi (Stream.of_string (to_src text)) + ) + + in + + catch_errors + (fun () -> + let stream = scan_string s in + let ast = call_parser parse_any_expr stream in + let ast' = check_any_expr ast in + let loc = mkloc (1,0,0) (last_pos stream) in + let expr = generate_for_any_expr loc ast' in + <:expr< $anti:expr$ >> + ) +;; + + 
+let expand_evlist_expr s = + let loc = mkloc (0,0,0) (0,0,0) in (* ??? *) + let rec generate_tree annlist = + match annlist with + (`Single, e) :: annlist' -> + let rest = generate_tree annlist' in + <:expr< [$e$ :: $rest$] >> + | ((`Tree | `Forest), e) :: annlist' -> + let rest = generate_tree annlist' in + <:expr< $e$ @ $rest$ >> + | [] -> + <:expr< [] >> + in + let generate_forest annlist = generate_tree annlist in + check_file(); + generate_event_generator generate_tree generate_forest s +;; + + +let expand_evpull_expr s = + let loc = mkloc (0,0,0) (0,0,0) in (* ??? *) + let generate_tree annlist = + let rec generate_match k annlist = + match annlist with + (`Single, e) :: annlist' -> + ( <:patt< $int:string_of_int k$ >>, + None, + <:expr< let ev = $e$ in + do { _state.val := $int:string_of_int(k+1)$; + Some ev } + >> ) :: generate_match (k+1) annlist' + | ((`Tree | `Forest), e) :: annlist' -> + ( <:patt< $int:string_of_int k$ >>, + None, + <:expr< match $e$ _arg with + [ None -> do { _state.val := $int:string_of_int(k+1)$; + _generator _arg } + | Some Pxp_types.E_end_of_stream -> _generator _arg + | Some ev -> Some ev ] + >> ) :: generate_match (k+1) annlist' + | [] -> + [ <:patt< $int:string_of_int k$ >>, + None, + <:expr< None >>; + + <:patt< _ >>, + None, + <:expr< assert False >> + ] + in + <:expr< let rec _generator = + let _state = ref 0 in + fun _arg -> + match _state.val with + [$list:generate_match 0 annlist$] + in _generator >> + in + let generate_forest annlist = generate_tree annlist in + check_file(); + generate_event_generator generate_tree generate_forest s +;; + + + +(**********************************************************************) +(* Other expanders *) + let expand_charset_expr s = check_file(); catch_errors @@ -899,5 +1179,9 @@ Quotation.add "pxp_tree" (Quotation.ExAst(expand_tree_expr false, na_pat)) ;; Quotation.add "pxp_vtree" (Quotation.ExAst(expand_tree_expr true, na_pat)) ;; +Quotation.add + "pxp_evlist" 
(Quotation.ExAst(expand_evlist_expr, na_pat)) ;; +Quotation.add + "pxp_evpull" (Quotation.ExAst(expand_evpull_expr, na_pat)) ;; Quotation.add "pxp_text" (Quotation.ExAst(expand_text_expr, na_pat)) ;;