Commit 92418c9b authored by Erick's avatar Erick

Recognize UTF-8 strings in regexps

parent 686b9eeb
......@@ -6279,6 +6279,7 @@ then
PCRE_FLAGS="-Dregexec=PCRE_regexec -Dregcomp=PCRE_regcomp \
-Dregerror=PCRE_regerror -Dregfree=PCRE_regfree"
(cd pcre; CC=${CC} CFLAGS="${CFLAGS} $PCRE_FLAGS" ./configure \
--enable-utf8 \
--prefix=$prefix --disable-shared --disable-cpp) || \
{ echo "Cannot configure PCRE"; exit; }
PCRE=pcre
......
......@@ -2,7 +2,7 @@
###
### Author: Erick Gallesio [eg@unice.fr]
### Creation date: 28-Dec-1999 21:19 (eg)
### Last file update: 27-Jul-2011 23:01 (eg)
### Last file update: 21-Aug-2011 12:56 (eg)
AC_PREREQ(2.64)
AC_INIT([stklos], [1.10])
......@@ -297,6 +297,7 @@ then
PCRE_FLAGS="-Dregexec=PCRE_regexec -Dregcomp=PCRE_regcomp \
-Dregerror=PCRE_regerror -Dregfree=PCRE_regfree"
(cd pcre; CC=${CC} CFLAGS="${CFLAGS} $PCRE_FLAGS" ./configure \
--enable-utf8 \
--prefix=$prefix --disable-shared --disable-cpp) || \
{ echo "Cannot configure PCRE"; exit; }
PCRE=pcre
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@essi.fr]
* Creation date: 1-Jul-2003 11:38 (eg)
* Last file update: 27-May-2011 22:34 (eg)
* Last file update: 19-Aug-2011 18:00 (eg)
*/
......@@ -250,7 +250,7 @@ DEFINE_PRIMITIVE("%parameter-dynenv-pop!", parameter_dynenv_pop, subr1, (SCM par
*
\*===========================================================================*/
struct extended_type_descr xtype_parameter = { "parameter", NULL };
static struct extended_type_descr xtype_parameter = { "parameter", NULL };
int STk_init_parameter(void)
{
......
/*
* promise.c -- Implementation of promises
*
* Copyright © 2000-2005 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
* Copyright © 2000-2011 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
*
*
* This program is free software; you can redistribute it and/or modify
......@@ -22,7 +22,7 @@
* Author: Erick Gallesio [eg@unice.fr]
* Author: Erick Gallesio [eg@kaolin.unice.fr]
* Creation date: 2-Jun-1993 12:27 (eg)
* Last file update: 24-Apr-2005 21:55 (eg)
* Last file update: 19-Aug-2011 18:01 (eg)
*/
#include <stklos.h>
......@@ -146,7 +146,7 @@ static void print_promise(SCM promise, SCM port, int mode)
}
struct extended_type_descr xtype_promise = {
static struct extended_type_descr xtype_promise = {
"promise",
print_promise
};
......
/*
* regexp.c -- STklos Regexps
*
* Copyright © 2000-2010 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
* Copyright © 2000-2011 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
*
*
* This program is free software; you can redistribute it and/or modify
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 24-Nov-2000 10:35 (eg)
* Last file update: 4-Apr-2010 12:22 (eg)
* Last file update: 21-Aug-2011 13:33 (eg)
*/
#include "stklos.h"
......@@ -55,20 +55,29 @@
* So what we do here is potentially FALSE. However, it is unlikely
* that the code included in STklos is not compatible with the one
* used to compile the installed library (this file seems to have
* been always "semantically" compatible). The only point wehr we
* can have difference shoul be the definition of regmatch_t type
* been always "semantically" compatible). The only point where we
* can have difference should be the definition of regmatch_t type
*/
# include "../pcre/pcreposix.h"
#endif
/*
 * PCRE_COMP_FLAG is the compilation flag passed to PCRE_regcomp when
 * UTF-8 strings are in use. It expands to REG_UTF8 when that macro is
 * defined (presumably by the pcreposix.h header included above -- to be
 * confirmed) and to 0, i.e. no extra flag, otherwise.
 */
#ifdef REG_UTF8
# define PCRE_COMP_FLAG REG_UTF8
#else
# define PCRE_COMP_FLAG 0
#endif
/* ---------------------------------------------------------------------- */
/*
 * Representation of a compiled regexp object: it keeps both the original
 * pattern and the buffer produced by the regexp compiler.
 */
struct regexp_obj {
/* object header (stk_header is declared elsewhere) */
stk_header header;
/* the string object the regexp was compiled from */
SCM src;
/* compiled form of the pattern, filled in by PCRE_regcomp */
regex_t buffer;
};
/* REGEXPP(p): true when p is a boxed object whose type is tc_regexp */
#define REGEXPP(p) (BOXED_TYPE_EQ((p), tc_regexp))
/* REGEXP_SRC(p): the source pattern stored in regexp p (usable as an lvalue) */
#define REGEXP_SRC(p) (((struct regexp_obj *) (p))->src)
/* REGEXP_BUFFER(p): the regex_t compiled buffer stored in regexp p */
#define REGEXP_BUFFER(p) (((struct regexp_obj *) (p))->buffer)
/*
 * REGEXP_DEPTH(p): the re_nsub field of the compiled buffer (per POSIX
 * regcomp, the number of parenthesized subexpressions of the pattern)
 */
#define REGEXP_DEPTH(p) ((((struct regexp_obj *) (p))->buffer).re_nsub)
......@@ -99,6 +108,13 @@ static void regexp_finalizer(SCM re)
PCRE_regfree(&REGEXP_BUFFER(re));
}
/*
 * print_regexp -- print function of the regexp type.
 *
 * Writes on PORT a representation of the regexp OBJ, made of the source
 * pattern it was built from and the address of the object (in hexadecimal).
 * The MODE argument is accepted but not used.
 */
static void print_regexp(SCM obj, SCM port, int mode)
{
  SCM pattern           = REGEXP_SRC(obj);
  unsigned long address = (unsigned long) obj;

  STk_fprintf(port, "#[regexp '%s' @ %lx]", STRING_CHARS(pattern), address);
}
/*
<doc EXT string->regexp
......@@ -119,10 +135,12 @@ DEFINE_PRIMITIVE("string->regexp", str2regexp, subr1, (SCM re))
int ret;
if (!STRINGP(re)) error_bad_string(re);
NEWCELL_ATOMIC(z, regexp, sizeof(struct regexp_obj) );
ret = PCRE_regcomp(&REGEXP_BUFFER(z), STRING_CHARS(re), 0);
NEWCELL(z, regexp);
ret = PCRE_regcomp(&REGEXP_BUFFER(z),
STRING_CHARS(re),
STk_use_utf8? PCRE_COMP_FLAG: 0);
REGEXP_SRC(z) = re;
if (ret) signal_regexp_error(ret, &REGEXP_BUFFER(z));
STk_register_finalizer(z, regexp_finalizer);
......@@ -186,6 +204,7 @@ static SCM regexec_helper(SCM re, SCM str, int pos_only)
int i, ret, depth, max;
SCM result;
STk_debug("regexec helper ~S ~S", re, str);
/* RE can be a string or an already compiled regexp */
if (STRINGP(re)) re = STk_str2regexp(re);
else if (!REGEXPP(re)) STk_error("bad compiled regexp ~S", re);
......@@ -218,6 +237,7 @@ static SCM regexec_helper(SCM re, SCM str, int pos_only)
STk_makestring(to-from, STRING_CHARS(str)+from),
result);
}
STk_debug("regexec helper ~S ~S => ", re, str, STk_reverse(result));
return STk_dreverse(result);
}
......@@ -288,7 +308,7 @@ DEFINE_PRIMITIVE("regexp-quote", regexp_quote, subr1, (SCM str))
/* The structure which describes the regexp type */
static struct extended_type_descr xtype_regexp = {
"regexp", /* name */
NULL /* print function */
print_regexp /* print function */
};
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 30-Apr-2011 19:46 (eg)
* Last file update: 16-Aug-2011 18:03 (eg)
* Last file update: 19-Aug-2011 18:05 (eg)
*/
#include "stklos.h"
......@@ -92,16 +92,12 @@ int STk_utf8_read_char(SCM port)
}
int STk_char2utf8(int ch, char *str) /* result = length of the UTF-8 repr. */
{
uint8_t *buff = (uint8_t *)str;
int n = 0;
if (VALID_UTF8_VALUE(ch))
if (VALID_UTF8_VALUE(ch)) {
if (ch < 0x80) {
*buff++ = ch;
n = 1;
......@@ -121,6 +117,7 @@ int STk_char2utf8(int ch, char *str) /* result = length of the UTF-8 repr. */
*buff++ = (ch & 0x3f) | 0x80;
n = 4;
}
}
/* *buff = '\0'; */
return n;
}
......@@ -198,7 +195,8 @@ DEFINE_PRIMITIVE("%char-utf8-encoding", char_utf8_encoding, subr1, (SCM c))
DEFINE_PRIMITIVE("%dump-string", dump_string, subr12, (SCM str, SCM index))
{
int i, c=0;
int i;
uint32_t c = 0;
STk_debug("String ~S. space=%d, size=%d, len =%d", str,
STRING_SPACE(str), STRING_SIZE(str), STRING_LENGTH(str));
......@@ -213,8 +211,7 @@ DEFINE_PRIMITIVE("%dump-string", dump_string, subr12, (SCM str, SCM index))
printf("------\nChar starting at index %d\n", i);
STk_debug(" length of char = %d",
STk_utf8_sequence_length(&(STRING_CHARS(str)[i])));
STk_utf8_grab_char(
STRING_CHARS(str)+i, &c);
STk_utf8_grab_char(STRING_CHARS(str)+i, &c);
STk_debug(" character is %d ~S", (unsigned) c, MAKE_CHARACTER(c));
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment