Commit 65688d9e authored by Erick

Fixed a bug with indices of regexp-match-position with UTF-8 encoded strings

parent 04807707
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 24-Nov-2000 10:35 (eg)
* Last file update: 22-Aug-2011 00:35 (eg)
* Last file update: 18-Sep-2011 21:47 (eg)
*/
#include "stklos.h"
......@@ -230,11 +230,24 @@ static SCM regexec_helper(SCM re, SCM str, int pos_only)
LIST2(MAKE_INT(0), MAKE_INT(0)) :
STk_false,
result);
else
else {
int ifrom, ito;
char *s = STRING_CHARS(str);
int size = STRING_SIZE(str);
if (STRING_MONOBYTE(str)) {
ifrom = from;
ito = to;
} else {
ifrom = STk_utf8_char_from_byte(s, from, size);
ito = STk_utf8_char_from_byte(s, to, size);
}
result = STk_cons((pos_only)?
LIST2(STk_long2integer(from), STk_long2integer(to)) :
STk_makestring(to-from, STRING_CHARS(str)+from),
LIST2(STk_long2integer(ifrom), STk_long2integer(ito)) :
STk_makestring(to-from, s+from),
result);
}
}
return STk_dreverse(result);
......@@ -311,7 +324,6 @@ static struct extended_type_descr xtype_regexp = {
};
int STk_init_regexp(void)
{
DEFINE_XTYPE(regexp, &xtype_regexp);
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 28-Dec-1999 22:58 (eg)
* Last file update: 28-Aug-2011 15:04 (eg)
* Last file update: 9-Sep-2011 15:37 (eg)
*/
......@@ -1168,6 +1168,7 @@ struct string_obj {
#define STRING_CONST (1 << 0)
/* Non-zero when STRING_LENGTH(str) equals STRING_SIZE(str).
 * Presumably this means the character count matches the byte count, i.e. the
 * string holds no multi-byte UTF-8 sequence -- TODO confirm against the
 * definitions of STRING_LENGTH/STRING_SIZE.
 * Note: `str` is evaluated twice, so avoid arguments with side effects. */
#define STRING_MONOBYTE(str) (STRING_LENGTH(str) == STRING_SIZE(str))
SCM STk_makestring(int len, char *init);
SCM STk_Cstring2string(char *str); /* Embed a C string in Scheme world */
......@@ -1266,6 +1267,8 @@ int STk_utf8_read_char(SCM port);
int STk_utf8_sequence_length(char *str); /* # of bytes of sequence starting at str */
int STk_utf8_char_bytes_needed(unsigned int ch);/* # of bytes needed to represent ch*/
char *STk_utf8_index(char *s, int i, int max);/* return the address of ith char of s*/
int STk_utf8_char_from_byte(char *s, int i, int max); /* byte index => char index */
int STk_init_utf8(void);
......
......@@ -22,7 +22,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: ??????
* Last file update: 4-Sep-2011 22:16 (eg)
* Last file update: 9-Sep-2011 15:37 (eg)
*/
#include <ctype.h>
......@@ -33,7 +33,6 @@
/* min size added to a string when reallocated in a string-set! */
#define UTF8_STRING_INCR 8
#define STRING_MONOBYTE(str) (STRING_LENGTH(str) == STRING_SIZE(str))
/*
* Utilities
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 30-Apr-2011 19:46 (eg)
* Last file update: 19-Aug-2011 18:05 (eg)
* Last file update: 9-Sep-2011 14:43 (eg)
*/
#include "stklos.h"
......@@ -175,6 +175,25 @@ char *STk_utf8_index(char *s, int i, int max) /* return the address of ith char
return s;
}
/*
 * Convert a byte offset inside a UTF-8 buffer into a character index.
 *
 *   s   : start of the buffer
 *   i   : byte offset to convert
 *   max : size of the buffer, in bytes
 *
 * Returns the number of UTF-8 sequences that precede byte `i`, or -1 when
 * `i` is outside [0, max] or does not fall on the first byte of a sequence
 * (or exactly at the end of the walked data).
 * When STk_utf8_sequence_length() reports UTF8_INCORRECT_SEQUENCE,
 * error_bad_sequence() is called with the start of the buffer.
 */
int STk_utf8_char_from_byte(char *s, int i, int max) /* byte index => char index */
{
  int offset = 0;   /* byte offset of the sequence currently examined */
  int pos    = 0;   /* number of sequences skipped so far             */

  /* Validate the offset before doing any pointer arithmetic: building
   * `s + i` for an out-of-range `i` is undefined behaviour, and a truncated
   * trailing sequence could otherwise make an offset past `max` look valid. */
  if (i < 0 || i > max) return -1;

  /* Since i <= max, `offset < i` also guarantees we stay inside the buffer.
   * Stop as soon as we reach or step over `i`: continuing cannot make the
   * offsets equal again. */
  while (offset < i) {
    int sz = STk_utf8_sequence_length(s + offset);

    if (sz == UTF8_INCORRECT_SEQUENCE) {
      error_bad_sequence(s);
      /* NOTE(review): error_bad_sequence presumably does not return; this
       * guard only prevents advancing by the error sentinel if it does. */
      return -1;
    }
    offset += sz;
    pos    += 1;
  }

  /* offset > i means `i` pointed inside a multi-byte sequence. */
  return (offset == i) ? pos : -1;
}
/* ======================================================================
* STklos Primitives
* ====================================================================== */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment