Commit f1f00e4a authored by Erick's avatar Erick

Added UTF-8 support to string-ref, string<->list

parent b60d70f9
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 5-Jan-2000 12:17 (eg)
* Last file update: 18-Apr-2011 23:46 (eg)
* Last file update: 6-May-2011 20:01 (eg)
*/
......@@ -38,6 +38,7 @@ STk_init_library(int *argc, char ***argv, int stack_size)
STk_get_stack_pointer(&start_stack);
return
STk_init_env() &&
STk_init_symbol() &&
......@@ -77,5 +78,8 @@ STk_init_library(int *argc, char ***argv, int stack_size)
STk_init_box() &&
STk_init_blob() &&
STk_init_ffi() &&
#ifdef STK_DEBUG
STk_init_utf8() &&
#endif
(STk_library_initialized = TRUE);
}
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 28-Dec-1999 22:58 (eg)
* Last file update: 5-May-2011 23:35 (eg)
* Last file update: 6-May-2011 20:53 (eg)
*/
......@@ -943,7 +943,6 @@ struct port_obj {
**** sio.h primitives
****/
#define UTF8_INCORRECT_SEQUENCE (-2)
int STk_readyp(SCM port);
int STk_getc(SCM port);
......@@ -1251,12 +1250,15 @@ int STk_init_mutexes(void);
------------------------------------------------------------------------------
*/
#define UTF8_INCORRECT_SEQUENCE (-2)
extern int STk_use_utf8;
char *STk_utf8_grab_char(char *str, int *c); /* result = pos. after current one */
int STk_char2utf8(int ch, char *str); /* result = length of the UTF-8 repr. */
int STk_utf8_strlen(char *s, int max);
int STk_utf8_read_char(SCM port);
int STk_utf8_char_bytes_needed(unsigned int ch);/* # of bytes needed to represent ch*/
int STk_init_utf8(void);
......
......@@ -22,7 +22,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: ??????
* Last file update: 5-May-2011 23:53 (eg)
* Last file update: 6-May-2011 21:37 (eg)
*/
#include <ctype.h>
......@@ -323,7 +323,19 @@ DEFINE_PRIMITIVE("string-ref", string_ref, subr2, (SCM str, SCM index))
if (k < 0 || k >= STRING_SIZE(str))
STk_error("index ~S out of bound in string ~S", index, str);
return MAKE_CHARACTER(STRING_CHARS(str)[k]);
if (STRING_SIZE(str) == STRING_LENGTH(str))
/* string doesn't contain multibytes chars */
return MAKE_CHARACTER(STRING_CHARS(str)[k]);
else {
/* We have multibytes chars */
int c;
char *s = STRING_CHARS(str);
do
s = STk_utf8_grab_char(s, &c);
while (k--);
return MAKE_CHARACTER(c);
}
}
......@@ -509,18 +521,19 @@ doc>
DEFINE_PRIMITIVE("string->list", string2list, subr1, (SCM str))
{
register char *s;
int len;
int len, c;
SCM tmp, tmp1, z;
if (!STRINGP(str)) error_bad_string(str);
len = STRING_SIZE(str);
len = STRING_LENGTH(str);
s = STRING_CHARS(str);
tmp = z = STk_nil;
while (len--) {
tmp1 = STk_cons(MAKE_CHARACTER(*s++), STk_nil);
s = STk_utf8_grab_char(s, &c);
tmp1 = STk_cons(MAKE_CHARACTER(c), STk_nil);
if (z == STk_nil)
tmp = z = tmp1;
else
......@@ -531,18 +544,26 @@ DEFINE_PRIMITIVE("string->list", string2list, subr1, (SCM str))
DEFINE_PRIMITIVE("list->string", list2string, subr1, (SCM l))
{
int len = STk_int_length(l);
int bytes = 0, len = STk_int_length(l);
register char *s;
SCM z;
SCM z, tmp;
if (len < 0) STk_error("bad list ~S", l);
z = STk_makestring(len, NULL);
/* compute the number of bytes needed */
for (tmp=l; !NULLP(tmp); tmp=CDR(tmp)) {
if (!CHARACTERP(CAR(tmp))) error_bad_character(CAR(tmp));
bytes += STk_utf8_char_bytes_needed(CHARACTER_VAL(CAR(tmp)));
}
z = STk_makestring(bytes, NULL);
s = STRING_CHARS(z);
/* copy the characters in the newly allocated string */
for ( ; !NULLP(l); l=CDR(l)) {
if (!CHARACTERP(CAR(l))) error_bad_character(CAR(l));
*s++ = CHARACTER_VAL(CAR(l));
s += STk_char2utf8(CHARACTER_VAL(CAR(l)), s);
}
*s = '\0';
return z;
}
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 30-Apr-2011 19:46 (eg)
* Last file update: 5-May-2011 23:35 (eg)
* Last file update: 6-May-2011 20:52 (eg)
*/
#include "stklos.h"
......@@ -118,8 +118,9 @@ int STk_char2utf8(int ch, char *str) /* result = length of the UTF-8 repr. */
return n;
}
static int utf8_char_length(uint8_t ch)
int STk_utf8_char_bytes_needed(unsigned int ch)
{
/* # of bytes needed ro represent character ch */
if (ch < 0x80) return 1;
if (ch < 0x800) return 2;
if (ch < 0x10000) return 3;
......@@ -127,30 +128,28 @@ static int utf8_char_length(uint8_t ch)
return 1; /* to avoid infinite loop, but obiously incorrect */
}
static int utf8_sequence_length(uint8_t c)
{
/* return length of a the UTF-8 sequence given its first byte */
if (c < 0x80) return 1;
if ((c < 0xc0) || (c > 0xf7)) return UTF8_INCORRECT_SEQUENCE;
if (c < 0xe0) return 2;
if (c < 0xf0) return 3;
return 4;
}
int STk_utf8_strlen(char *s, int max)
{
int len;
char *end = s + max;
for (len = 0; (s < end) && *s; len++) {
s += utf8_char_length(*s);
s += utf8_sequence_length(*s);
}
return len;
}
#ifdef STK_DEBUG
void STk_dump_utf8_str(char *str)
{
printf("Dump of '%s' (len = %d)\n", str, strlen(str));
while (*str) {
printf("%03d %02x ", (uint8_t) *str, (uint8_t) *str);
str++;
}
printf("---\n");
}
#endif
/* ======================================================================
* STklos Primitives
* ====================================================================== */
......@@ -168,6 +167,31 @@ DEFINE_PRIMITIVE("%char-utf8-encoding", char_utf8_encoding, subr1, (SCM c))
lst = STk_cons(MAKE_INT(buffer[i]), lst);
return lst;
}
DEFINE_PRIMITIVE("%dump-string", dump_string, subr12, (SCM str, SCM index))
{
int i, c;
STk_debug("String ~S. space=%d, size=%d, len =%d\n", str,
STRING_SPACE(str), STRING_SIZE(str), STRING_LENGTH(str));
printf("[");
for (i=0; i < STRING_SIZE(str); i++)
printf(" %02x", (uint8_t) STRING_CHARS(str)[i]);
printf(" ]\n");
if (index) {
i = STk_integer_value(index);
printf("------\nChar starting at index %d\n", i);
STk_debug(" length of char = %d",
utf8_sequence_length(STRING_CHARS(str)[i]));
STk_utf8_grab_char(STRING_CHARS(str)+i, &c);
STk_debug(" character is %d ~S", (unsigned) c, MAKE_CHARACTER(c));
}
return STk_void;
}
#endif
......@@ -178,7 +202,7 @@ int STk_init_utf8(void)
{
#ifdef STK_DEBUG
ADD_PRIMITIVE(char_utf8_encoding);
ADD_PRIMITIVE(dump_string);
#endif
return TRUE;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment