Commit 96eca9eb authored by Erick's avatar Erick

Added the R7RS utf8->string & string->utf8 functions

parent 7690d4bd
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 28-Dec-1999 22:58 (eg)
* Last file update: 21-Jun-2018 13:51 (eg)
* Last file update: 11-Jul-2018 16:30 (eg)
*/
......@@ -1264,6 +1264,7 @@ int STk_utf8_strlen(char *s, int max);
int STk_utf8_read_char(SCM port);
int STk_utf8_sequence_length(char *str); /* # of bytes of sequence starting at str */
int STk_utf8_char_bytes_needed(unsigned int ch);/* # of bytes needed to represent ch*/
int STk_utf8_verify_sequence(char *s, int len); /* s constitutes a valid UTF8? */
char *STk_utf8_index(char *s, int i, int max);/* return the address of ith char of s*/
int STk_utf8_char_from_byte(char *s, int i, int max); /* byte index => char index */
......
......@@ -22,7 +22,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: ??????
* Last file update: 6-Jul-2018 19:38 (eg)
* Last file update: 10-Jul-2018 20:33 (eg)
*/
#include <ctype.h>
......@@ -30,6 +30,9 @@
#include "stklos.h"
extern SCM STk_make_bytevector_from_string(char *str, long len);
/* min size added to a string when reallocated in a string-set! */
#define UTF8_STRING_INCR 8
......@@ -150,7 +153,7 @@ static int stringcompi(SCM s1, SCM s2)
}
static SCM control_index(int argc, SCM *argv, int *pstart, int *pend)
static SCM control_index(int argc, SCM *argv, long *pstart, long *pend)
{
SCM s = NULL;
long len, start=0, end=-1;
......@@ -757,7 +760,7 @@ doc>
*/
DEFINE_PRIMITIVE("string-copy", string_copy, vsubr, (int argc, SCM *argv))
{
int start, end;
long start, end;
control_index(argc, argv, &start, &end);
if (start == -1)
......@@ -991,7 +994,7 @@ static SCM string_xxcase(int argc, SCM *argv, int (*toxx)(int),
wint_t (*towxx)(wint_t))
{
SCM s;
int start, end;
long start, end;
s = control_index(argc, argv, &start, &end);
......@@ -1039,7 +1042,7 @@ static SCM string_dxxcase(int argc, SCM *argv, int (*toxx)(int),
wint_t (*towxx)(wint_t))
{
SCM s;
int i, start, end;
long i, start, end;
s = control_index(argc, argv, &start, &end);
if (BOXED_INFO(s) & STRING_CONST) error_change_const_string(s);
......@@ -1173,7 +1176,7 @@ doc>
DEFINE_PRIMITIVE("string-titlecase", string_titlecase, vsubr, (int argc, SCM *argv))
{
SCM s, z;
int start, end;
long start, end;
char *endp, *p, *q;
char prev_is_sep = 1, curr_is_sep;
......@@ -1204,7 +1207,7 @@ doc>
DEFINE_PRIMITIVE("string-titlecase!", string_dtitlecase,vsubr,(int argc, SCM *argv))
{
SCM s;
int start, end;
long start, end;
char *endp, *p;
char prev_is_sep = 1, curr_is_sep;
......@@ -1303,6 +1306,22 @@ DEFINE_PRIMITIVE("string-pos", string_pos, subr2, (SCM str, SCM index))
*/
DEFINE_PRIMITIVE("string->utf8", string2utf8, vsubr, (int argc, SCM *argv))
{
long start, end;
SCM str;
char *start_addr, *end_addr;
str = control_index(argc, argv, &start, &end);
start_addr = STk_utf8_index(STRING_CHARS(str), (int) start, STRING_SIZE(str));
end_addr = STk_utf8_index(STRING_CHARS(str), (int) end, STRING_SIZE(str));
return STk_make_bytevector_from_string(start_addr, end_addr - start_addr);
}
DEFINE_PRIMITIVE("%use-utf8?", using_utf8, subr0, (void))
{
return MAKE_BOOLEAN(STk_use_utf8);
......@@ -1371,6 +1390,7 @@ int STk_init_string(void)
ADD_PRIMITIVE(string_dtitlecase);
ADD_PRIMITIVE(string_blit);
ADD_PRIMITIVE(string2utf8);
ADD_PRIMITIVE(using_utf8);
ADD_PRIMITIVE(string_use_utf8);
ADD_PRIMITIVE(string2bytes);
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 30-Apr-2011 19:46 (eg)
* Last file update: 27-Mar-2018 14:04 (eg)
* Last file update: 11-Jul-2018 16:36 (eg)
*/
#include "stklos.h"
......@@ -168,6 +168,20 @@ int STk_utf8_strlen(char *s, int max)
return len;
}
int STk_utf8_verify_sequence(char *s, int len)
/* Are the len bytes starting at s constitute a valid UTF8 sequence? */
{
char *end = s + len;
while (s < end) {
int sz = STk_utf8_sequence_length(s);
if (sz == UTF8_INCORRECT_SEQUENCE) return 0;
s += sz;
}
return (s == end);
}
char *STk_utf8_index(char *s, int i, int max) /* return the address of ith char of s*/
{
char *start = s, *end = s + max;
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 15-Apr-2001 10:13 (eg)
* Last file update: 10-Jul-2018 16:10 (eg)
* Last file update: 11-Jul-2018 16:25 (eg)
*/
#include "stklos.h"
......@@ -596,6 +596,62 @@ DEFINE_PRIMITIVE("bytevector-append", bytevector_append, vsubr,(int argc, SCM *a
return z;
}
/*
<doc R7RS utf8->string string->utf8
* (utf8->string bytevector)
* (utf8->string bytevector start)
* (utf8->string bytevector start end)
* (string->utf8 string)
* (string->utf8 string start)
* (string->utf8 string start end)
*
* These procedures translate between strings and bytevectors
* that encode those strings using the UTF-8 encoding.
* The |utf8->string| procedure decodes the bytes of
* a bytevector between |start| and |end| and returns the
* corresponding string; the |string->utf8| procedure encodes the
* characters of a string between |start| and |end| and returns
* the corresponding bytevector.
*
* It is an error for |bytevector| to contain invalid UTF-8 byte
* sequences.
* @lisp
* (utf8->string #u8(#x41)) => "A"
* (string->utf8 "λ") => #u8((#xce #xbb)
* @end lisp
doc>
*/
SCM STk_make_bytevector_from_string(char *str, long len)
{
SCM z = makeuvect(UVECT_U8, len, (SCM) NULL);
memcpy(UVECTOR_DATA(z),str, len);
return z;
}
DEFINE_PRIMITIVE("utf8->string", utf82string, vsubr, (int argc, SCM *argv))
{
long start, end, len;
SCM v;
unsigned char *start_addr, *end_addr;
v = control_index(argc, argv, &start, &end, NULL);
start_addr = (unsigned char*) UVECTOR_DATA(v) + start;
end_addr = (unsigned char*) UVECTOR_DATA(v) + end;
len = end_addr - start_addr;
/* Verify that the sub-vector denotes a correct string */
if (STk_utf8_verify_sequence(start_addr, len)) {
SCM z = STk_makestring(len, NULL);
memcpy(STRING_CHARS(z), start_addr, end_addr - start_addr);
return z;
}
else
STk_error("bad UTF8 sequence between %d and %d in ~S", start, end, v);
return STk_void; /* for the compiler */
}
/* ====================================================================== */
int STk_init_uniform_vector(void)
......@@ -615,6 +671,7 @@ int STk_init_uniform_vector(void)
/* R7RS specific bytevectors primitives */
ADD_PRIMITIVE(bytevector_copy);
ADD_PRIMITIVE(bytevector_append);
ADD_PRIMITIVE(utf82string);
/* A pseudo primitive to launch the definition of all the function of SRFI-4 */
ADD_PRIMITIVE(allow_uvectors);
......
;;;; -*- coding: latin-1 -*-
;;;; -*- coding: utf-8 -*-
;;;;
;;;; test-r7rs.stk -- Testing R7RS constructs/primitives
;;;;
;;;; Copyright © 2011 Erick Gallesio - Polytech'Nice-Sophia <eg@unice.fr>
;;;; Copyright © 2011 Erick Gallesio - Polytech'Nice-Sophia <eg@unice.fr>
;;;;
;;;;
;;;; This program is free software; you can redistribute it and/or modify
......@@ -22,7 +22,7 @@
;;;;
;;;; Author: Erick Gallesio [eg@unice.fr]
;;;; Creation date: 17-Apr-2011 20:58 (eg)
;;;; Last file update: 10-Jul-2018 16:06 (eg)
;;;; Last file update: 11-Jul-2018 18:22 (eg)
;;;;
(require "test")
......@@ -349,6 +349,16 @@
(test "byte-vector-append.7" #u8(0 1 2 3) (bytevector-append #u8(0) #u8(1)
#u8() #u8(2) #u8(3)))
(test "chibi utf8->string.1" "ABC" (utf8->string #u8(#x41 #x42 #x43)))
(test "chibi utf8->string.2" "ABC" (utf8->string #u8(0 #x41 #x42 #x43) 1))
(test "chibi utf8->string.3""ABC" (utf8->string #u8(0 #x41 #x42 #x43 0) 1 4))
(test "chibi utf8->string.4" "λ" (utf8->string #u8(0 #xCE #xBB 0) 1 3))
(test "utf8->string.5" *test-failed* (utf8->string #u8(0 #xCE)))
(test "chibi string->utf8.1" #u8(#x41 #x42 #x43) (string->utf8 "ABC"))
(test "chibi string->utf8.2" #u8(#x42 #x43) (string->utf8 "ABC" 1))
(test "chibi string->utf8.3" #u8(#x42) (string->utf8 "ABC" 1 2))
(test "chibi string->utf8.4" #u8(#xCE #xBB) (string->utf8 "λ"))
;;------------------------------------------------------------------
(test-subsection "Lists and Pairs")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment