Commit 17425537 authored by Erick's avatar Erick

All string-* functions are now able to manage string encoded in UTF-8

parent 0b1b8180
......@@ -2,7 +2,7 @@
#
# Author: Erick Gallesio [eg@unice.fr]
# Creation date: 11-Apr-2000 10:30 (eg)
# Last file update: 17-Apr-2011 21:33 (eg)
# Last file update: 10-Aug-2011 09:30 (eg)
SUBDIRS = Match.d SILex.d Lalr.d ScmPkg.d
......@@ -33,6 +33,7 @@ scheme_BOOT = assembler.stk \
repl.stk \
repl-readline.stk \
runtime.stk \
str.stk \
srfi-0.stk \
struct.stk \
thread.stk
......
......@@ -19,7 +19,7 @@
#
# Author: Erick Gallesio [eg@unice.fr]
# Creation date: 11-Apr-2000 10:30 (eg)
# Last file update: 17-Apr-2011 21:33 (eg)
# Last file update: 10-Aug-2011 09:30 (eg)
VPATH = @srcdir@
pkgdatadir = $(datadir)/@PACKAGE@
......@@ -272,6 +272,7 @@ scheme_BOOT = assembler.stk \
repl.stk \
repl-readline.stk \
runtime.stk \
str.stk \
srfi-0.stk \
struct.stk \
thread.stk
......
......@@ -21,12 +21,13 @@
;;;;
;;;; Author: Erick Gallesio [eg@unice.fr]
;;;; Creation date: 01-Jan-2000 15:04 (eg)
;;;; Last file update: 23-Apr-2011 18:56 (eg)
;;;; Last file update: 10-Aug-2011 00:15 (eg)
;;;;
(include "runtime.stk") ; Definition necessary for the bootstrap
(include "module.stk") ; All the macros for defining modules
(include "r5rs.stk") ; R5RS stuff written in Scheme
(include "str.stk") ; String functions UTF-8 aware
(include "callcc.stk") ; Call/cc support
(include "struct.stk") ; STklos structures
(include "bonus.stk") ; Extended functions and syntaxes
......
;;;;
;;;; srfi-0.stk -- SRFI-0 aka cond-expand
;;;;
;;;; Copyright 1999-2010 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
;;;; Copyright 1999-2011 Erick Gallesio - I3S-CNRS/ESSI <eg@unice.fr>
;;;;
;;;;
;;;; This program is free software; you can redistribute it and/or modify
......@@ -21,7 +21,7 @@
;;;;
;;;; Author: Erick Gallesio [eg@unice.fr]
;;;; Creation date: 30-Aug-1999 16:26 (eg)
;;;; Last file update: 7-Nov-2010 16:14 (eg)
;;;; Last file update: 10-Aug-2011 09:25 (eg)
;;;;
(define-module SRFI-0
......@@ -32,8 +32,13 @@
(select-module SRFI-0)
(define *all-features*
'(stklos ; Of course
`(stklos ; Of course
STklos ; In case of cases-sensitive reader
,(running-os) ; OS used
,@(if (%use-utf8?) ; UTF-8 is used
'(utf-8 UTF-8)
'())
; ===============================================
srfi-0 ; COND-EXPAND
((srfi-1 lists) "srfi-1") ; List primitives
((srfi-2 and-let*) "srfi-2") ; AND-LET*
......
;;;;
;;;; str.stk -- string operations
;;;;
;;;; Copyright © 2011 Erick Gallesio - Polytech'Nice-Sophia <eg@unice.fr>
;;;;
;;;;
;;;; This program is free software; you can redistribute it and/or modify
;;;; it under the terms of the GNU General Public License as published by
;;;; the Free Software Foundation; either version 2 of the License, or
;;;; (at your option) any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this program; if not, write to the Free Software
;;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
;;;; USA.
;;;;
;;;; Author: Erick Gallesio [eg@unice.fr]
;;;; Creation date: 14-Jul-2011 16:42 (eg)
;;;; Last file update: 10-Aug-2011 16:17 (eg)
;;;;
;;; The functions defined here replace C primitives which are not UTF-8 aware.
;;; These functions still use their correspondant C primitive if the string
;;; argument doesn't use multi-byte characters
(when (%use-utf8?)
(let ((%string-split string-split)
(%string-blit! string-blit!)
(%string-titlecase string-titlecase)
(%string-titlecase! string-titlecase!)
(error-bad-string (lambda (str)
(error "bad string ~S" str))))
;;
;; string-split
;;
(set! string-split
(lambda (str :optional (delimiters " \t\n"))
(if (not (%string-use-utf8? str))
(%string-split str delimiters)
(begin
(unless (string? str)
(error-bad-string str))
(unless (string? delimiters)
(error-bad-string delimiters))
(let ((len (string-length str))
(delim (string->list delimiters)))
(let Loop ((start 0)
(current 0)
(res '()))
(cond
(( = current len)
(reverse! (if (> current start)
(cons (substring str start current) res)
res)))
((memq (string-ref str current) delim)
(if (> current start)
(Loop (+ current 1)
(+ current 1)
(cons (substring str start current) res))
(Loop (+ current 1)
(+ current 1)
res)))
(else
(Loop start
(+ current 1)
res)))))))))
;;
;; string-blit!
;;
(set! string-blit!
(lambda (str1 str2 offset)
(if (not (and (%string-use-utf8? str1) (%string-use-utf8? str2)))
(%string-blit! str1 str2 offset)
(begin
(unless (string? str1)
(error-bad-string str1))
(unless (string? str2)
(error-bad-string str2))
(unless (integer? offset)
(error "bad offset ~S" offset))
(unless (string-mutable? str1)
(error "changing the constant string ~S is not allowed"))
(let ((len1 (string-length str1))
(len2 (string-length str2)))
(cond
((and (zero? len1) (zero? offset))
str2)
((< (+ offset len2) len1)
;; str2 can be written in str1
(dotimes (i len2)
(string-set! str1 (+ offset i) (string-ref str2 i)))
str1)
(else
;; Size of original string changes => allocate a new string
(let* ((newl (if (>= len1 (+ offset len2)) len1 (+ offset len2)))
(new (make-string newl)))
(let Loop ((i 0)
(j 0))
(cond
((= i newl)
new)
((and (>= i offset) (< i (+ offset len2)))
(string-set! new i (string-ref str2 j))
(Loop (+ i 1) (+ j 1)))
((< i len1)
(string-set! new i (string-ref str1 i))
(Loop (+ i 1) j))
(else
(Loop (+ i 1) j))))))))))))
;;
;; string-titlecase
;;
(set! string-titlecase
(lambda (str :optional (start 0) (end -1))
(if (not (%string-use-utf8? str))
(%string-titlecase str start end)
(begin
(unless (string? str)
(error-bad-string str))
(unless (integer? start)
(error "bad starting index ~S" start))
(cond
((not (integer? end))
(error "bad ending index ~S" end))
((= end -1)
(set! end (string-length str))))
(let ((new (make-string (- end start))))
(let Loop ((i start)
(j 0)
(prev-is-sep? #t))
(if (= i end)
new
(let* ((curr (string-ref str i))
(curr-is-sep? (not (char-alphabetic? curr))))
(string-set! new j (cond
(curr-is-sep? curr)
(prev-is-sep? (char-upcase curr))
(else (char-downcase curr))))
(Loop (+ i 1) (+ j 1) curr-is-sep?)))))))))
;;
;; string-titlecase!
;;
(set! string-titlecase!
(lambda (str :optional (start 0) (end -1))
(if (not (%string-use-utf8? str))
(%string-titlecase! str start end)
(begin
(unless (string? str)
(error-bad-string str))
(unless (integer? start)
(error "bad starting index ~S" start))
(cond
((not (integer? end))
(error "bad ending index ~S" end))
((= end -1)
(set! end (string-length str))))
(let Loop ((i start)
(prev-is-sep? #t))
(if (= i end)
(void)
(let* ((curr (string-ref str i))
(curr-is-sep? (not (char-alphabetic? curr))))
(string-set! str i (cond
(curr-is-sep? curr)
(prev-is-sep? (char-upcase curr))
(else (char-downcase curr))))
(Loop (+ i 1) curr-is-sep?))))))))
))
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 28-Dec-1999 21:19 (eg)
* Last file update: 27-Jul-2011 15:36 (eg)
* Last file update: 10-Aug-2011 00:27 (eg)
*/
#include <stklos.h>
......@@ -98,7 +98,7 @@ static void Usage(char *progname, int only_version)
" -d, --debug add informations to ease debugging\n"
" -s, --stack-size=n use a stack of size n (default %d)\n"
" -c, --case-sensitive be case sensitive (default is #f)\n"
" -u, --utf8-encoding=n use/don't use UTF-8 encoding (default is yes)\n"
" -u, --utf8-encoding=yes|no use/don't use UTF-8 encoding (default is yes)\n"
" -v, --version print program version and exit\n"
" -h, --help print this help and exit\n"
"All the arguments given after options are passed to the Scheme program.\n",
......
......@@ -22,7 +22,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: ??????
* Last file update: 27-Jul-2011 23:27 (eg)
* Last file update: 10-Aug-2011 00:34 (eg)
*/
#include <ctype.h>
......@@ -802,14 +802,26 @@ DEFINE_PRIMITIVE("string-fill!", string_fill, subr2, (SCM str, SCM c))
*
*/
static char *Memmem(char *s1, int l1, char *s2, int l2)
static int Memmem(char *s1, int l1, char *s2, int l2, int use_utf8)
{
if (l2 == 0) return s1;
int pos;
for ( ; l1 >= l2 ; s1++, l1--)
if (memcmp(s1, s2, (unsigned int) l2) == 0) return s1;
if (l2 == 0) return 0;
return NULL;
for (pos=0; l1 >= l2 ; pos++, l1--) {
if (memcmp(s1, s2, (unsigned int) l2) == 0) return pos;
/* go to next character */
if (use_utf8) {
int len = STk_utf8_sequence_length(s1);
if (len == UTF8_INCORRECT_SEQUENCE) STk_error("bad UTF-8 sequence");
s1 += len;
} else {
s1++;
}
}
return -1; /* not found */
}
/*
......@@ -821,11 +833,16 @@ doc>
*/
DEFINE_PRIMITIVE("string-find?", string_find, subr2, (SCM s1, SCM s2))
{
int pos;
if (!STRINGP(s1)) error_bad_string(s1);
if (!STRINGP(s2)) error_bad_string(s2);
return MAKE_BOOLEAN(Memmem(STRING_CHARS(s2), STRING_SIZE(s2),
STRING_CHARS(s1), STRING_SIZE(s1)));
pos = Memmem(STRING_CHARS(s2), STRING_SIZE(s2),
STRING_CHARS(s1), STRING_SIZE(s1),
FALSE);
return MAKE_BOOLEAN(pos != -1);
}
/*
......@@ -842,15 +859,16 @@ doc>
*/
DEFINE_PRIMITIVE("string-index", string_index, subr2, (SCM s1, SCM s2))
{
char *p;
int pos;
if (!STRINGP(s1)) error_bad_string(s1);
if (!STRINGP(s2)) error_bad_string(s2);
p = Memmem(STRING_CHARS(s2), STRING_SIZE(s2),
STRING_CHARS(s1), STRING_SIZE(s1));
pos = Memmem(STRING_CHARS(s2), STRING_SIZE(s2),
STRING_CHARS(s1), STRING_SIZE(s1),
STk_use_utf8 && !STRING_MONOBYTE(s2));
return p ? STk_long2integer((p - (char*)STRING_CHARS(s2))) : STk_false;
return (pos != -1) ? STk_long2integer(pos) : STk_false;
}
......@@ -1260,6 +1278,12 @@ DEFINE_PRIMITIVE("string-pos", string_pos, subr2, (SCM str, SCM index))
}
*/
DEFINE_PRIMITIVE("%use-utf8?", using_utf8, subr0, (void))
{
return MAKE_BOOLEAN(STk_use_utf8);
}
DEFINE_PRIMITIVE("%string-use-utf8?", string_use_utf8, subr1, (SCM str))
{
if (!STRINGP(str)) error_bad_string(str);
......@@ -1268,7 +1292,6 @@ DEFINE_PRIMITIVE("%string-use-utf8?", string_use_utf8, subr1, (SCM str))
}
int STk_init_string(void)
{
ADD_PRIMITIVE(stringp);
......@@ -1306,6 +1329,7 @@ int STk_init_string(void)
ADD_PRIMITIVE(string_dtitlecase);
ADD_PRIMITIVE(string_blit);
ADD_PRIMITIVE(using_utf8);
ADD_PRIMITIVE(string_use_utf8);
return TRUE;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment