Commit 90c9b752 authored by Erick's avatar Erick

- Added the possibility

      *  to insert hexadecimal character in symbols as in R6RS
      * to specify characters in hexadecimal
- Changed the syntax of hexadecimal escape sequence in strings. (Octal bytes
  are now deprecated)
- Added Continuation lines in strings compatible with R6RS
parent 99d72575
......@@ -18,7 +18,7 @@
;; USA.
;; Author: Erick Gallesio [eg@unice.fr]
;; Creation date: 26-Nov-2000 18:19 (eg)
;; Last file update: 14-Aug-2011 13:08 (eg)
;; Last file update: 16-Aug-2011 20:25 (eg)
;;
;; ======================================================================
......@@ -176,6 +176,16 @@ characters that must be entered ,(q "as is"). In order to maintain
read-write invariance, symbols containing such sequences of special
characters will be written between a pair of ,(q "|").])
(p [In addition, any character can be used within an identifier when
specified via an inline hex escape . For example, the identifier
,(tt "H\\x65;llo") is the same as the identifier ,(tt "Hello"), and, if the
UTF-8 encoding is used, the identifier ,(tt "\\x3BB;") is the same as
the identifier ,(symbol "lambda").])
(fontified-code [
'|a| ,(symbol-arrow) a
(string->symbol "a") ,(symbol-arrow) |A|
......@@ -211,7 +221,7 @@ which is also shown in this table.])
(tr (td "nul") (td "000") (td "null") (td "soh") (td "001") (td ""))
(tr (td "stx") (td "002") (td "") (td "etx") (td "003") (td ""))
(tr (td "eot") (td "004") (td "") (td "enq") (td "005") (td ""))
(tr (td "ack") (td "006") (td "") (td "bel") (td "007") (td "bell"))
(tr (td "ack") (td "006") (td "") (td "bel") (td "007") (td "alarm"))
(tr (td "bs") (td "010") (td "backspace") (td "ht") (td "011") (td "tab"))
(tr (td "nl") (td "012") (td "newline") (td "vt") (td "013") (td ""))
(tr (td "np") (td "014") (td "page") (td "cr") (td "015") (td "return"))
......@@ -226,6 +236,14 @@ which is also shown in this table.])
(tr (td "rs") (td "036") (td "") (td "us") (td "037") (td ""))
(tr (td "sp") (td "040") (td "space") (td "del") (td "177") (td "delete"))))
(p [,(stklos) supports the complete Unicode character set, if UTF-8 encoding is used. Hereafter, are some examples of characters:])
(fontified-code [
#\A ,(symbol-arrow) uppercase A
#\a ,(symbol-arrow) lowercase a
#\x41\; ,(symbol-arrow) the U+0041 character (uppercase A)
#\x03BB\; ,(symbol-arrow) ,(symbol "lambda")
])
(insertdoc 'char?)
(insertdoc 'char>=?)
(insertdoc 'char-ci>=?)
......@@ -248,6 +266,8 @@ the following table.])
(center
(table :rules 'cols :frame 'border
(tr :bg "#eeeeee" (th "Sequence") (th "Character inserted"))
(tr (td "\\a")
(td "Alarm"))
(tr (td "\\b")
(td "Backspace"))
(tr (td "\\e")
......@@ -256,23 +276,38 @@ the following table.])
(td " Newline"))
(tr (td "\\t")
(td " Horizontal Tab"))
(tr (td "\\n")
(tr (td "\\r")
(td " Carriage Return"))
(tr (td "\\\"")
(td " doublequote U+0022"))
(tr (td "\\\\")
(td " backslash U+005C"))
(tr (td "\\0abc")
(td " ASCII character with octal value abc"))
(tr (td "\\xab")
(td " ASCII character with hexadecimal value ab"))
(tr (td "\\<newline>")
(tr (td "\\x<hexa value>;")
(td " ASCII character with given hexadecimal value"))
(tr (td "\\<intraline whitespace><newline><intraline whitespace>")
(td " None (permits to enter a string on several lines)"))
(tr (td "\\<other>")
(td " <other>"))))
(p [For instance, the string])
(fontified-code ["ab\040c\\nd\
e"])
(fontified-code ["ab\\040\\x20;c\\nd\
e"])
(p [is the string consisting of the characters
,(code "#\\a"), ,(code "#\\b"), ,(code "#\\space"),
,(code "#\\a"), ,(code "#\\b"), ,(code "#\\space"), ,(code "#\\space"),
,(code "#\\c"), ,(code "#\\newline"), ,(code "#\\d") and ,(code "#\\e").])
(p (bold "Notes:")
(itemize
(item [Using octal code is limited to characters in the range 0
to #xFF. It is then not convenient to enter Unicode characters. This
form is deprecated should not be used anymore.])
(item [A line ending which is preceded by <intraline whitespace>
expands to nothing (along with any trailing <intraline
whitespace>), and can be used to indent strings for improved
legibility.])))
(insertdoc 'string?)
(insertdoc 'make-string)
(insertdoc 'string)
......@@ -305,7 +340,8 @@ function listed below just don't need to load the full SRFI to be used])
(insertdoc 'string-titlecase)
(insertdoc 'string-titlecase!))
(p [These functions are inspired from R6RS and R7RS.])
(p [The functions ,(tt "string-foldcase") and ,(tt "string-foldcase!") described
below are inspired from R6RS.])
(insertdoc 'string-foldcase)
(insertdoc 'string-foldcase!)
......
;;;;
;;;; repl-readline.stk -- REPL with GNU-Readline support
;;;;
;;;; Copyright 2010 Erick Gallesio - Polytech'Nice-Sophia <eg@unice.fr>
;;;;
;;;;
;;;;
;;;; Copyright 2010-2011 Erick Gallesio - Polytech'Nice-Sophia <eg@unice.fr>
;;;;
;;;;
;;;; This program is free software; you can redistribute it and/or modify
;;;; it under the terms of the GNU General Public License as published by
;;;; the Free Software Foundation; either version 2 of the License, or
;;;; (at your option) any later version.
;;;;
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this program; if not, write to the Free Software
;;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
;;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
;;;; USA.
;;;;
;;;;
;;;; Author: Erick Gallesio [eg@unice.fr]
;;;; Creation date: 15-May-2010 22:00 (eg)
;;;; Last file update: 6-Aug-2010 00:16 (eg)
;;;; Last file update: 16-Aug-2011 00:09 (eg)
;;;;
......@@ -49,10 +49,10 @@
(repl-prompt-use-color? #f)
;; We use the readl GNU readline. Good
(repl-make-prompt (lambda (module)
(ansi-color-protect "\x01" "\x02")
(ansi-color-protect "\x01;" "\x02;")
(old-make-prompt module)
(ansi-color-protect "" ""))))
(repl-display-prompt (lambda (port)
'nothing))
;;
......@@ -67,7 +67,7 @@
;;
(let* ((buff "")
(buff-index -1)
(fill-buff (lambda ()
(fill-buff (lambda ()
;; No more char. to read. Fill the buffer with readline
(set! buff (read-with-history (repl-prompt)))
(set! buff-index 0)
......@@ -98,7 +98,7 @@
(if (eof-object? buff)
(begin (fill-buff) #eof)
#f)))))
(repl-change-default-ports :in port))))
(define (try-initialize-repl-with-readline)
......
/*
/* -*- coding: utf-8 -*-
*
* c h a r . c -- Chaacters management
*
......@@ -23,7 +23,7 @@
*
* Author: Erick Gallesio [eg@kaolin.unice.fr]
* Creation date: ??????
* Last file update: 14-Aug-2011 13:26 (eg)
* Last file update: 16-Aug-2011 19:53 (eg)
*/
#include <ctype.h>
......@@ -38,7 +38,8 @@ struct charelem {
static struct charelem chartable [] = {
{"null", '\000'},
{"bell", '\007'},
{"alarm", '\007'}, /* R7RS name */
{"bell", '\007'}, /* old STklos name, for backward compatibility */
{"backspace", '\010'},
{"tab", '\011'},
{"newline", '\012'},
......@@ -277,7 +278,8 @@ static int charcompi(SCM c1, SCM c2)
if (!CHARACTERP(c1)) error_bad_char(c1);
if (!CHARACTERP(c2)) error_bad_char(c2);
return STk_use_utf8 ?
(towlower(CHARACTER_VAL(c1)) - towlower(CHARACTER_VAL(c2))):
(STk_casefold_char(CHARACTER_VAL(c1)) -
STk_casefold_char(CHARACTER_VAL(c2))):
(tolower((unsigned char) CHARACTER_VAL(c1)) -
tolower((unsigned char) CHARACTER_VAL(c2)));
}
......@@ -435,7 +437,6 @@ TEST_CTYPE(lower, "char-lower-case?")
/*=============================================================================*/
DEFINE_PRIMITIVE("char->integer", char2integer, subr1, (SCM c))
/*
<doc char->integer integer->char
* (char->integer char)
......@@ -459,8 +460,12 @@ DEFINE_PRIMITIVE("char->integer", char2integer, subr1, (SCM c))
* (char<=? (integer->char x)
* (integer->char y)) => #t
* @end lisp
* |integer->char| accepts an exact number between 0 and #xD7FFF or between
* #xE000 and #x10FFFF, if UTF8 encoding is used. Otherwise it accepts a
* number between0 and #xFF.
doc>
*/
DEFINE_PRIMITIVE("char->integer", char2integer, subr1, (SCM c))
{
if (!CHARACTERP(c)) error_bad_char(c);
return MAKE_INT((long) CHARACTER_VAL(c));
......@@ -473,7 +478,7 @@ DEFINE_PRIMITIVE("integer->char", integer2char, subr1, (SCM i))
if (STk_use_utf8) {
/* Unicode defines characters in the range [0, #xd7FF] U [#xE000, #x10FFFF] */
if (! ((0 <= c && c <= 0xd7ff) || (0xE000 <=c && c <= 0x10FFFF)))
if (! VALID_UTF8_VALUE(c))
STk_error("bad integer ~S (must be in range [0, #xd7FF] U [#xE000, #x10FFFF]",
i);
}
......@@ -514,7 +519,7 @@ DEFINE_PRIMITIVE("char-downcase", char_downcase, subr1, (SCM c))
tolower((unsigned char) CHARACTER_VAL(c)));
}
/*
<doc char-foldcase
<doc EXT char-foldcase
* (char-foldcase char)
*
* This procedure applies the Unicode simple case folding algorithm and returns
......
......@@ -20,7 +20,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 17-Feb-1993 12:27
* Last file update: 23-Jun-2011 23:47 (eg)
* Last file update: 16-Aug-2011 18:07 (eg)
*
*/
......
......@@ -20,7 +20,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: ??-Oct-1993 ??:??
* Last file update: 27-Jul-2011 15:28 (eg)
* Last file update: 16-Aug-2011 00:15 (eg)
*
*/
......@@ -92,6 +92,10 @@ static void error_bad_dotted_list(SCM port)
signal_error(port, "bad dotted list", STk_nil);
}
static void error_bad_inline_hexa_sequence(SCM port)
{
signal_error(port, "bad inline hexa sequence", STk_nil);
}
static void warning_parenthesis(SCM port)
{
......@@ -116,6 +120,44 @@ static int flush_spaces(SCM port, char *message, SCM file)
}
static int read_hex_sequence(SCM port, char* utf8_seq)
{
char *end, buffer[30]; /* normally max value is 10FFFFF */
int c, i = 0;
long int val;
/* assert: current char is 'x' */
do
c = buffer[i++] = STk_getc(port);
while ((i < sizeof(buffer) - 1) && isxdigit(c) && (c != ';') && (c != EOF));
buffer[i] = '\0';
if (c != ';')
error_bad_inline_hexa_sequence(port);
else {
val = strtol(buffer, &end, 16);
if (val == LONG_MIN || val == LONG_MAX || *end != ';')
error_bad_inline_hexa_sequence(port);
else
if (STk_use_utf8) {
int len = STk_char2utf8(val, utf8_seq);
if (len) return len;
} else {
if (0 <= val && val <= 0xFF) {
*utf8_seq = (char) val;
return 1;
}
}
}
/* if we are here , we have an error */
error_bad_inline_hexa_sequence(port);
return 0;
}
static SCM read_list(SCM port, char delim, struct read_context *ctx)
/* Read a list ended by the `delim' char */
{
......@@ -192,6 +234,25 @@ static int read_word(SCM port, int c, char *tok, int case_significant)
if (c != '|')
tok[j++] = (allchars || case_significant) ? c : tolower(c);
if (c == '\\') {
c = STk_getc(port);
if (c == 'x') {
/* This is an internal hexa sequence */
char buffer[5];
int len = read_hex_sequence(port, buffer);
if (j + len >= MAX_TOKEN_SIZE-1) {
tok[j] = '\0';
error_token_too_large(port, tok);
} else {
memcpy(tok + j-1, buffer, len);
j += len-1;
}
} else { /* c != 'x' */
STk_ungetc(c, port);
}
}
c = STk_getc(port);
if (c == EOF) break;
if (!allchars) {
......@@ -410,12 +471,11 @@ static void patch_references(SCM port, SCM l, SCM cycles)
static SCM read_string(SCM port, int constant)
{
int k ,c, n, hexa;
int k ,c, n;
size_t j, len;
char *p, *buffer;
SCM z;
hexa = 0;
j = 0;
len = 100;
p = buffer = STk_must_malloc(len);
......@@ -433,8 +493,34 @@ static SCM read_string(SCM port, int constant)
case 'r' : c = '\r'; break; /* Cr */
case 't' : c = '\t'; break; /* Tab */
case 'v' : c = '\v'; break; /* VTab */
case '\n': continue;
case 'x' : hexa = 1;
case ' ' : do {
c = STk_getc(port);
} while (c == ' ' || c == '\t');
if (c != '\n') {
signal_error(port, "bad line continuation sequence in string",
STk_nil);
} else {
/* No break */;
}
case '\n': do {
c = STk_getc(port);
} while (c == ' ' || c == '\t');
break;
case 'x' : {
char seq[5];
int seqlen = read_hex_sequence(port, seq);
if ((j + seqlen) >= len) {
len = len + len / 2;
buffer = STk_must_realloc(buffer, len);
p = buffer + j;
}
memcpy(p, seq, seqlen);
p += seqlen;
j += seqlen;
continue;
}
case '0' : for( k=n=0 ; ; k++ ) {
c = STk_getc(port);
if (c == EOF)
......@@ -443,21 +529,15 @@ static SCM read_string(SCM port, int constant)
STk_nil);
c &= 0377;
/* if hexa 2 digits max, if octal 3 digit max */
if (hexa && isxdigit(c) && k < 2) {
/* because of a GCC bug, factorisation is not possible */
c = tolower(c);
n = n * 16 + (isdigit(c) ? (c - '0'): (c - 'a' + 10));
}
else if (!hexa && isdigit(c) && (c < '8') && k < 3)
n = n * 8 + c - '0';
/* 3 digit max for bytes */
if (isdigit(c) && (c < '8') && k < 3)
n = n * 8 + c - '0';
else {
STk_ungetc(c, port);
break;
}
}
hexa = 0;
c = n & 0xff;
c = n & 0xff;
}
}
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 28-Dec-1999 22:58 (eg)
* Last file update: 14-Aug-2011 12:28 (eg)
* Last file update: 15-Aug-2011 19:43 (eg)
*/
......@@ -1255,6 +1255,11 @@ int STk_init_mutexes(void);
extern int STk_use_utf8;
#define VALID_UTF8_VALUE(c) \
/* Unicode defines characters in the range [0, #xd7FF] U [#xE000, #x10FFFF] */ \
((0 <= (c) && (c) <= 0xd7ff) || (0xE000 <=(c) && (c) <= 0x10FFFF))
char *STk_utf8_grab_char(char *str, uint32_t *c);/* result = pos. after current one */
int STk_char2utf8(int ch, char *str); /* result = length of the UTF-8 repr. */
int STk_utf8_strlen(char *s, int max);
......
......@@ -22,7 +22,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: ??????
* Last file update: 14-Aug-2011 12:32 (eg)
* Last file update: 16-Aug-2011 18:02 (eg)
*/
#include <ctype.h>
......@@ -60,9 +60,9 @@ static void error_index_out_of_bound(SCM str, SCM index)
STk_error("index ~S out of bound in string ~S", index, str);
}
static void error_bad_sequence(SCM str)
static void error_bad_sequence(char *str)
{
STk_error("bad UTF-8 sequence in ~S", str);
STk_error("bad UTF-8 sequence in '%s'", str);
}
......@@ -82,8 +82,10 @@ static int stringcomp(SCM s1, SCM s2)
str2 = STRING_CHARS(s2); end2 = str2 + STRING_SIZE(s2);
while ((str1 < end1) && (str2 < end2)) {
if ((str1 = STk_utf8_grab_char(str1, &ch1)) == NULL) error_bad_sequence(s1);
if ((str2 = STk_utf8_grab_char(str2, &ch2)) == NULL) error_bad_sequence(s2);
if ((str1 = STk_utf8_grab_char(str1, &ch1)) == NULL)
error_bad_sequence(STRING_CHARS(s1));
if ((str2 = STk_utf8_grab_char(str2, &ch2)) == NULL)
error_bad_sequence(STRING_CHARS(s2));
if (ch1 != ch2) return ch1 - ch2;
}
......@@ -122,8 +124,10 @@ static int stringcompi(SCM s1, SCM s2)
str2 = STRING_CHARS(s2); end2 = str2 + STRING_SIZE(s2);
while ((str1 < end1) && (str2 < end2)) {
if ((str1 = STk_utf8_grab_char(str1, &ch1)) == NULL) error_bad_sequence(s1);
if ((str2 = STk_utf8_grab_char(str2, &ch2)) == NULL) error_bad_sequence(s2);
if ((str1 = STk_utf8_grab_char(str1, &ch1)) == NULL)
error_bad_sequence(STRING_CHARS(s1));
if ((str2 = STk_utf8_grab_char(str2, &ch2)) == NULL)
error_bad_sequence(STRING_CHARS(s2));
if (towlower(ch1) != towlower(ch2)) return towlower(ch1) - towlower(ch2);
}
......@@ -806,6 +810,7 @@ DEFINE_PRIMITIVE("string-fill!", string_fill, subr2, (SCM str, SCM c))
static int Memmem(char *s1, int l1, char *s2, int l2, int use_utf8)
{
int pos;
char *start_s1 = s1;
if (l2 == 0) return 0;
......@@ -816,7 +821,7 @@ static int Memmem(char *s1, int l1, char *s2, int l2, int use_utf8)
if (use_utf8) {
int len = STk_utf8_sequence_length(s1);
if (len == UTF8_INCORRECT_SEQUENCE) STk_error("bad UTF-8 sequence");
if (len == UTF8_INCORRECT_SEQUENCE) error_bad_sequence(start_s1);
s1 += len;
} else {
s1++;
......
......@@ -21,7 +21,7 @@
*
* Author: Erick Gallesio [eg@unice.fr]
* Creation date: 30-Apr-2011 19:46 (eg)
* Last file update: 27-Jul-2011 22:38 (eg)
* Last file update: 16-Aug-2011 18:03 (eg)
*/
#include "stklos.h"
......@@ -29,6 +29,12 @@
int STk_use_utf8 = 1;
static void error_bad_sequence(char *str)
{
STk_error("bad UTF-8 sequence in '%s'", str);
}
char *STk_utf8_grab_char(char *str, uint32_t *c) /* result = pos. after current one */
{
uint8_t *buff = (uint8_t *) str;
......@@ -86,35 +92,36 @@ int STk_utf8_read_char(SCM port)
}
int STk_char2utf8(int ch, char *str) /* result = length of the UTF-8 repr. */
{
uint8_t *buff = (uint8_t *)str;
int n;
if (ch < 0x80) {
*buff++ = ch;
n = 1;
} else if (ch < 0x800) {
*buff++ = (ch >> 6) | 0xc0;
*buff++ = (ch & 0x3f) | 0x80;
n = 2;
} else if (ch < 0x10000) {
*buff++ = (ch >> 12) | 0xe0;
*buff++ = ((ch >> 6) & 0x3f) | 0x80;
*buff++ = (ch & 0x3f) | 0x80;
n = 3;
} else if (ch < 0x110000) {
*buff++ = (ch >> 18) | 0xF0;
*buff++ = ((ch >> 12) & 0x3F) | 0x80;
*buff++ = ((ch >> 6) & 0x3F) | 0x80;
*buff++ = (ch & 0x3F) | 0x80;
n = 4;
} else {
n = 0; /* to make gcc happy */
STk_error("bad UTF-8 character %d", ch);
}
int n = 0;
if (VALID_UTF8_VALUE(ch))
if (ch < 0x80) {
*buff++ = ch;
n = 1;
} else if (ch < 0x800) {
*buff++ = (ch >> 6) | 0xc0;
*buff++ = (ch & 0x3f) | 0x80;
n = 2;
} else if (ch < 0x10000) {
*buff++ = (ch >> 12) | 0xe0;
*buff++ = ((ch >> 6) & 0x3f) | 0x80;
*buff++ = (ch & 0x3f) | 0x80;
n = 3;
} else if (ch < 0x110000) {
*buff++ = (ch >> 18) | 0xf0;
*buff++ = ((ch >> 12) & 0x3f) | 0x80;
*buff++ = ((ch >> 6) & 0x3f) | 0x80;
*buff++ = (ch & 0x3f) | 0x80;
n = 4;
}
/* *buff = '\0'; */
return n;
}
......@@ -144,13 +151,13 @@ int STk_utf8_sequence_length(char *str)
int STk_utf8_strlen(char *s, int max)
{
int len;
char *end = s + max;
char *start = s, *end = s + max;
for (len = 0; s < end; len++) {
int sz = STk_utf8_sequence_length(s);
if (sz == UTF8_INCORRECT_SEQUENCE)
STk_error("bad UTF-8 sequence");
error_bad_sequence(start);
s += sz;
}
return len;
......@@ -158,13 +165,13 @@ int STk_utf8_strlen(char *s, int max)
char *STk_utf8_index(char *s, int i, int max) /* return the address of ith char of s*/
{
char *end = s + max;
char *start = s, *end = s + max;
while ((s < end) && i--) {
int sz = STk_utf8_sequence_length(s);
if (sz == UTF8_INCORRECT_SEQUENCE)
STk_error("bad UTF-8 sequence");
error_bad_sequence(start);
s += sz;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment