string.lisp 18.1 KB
Newer Older
Pascal Bourguignon's avatar
Pascal Bourguignon committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
;;;; -*- coding:utf-8 -*-
;;;;*****************************************************************************
;;;;FILE:               string.lisp
;;;;LANGUAGE:           Common-Lisp
;;;;SYSTEM:             clisp
;;;;USER-INTERFACE:     clisp
;;;;DESCRIPTION
;;;;
;;;;    This module exports string functions.
;;;;
;;;;AUTHORS
;;;;    <PJB> Pascal J. Bourguignon <pjb@informatimago.com>
;;;;MODIFICATIONS
;;;;    2003-01-30 <PJB> Creation.
;;;;BUGS
;;;;LEGAL
17 18
;;;;    AGPL3
;;;;    
19
;;;;    Copyright Pascal J. Bourguignon 2003 - 2012
20 21 22 23 24 25 26 27 28 29 30 31 32
;;;;    
;;;;    This program is free software: you can redistribute it and/or modify
;;;;    it under the terms of the GNU Affero General Public License as published by
;;;;    the Free Software Foundation, either version 3 of the License, or
;;;;    (at your option) any later version.
;;;;    
;;;;    This program is distributed in the hope that it will be useful,
;;;;    but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;;    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;;;    GNU Affero General Public License for more details.
;;;;    
;;;;    You should have received a copy of the GNU Affero General Public License
;;;;    along with this program.  If not, see <http://www.gnu.org/licenses/>.
Pascal Bourguignon's avatar
Pascal Bourguignon committed
33 34 35
;;;;*****************************************************************************


36 37
(defpackage "COM.INFORMATIMAGO.CLISP.STRING"
  (:documentation "This module exports string functions.")
38 39 40 41
  (:use "COMMON-LISP"
        "REGEXP"
        "COM.INFORMATIMAGO.COMMON-LISP.CESARUM.LIST"
        "COM.INFORMATIMAGO.COMMON-LISP.CESARUM.STRING")
42
  (:export "SPLIT-STRING" "UNSPLIT-STRING"
Pascal Bourguignon's avatar
Pascal Bourguignon committed
43 44 45
           "STRING-MATCH" "STRING-TO-NUMBER"
           "CAPITALIZATION" "REPLACE-REGEXP-IN-STRING"
           "SUBSTRING"))
46
(in-package "COM.INFORMATIMAGO.CLISP.STRING")
Pascal Bourguignon's avatar
Pascal Bourguignon committed
47 48 49 50 51 52



;; We have our own implementation of SPLIT-STRING using REGEXP,
;; specific to CLISP.

53 54 55 56
(defparameter split-string-default-separators
  (format nil "[ ~C~C~C~C~C]\\+"
          (code-char 9) (code-char 10) (code-char 11) (code-char 12)
          (code-char 13))
57
  "The default separators for split-string (HT, LF, VT, FF, CR, SP)")
Pascal Bourguignon's avatar
Pascal Bourguignon committed
58 59


60
(defun split-string (string &optional separators)
Pascal Bourguignon's avatar
Pascal Bourguignon committed
61 62 63
  "
NOTE:   This implementation uses he REGEXP package.
"
64 65 66 67 68 69 70
  (unless separators (setq separators split-string-default-separators))
  (let ((result (regexp:regexp-split separators string)))
    (if (string= "" (car result))
        (setq result (cdr result)))
    (if (string= "" (car (last result)))
        (setq result (nbutlast result)))
    result))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
71

72
;; But we inherit UNSPLIT-STRING from COM.INFORMATIMAGO.COMMON-LISP.CESARUM.STRING.
Pascal Bourguignon's avatar
Pascal Bourguignon committed
73 74


75 76 77 78 79
(defun string-match (regexp string &key (start 0) (end nil)
                     (case-sensitive nil)
                     (extended nil)
                     (newline nil)
                     (nosub nil))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
80
  "An alias for REGEXP:MATCH."
81 82 83 84 85
  (regexp:match regexp string
                :start start :end end
                :ignore-case (not case-sensitive)
                :extended extended
                :newline newline :nosub nosub))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
86 87 88



89
(defvar *case-fold-search* nil
Pascal Bourguignon's avatar
Pascal Bourguignon committed
90 91
  "Whether searches and matches should ignore case.
Used by: REPLACE-REGEXP-IN-STRING.
92
")
Pascal Bourguignon's avatar
Pascal Bourguignon committed
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138



;;; CAPITALIZATION:
;;;
;;; 0  NIL
;;; 1   T
;;;
;;; 0  Upcase
;;; 1  Lowcase
;;; 2  Nocase
;;; 3  Special
;;;
;;; STATE:   (BOW,U/L/N/S)
;;;   Initial state: (NIL SP)
;;;
;;;   ((NIL UP) UP) --> (NIL UP) NOT NO-2C-WORD
;;;   ((NIL UP) LO) --> (NIL LO) NOT NO-2C-WORD NOT ALL-UPCASE
;;;   ((NIL UP) NO) --> (NIL NO) NOT NO-2C-WORD 
;;;   ((NIL UP) SP) --> (NIL SP)
;;;   ((NIL LO) UP) --> (NIL UP) NOT NO-2C-WORD NOT ALL-LOCASE NOT ALL-CAPITA 
;;;   ((NIL LO) LO) --> (NIL LO) NOT NO-2C-WORD
;;;   ((NIL LO) NO) --> (NIL NO) NOT NO-2C-WORD
;;;   ((NIL LO) SP) --> (NIL SP)
;;;   ((NIL NO) UP) --> (NIL UP) NOT NO-2C-WORD
;;;   ((NIL NO) LO) --> (NIL LO) NOT NO-2C-WORD
;;;   ((NIL NO) NO) --> (NIL NO) NOT NO-2C-WORD
;;;   ((NIL NO) SP) --> (NIL SP)
;;;   ((NIL SP) UP) --> ( T  UP) NOT ALL-LOCASE  
;;;   ((NIL SP) LO) --> ( T  LO) NOT ALL-UPCASE  NOT ALL-CAPITA
;;;   ((NIL SP) NO) --> ( T  NO)
;;;   ((NIL SP) SP) --> (NIL SP)
;;;   (( T  UP) UP) --> (NIL UP) NOT NO-2C-WORD NOT ALL-LOCASE NOT ALL-CAPITA
;;;   (( T  UP) LO) --> (NIL LO) NOT NO-2C-WORD NOT ALL-UPCASE  
;;;   (( T  UP) NO) --> (NIL NO) NOT NO-2C-WORD
;;;   (( T  UP) SP) --> (NIL SP)
;;;   (( T  LO) UP) --> (NIL UP) NOT NO-2C-WORD NOT ALL-LOCASE
;;;   (( T  LO) LO) --> (NIL LO) NOT NO-2C-WORD NOT ALL-UPCASE  
;;;   (( T  LO) NO) --> (NIL NO) NOT NO-2C-WORD
;;;   (( T  LO) SP) --> (NIL SP)
;;;   (( T  NO) UP) --> (NIL UP) NOT NO-2C-WORD NOT ALL-LOCASE NOT ALL-CAPITA 
;;;   (( T  NO) LO) --> (NIL LO) NOT NO-2C-WORD NOT ALL-UPCASE  
;;;   (( T  NO) NO) --> (NIL NO) NOT NO-2C-WORD
;;;   (( T  NO) SP) --> (NIL SP)
;;;    ( T  SP) is impossible.

139 140 141
(defparameter +capitalization-transitions+
  (make-array '(2 4 4)
              :initial-contents
Pascal Bourguignon's avatar
Pascal Bourguignon committed
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
              '((( (0 0 3)
                  (0 1 3 0)
                  (0 2 3)
                  (0 3) )
                 ( (0 0 3 1 2)
                  (0 1 3)
                  (0 2 3)
                  (0 3) )
                 ( (0 0 3)
                  (0 1 3)
                  (0 2 3)
                  (0 3) )
                 ( (1 0 1)
                  (1 1 0 2)
                  (1 2)
                  (0 3) ))
                (( (0 0 3 1 2)
                  (0 1 3 0)
                  (0 2 3)
                  (0 3) )
                 ( (0 0 3 1)
                  (0 1 3 0)
                  (0 2 3)
                  (0 3) )
                 ( (0 0 3 1 2)
                  (0 1 3 0)
                  (0 2 3)
                  (0 3) )
                 ( (0 0) ;; impossible state
                  (0 1)
                  (0 2)
173
                  (0 3) )))))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
174 175 176



177
(defun capitalization (string)
Pascal Bourguignon's avatar
Pascal Bourguignon committed
178 179 180
  "
RETURN:  :LOWER :UPPER :CAPITALIZED or :WHATEVER
"
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
  (let ((all-upcase 0)
        (all-locase 1)
        (all-capita 2)
        (no-2c-word 3)
        (result     (make-array '(4) :initial-element t))
        (state      (cons 0 3)) )
    (map nil (lambda (ch)
               (let ((new-state (aref +capitalization-transitions+
                                      (car state) (cdr state)
                                      (cond 
                                        ((not (alpha-char-p ch)) 3)
                                        ((upper-case-p ch)       0)
                                        ((lower-case-p ch)       1)
                                        (t                       2)))))
                 (setf (car state) (pop new-state))
                 (setf (cdr state) (pop new-state))
                 (mapc (lambda (sym) (setf (aref result sym) nil)) new-state)
Pascal Bourguignon's avatar
Pascal Bourguignon committed
198
                 ))
199 200 201 202 203 204
         string)
    (cond ((aref result no-2c-word) :whatever)
          ((aref result all-upcase) :upper)
          ((aref result all-locase) :lower)
          ((aref result all-capita) :capitalized)
          (t                        :whatever))))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
205 206 207 208 209





210
(defun emacs-bugged-string-capitalize (string)
Pascal Bourguignon's avatar
Pascal Bourguignon committed
211 212 213 214 215
  "
The string-capitalized that emacs implements in its replace-regexp-in-string
which is not even its capitalize (which is correct)!
Namely, it seems to  touch only the first character of each word.
"
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
  (do ((result (copy-seq string))
       (i 0 (1+ i))
       (sp t)
       (ch) )
      ((<= (length result) i) result)
    (setq ch (char result i))
    (if sp
        (when (alpha-char-p ch)
          (setf (char result i) (char-upcase ch))
          (setq sp nil))
        (when (not (alphanumericp ch))
          (setq sp t)))))



(defun replace-regexp-in-string
    (regexp rep string
     &optional (fixedcase nil) (literal nil) (subexp 0) (start 0)
     &key (case-sensitive (not *case-fold-search*))
     (extended nil) (newline nil) (nosub nil))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
  "
NOTE:       emacs regexps are a mix between POSIX basic regexps
            and POSIX extended regexps.
            By default we'll use basic POSIX regexps, to keep '\\(...\\)'
            at the cost of the '+' repetition. The key parameters are
            passed to REGEXP:MATCH if specific behavior is needed.
            (We're not entirely compatible with emacs, but it's emacs which
            is wrong and we'll have to use another regexp package in emacs).

Replace all matches for REGEXP with REP in STRING.

Return a new string containing the replacements.

Optional arguments FIXEDCASE, LITERAL and SUBEXP are like the
arguments with the same names of function `replace-match'.  If START
is non-nil, start replacements at that index in STRING.

REP is either a string used as the NEWTEXT arg of `replace-match' or a
function.  If it is a function it is applied to each match to generate
the replacement passed to `replace-match'; the match-data at this
point are such that match 0 is the function's argument.
When REP is a function it's passed the while match 0, even if SUBEXP is not 0.

To replace only the first match (if any), make REGEXP match up to \'
and replace a sub-expression, e.g.
  (replace-regexp-in-string \"\\(foo\\).*\\'\" \"bar\" \" foo foo\" nil nil 1)
    => \" bar foo\"

If second arg FIXEDCASE is non-nil, do not alter case of replacement text.
Otherwise maybe capitalize the whole text, or maybe just word initials,
based on the replaced text.
If the replaced text has only capital letters
and has at least one multiletter word, convert NEWTEXT to all caps.
Otherwise if all words are capitalized in the replaced text,
capitalize each word in NEWTEXT.

If third arg LITERAL is non-nil, insert NEWTEXT literally.
Otherwise treat `\' as special:
  `\&' in NEWTEXT means substitute original matched text.
  `\N' means substitute what matched the Nth `\(...\)'.
       If Nth parens didn't match, substitute nothing.
  `\\' means insert one `\'.
Case conversion does not apply to these substitutions.

FIXEDCASE and LITERAL are optional arguments.

The optional fifth argument SUBEXP specifies a subexpression ;
it says to replace just that subexpression with NEWTEXT,
rather than replacing the entire matched text.
This is, in a vague sense, the inverse of using `\N' in NEWTEXT ;
`\N' copies subexp N into NEWTEXT, but using N as SUBEXP puts
NEWTEXT in place of subexp N.
This is useful only after a regular expression search or match,
since only regular expressions have distinguished subexpressions.
"
;;; REP       function(\0) --> NEWTEXT
;;; REP       string       --> NEWTEXT
;;;
;;; FIXEDCASE   T         identity
;;; FIXEDCASE   NIL       replaced text capitalization -> replacement
;;;
;;; LITERAL     T         identity
;;; LITERAL     NIL       substitute \&, \N, \\ and \x.
;;;
;;; SUBEXP      N         replaces only \N instead of \0
301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
  (do ((done nil)
       (pieces '())
       (pos 0)
       (replacement)
       (replaced-match)
       (matches))
      (done
       (progn (push (subseq string pos) pieces)
              (apply (function concatenate) 'string (nreverse pieces))))
    (setq matches (multiple-value-list
                   (regexp:match regexp string
                                 :start start
                                 :ignore-case (not case-sensitive)
                                 :extended extended
                                 :newline newline
                                 :nosub nosub)))
    (if (and matches (car matches))
        (progn
Pascal Bourguignon's avatar
Pascal Bourguignon committed
319
          ;; -1- Find the replacement:
320 321 322 323
          (setq replacement
                (if (functionp rep)
                    (funcall rep (regexp:match-string string (car matches)))
                    rep))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
324
          ;; -2- Process FIXEDCASE
325 326 327 328 329 330 331 332 333 334 335
          (when (or (< subexp 0) (<= (length matches) subexp))
            (error "Argument out of range SUBEXP=~A." subexp))
          (setq replaced-match (nth subexp matches))
          (unless fixedcase
            (let ((cap (capitalization
                        (regexp:match-string string replaced-match))) )
              (setq replacement
                    (funcall
                     (cond
                       ((eq cap :upper) (function string-upcase))
                       ((eq cap :lower) (function identity))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
336
                       ;;                That's what emacs does...
337 338 339 340
                       ((eq cap :capitalized)
                        (function emacs-bugged-string-capitalize))
                       (t               (function identity)))
                     replacement))))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
341
          ;; -3- Process LITERAL
342
          (unless literal
Pascal Bourguignon's avatar
Pascal Bourguignon committed
343
            ;; substitute \&, \N and \\.
344 345
            (setq replacement
                  (replace-regexp-in-string
Pascal Bourguignon's avatar
Pascal Bourguignon committed
346
                   "\\\\\\(.\\)"
347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
                   (lambda (substr)
                     (cond
                       ((char= (char substr 1) (character "&"))
                        (regexp:match-string string (car matches)) )
                       ((digit-char-p (char substr 1))
                        (let ((n (parse-integer substr :start 1)))
                          (if (<= (length matches) n)
                              substr ;; How coherent emacs is!
                              (regexp:match-string string (nth n matches)))) )
                       ((char= (character "\\") (char substr 1))
                        (subseq substr 1) )
                       (t
                        (error "Invalid use of '\\' in replacement text ~W."
                               substr) )))
                   replacement t t)) )
Pascal Bourguignon's avatar
Pascal Bourguignon committed
362
          ;; -4- Replace.
363 364 365 366 367 368 369 370 371 372 373
          (push (subseq string pos
                        (regexp:match-start (nth subexp matches))) pieces)
          (push replacement pieces)
          (setq start
                (if (= 0 (length regexp))
                    (1+ start)
                    (regexp:match-end (car matches))))
          (setq pos (regexp:match-end (nth subexp matches)))
          (setq done (<= (length string) start)) )
        (progn
          (setq done t) ))))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393




;;; (defun rris-result (test-case res)
;;;   "Emacs"
;;;   (if (or (and (eq :error res) (eq res (car test-case)))
;;;           (string= (car test-case) res))
;;;     (insert (format "Ok  %-50S --> %S\n" (cdr test-case) res))
;;;     (insert
;;;      (format "Unexpected result for %S\n    expected %S\n    got      %S\n"
;;;              (cdr test-case) (car test-case) res))))


(defun rris-result (test-case res)
  "Common-Lisp"
  (if (or (and (eq :error res) (eq res (car test-case)))
          (string= (car test-case) res))
      (format t "Ok  ~50W --> ~W~%" (cdr test-case) res)
      (format t "Unexpected result for ~W~%    expected ~W~%    got      ~W~%"
394
              (cdr test-case) (car test-case) res)))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453



(defun rris-test ()
  "
Test cases for REPLACE-REGEXP-IN-STRING
"
  (let ((*case-fold-search* t))
    (do* ((test-cases
           ;; We use basic POSIX regexps, so no 're+'.
           ;;  result      regexp      replac      string      fix lit sub start
           '( ("xy"        ""          "x"         "y"         t   t)
             ("xyxyxyxy"  ""          "x"         "yyyy"      t   t)
             (""          "."         "x"         ""          t   t)
             ("x"         "."         "x"         "y"         t   t)
             ("xxxx"      "."         "x"         "yyyy"      t   t)
             ("xxxx"      "."         "x"         "yaya"      t   t)
             ("good"      "a"         "bad"       "good"      t   t)
             ("good"      "bad"       "good"      "bad"       t   t)
             ("good rest" "bad"       "good"      "bad rest"  t   t)
             ("rest good" "bad"       "good"      "rest bad"  t   t)
             ("good"      "bad"  (lambda (x) "good") "bad"    t   t)
             ("good rest" "bad"  (lambda (x) "good") "bad rest" t   t)
             ("rest good" "bad"  (lambda (x) "good") "rest bad" t   t)
             ("test"      "r\\(e\\)g" "good"      "test"     nil nil 2)
             (:error      "r\\(e\\)g" "good"      "reg"      nil nil 2)
             ("rgoodg"    "r\\(e\\)g" "good"      "reg"      nil nil 1)
             ("BOC NEW VALUE hoc" "pattern"   "nEW VAlue" "BOC PATTERN hoc")
             ("BOC nEW VAlue hoc" "pattern"   "nEW VAlue" "BOC pattern hoc")
             ("BOC new value hoc" "pattern"   "new value" "BOC pattern hoc")
             ("BOC NEW VAlue hoc" "pattern"   "nEW VAlue" "BOC Pattern hoc")
             ("BOC New Value hoc" "pattern"   "new value" "BOC Pattern hoc")
             ("BOC nEW VAlue hoc" "pattern"   "nEW VAlue" "BOC pATteRN hoc")
             ("BOC New1value hoc" "pattern"   "new1value" "BOC Pattern hoc")
             ("BOC New-Value hoc" "pattern"   "new-value" "BOC Pattern hoc")
             ("rrrr-www-bb rr-w-bbb"
              "\\(bb*\\) \\(ww*\\) \\(rr*\\)"
              "\\3-\\2-\\1" "bb www rrrr bbb w rr")
             ("\\4-www-bb \\4-w-bbb"
              "\\(bb*\\) \\(ww*\\) \\(rr*\\)"
              "\\4-\\2-\\1" "bb www rrrr bbb w rr")
             (:error "blue" "\\\\b\\l\\&" "blue box and bluetooth")
             ("\\bblue box and \\bbluetooth"
              "blue" "\\\\b\\&" "blue box and bluetooth")
             )
           (cdr test-cases))
          (test-case (car test-cases) (car test-cases))
          (tc test-case test-case)
          (all-ok t))
         ((null test-cases) all-ok)
      (when (listp (nth 2 tc))
        (setq tc (copy-seq tc))
        (setf (nth 2 tc) (coerce (nth 2 tc) 'function)))
      (let ((res (handler-case
                     (apply (function replace-regexp-in-string) (cdr tc))
                   (error () :error))) )
        (if (eq :error res)
            (unless (eq res (car test-case)) (setq all-ok nil))
            (unless (string= (car test-case) res) (setq all-ok nil)))
454
        (rris-result  test-case res)))))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
455 456 457 458 459 460 461 462 463 464 465 466



;;  (rris-test)
;;; (let ((start 0) (case-sensitive nil) (extended nil) (newline nil) (nosub nil))
;;;       (REGEXP:MATCH "blue" "blue box and bluetooth"
;;;                     :START START :IGNORE-CASE (NOT CASE-SENSITIVE)
;;;                     :EXTENDED EXTENDED  :NEWLINE NEWLINE :NOSUB NOSUB) )

;; (replace-regexp-in-string "blue" "\\\\b\\X\\&" "blue box and bluetooth")


467
(defun string-to-number (string &key (base 10) (start 0) (end nil))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
468 469 470 471 472
  "
DO:         Converts the string to a number.
RETURN:     A number.
"
  ;; PARSE-INTEGER is for integers...
473 474 475 476 477 478
  (let ((result  (with-input-from-string
                     (stream string :start start :end end)
                   (let ((*read-base* base)) (read stream)))))
    (unless (numberp result)
      (error "Expected a number, not ~S." result))
    result))
Pascal Bourguignon's avatar
Pascal Bourguignon committed
479

480
;;;; THE END ;;;;