Regular expression matcher

Location: lib, 310 Lines

; Scheme 9 from Empty Space, Function Library
; By Nils M Holm, 2009,2010
; Placed in the Public Domain
; (re-comp string)                         ==>  list
; (re-match list string)                   ==>  list | #f
; (re-match list string 'symbol ...)       ==>  list | #f
; (re-subst list string1 string2)          ==>  list | #f
; (re-subst list string1 string2 'symbol)  ==>  list | #f
; (load-from-library "regex.scm")
; Compile, match, and substitute regular expressions.
; RE-COMP compiles a regular expression (RE) and returns it.
; Compiled REs (CREs) are represented by lists.
; RE-MATCH matches a compiled RE against a string. When (part of) the
; string matches the CRE, it returns a list of the form
;       ((first limit) ...)
; where FIRST is the index of the first character matching the RE and
; LIMIT is the first character *after* the match (so the match can be
; extracted with SUBSTRING). When no sub-matches are used, just a list
; of the form ((first limit)) will be returned. When sub-matches using
; \( and \) are contained in the regular expression, the subsequent pairs
; will contain the ranges matched by the sub-expressions (in order of
; occurrence of sub-expressions in the RE). See examples.
; When the CRE does not match the string, RE-MATCH returns #F.
; A special case occurs when a given pattern may match a string of zero
; length. In this case, RE-MATCH returns () when no better match could
; be found.
; When 'LAZY is passed as an additional argument to RE-MATCH, it matches
; repetitions "lazily," e.g.: "A+" will match "A" (rather than "AAA")
; in "AAAB".
; When 'ALL is passed as an additional argument to RE-MATCH, it will
; generate a list of *all* matches found in the given string, i.e. a
; list of the form:
;       (((first limit) ...) ...)
; The following RE patterns are evaluated:
; .          match any character
; [char...]  match character class (may contain ranges of the form c1-c2)
; ^          match beginning of line
; $          match end of line
; *          match zero or more instances of the preceding pattern
; +          match one or more instances of the preceding pattern
; ?          match the preceding pattern optionally
; \          match the following character literally (exception below!)
; \(         mark the beginning of a sub-match
; \)         mark the end of a sub-match
; RE-SUBST attempts to match the CRE LIST against STRING1. When it
; succeeds, it returns a fresh copy of STRING1 with the match replaced
; by STRING2. When the 'ALL option is used (see RE-MATCH), all occurrences
; of the match will be replaced. When the CRE contains sub-matches, they can
; be referred to in STRING2 by using a backslash and the position of the
; sub-match, i.e.: \1, \2, ... The notation \0 denotes the entire match. To
; include a literal backslash in the replacement string, use \\. (Of course,
; Scheme strings use backslashes for escaping, so you will have to use \\1
; in the place of \1 and \\\\ in the place of \\. Yes, this is awkward.)
; When SUBST fails to match the CRE, it returns #F.
; Example:   (re-match (re-comp "^a[1-9]*z$") "a1289z")     ==>  ((0 6))
;            (re-match (re-comp "a[1-9]+z") "___a123z___")  ==>  ((3 8))
;            (re-match (re-comp "a[^1-9]+z") "a123z")       ==>  #f
;            (re-match (re-comp "[1-9]*") "__1__")          ==>  ((2 3))
;            (re-match (re-comp "[1-9]*") "_____")          ==>  ()
;            (re-match (re-comp "f\\(.\\)\\(.\\)bar")
;                      "foobar")                      ==>  ((0 6) (1 2) (2 3))
;            (re-match (re-comp "a\\(.\\)a") "aba_aca_ada" 'all)
;                                                     ==> (((0 3) (1 2))
;                                                          ((4 7) (5 6))
;                                                          ((8 11) (9 10)))
;            (re-subst (re-comp "\\([0-9]+\\)\\.\\([0-9]+\\)\\.")
;                      "_01.1._31.12._"
;                      "\\2/\\1"
;                      'all)                          ==>  "_1/01_12/31_"

(load-from-library "and-letstar.scm")

(define (re-comp re)

  (define LP   #\()
  (define RP   #\))

  (define (make-range c0 cn cls)
      (if (> c0 cn)
          (make-range (+ 1 c0)
                      (cons (integer->char c0) cls))))

  (define (compile-class in out cls first)
    (cond ((null? in)
          ((char=? #\] (car in))
            (list (cdr in) (cons (reverse! cls) out)))
          ((and first (char=? #\^ (car in)))
            (compile-class (cdr in) out (list #\]) #f))
          ((and (not first)
                (not (null? (cdr cls)))
                (char=? #\- (car in))
                (pair? (cdr in))
                (not (char=? #\] (cadr in))))
            (let ((c0 (char->integer (car cls)))
                  (cn (char->integer (cadr in))))
              (if (< c0 cn)
                  (compile-class (cddr in)
                                 (make-range c0 cn (cdr cls)) #f)
                  (compile-class (cdr in)
                                 (cons #\- cls) #f))))
            (compile-class (cdr in)
                           (cons (car in) cls) #f))))

  (let compile ((in-sub #f)
                (in     (string->list re))
                (out    '()))
      ((not in)
      ((null? in)
        (if in-sub
            (reverse! out)))
        (case (car in)
                (if (pair? (cdr in))
                    (if (memv (cadr in) '(#\( #\)))
                        (if (or (and      in-sub  (char=? LP (cadr in)))
                                (and (not in-sub) (char=? RP (cadr in))))
                            (compile (char=? (cadr in) LP)
                                     (cddr in)
                                     (cons (list (cadr in)) out)))
                        (compile in-sub
                                 (cddr in)
                                 (cons (cadr in) out)))
              ((#\^ #\$ #\.)
                (compile in-sub
                         (cdr in)
                         (cons (list (car in)) out)))
              ((#\* #\?)
                (compile in-sub
                         (cdr in)
                         (if (null? out)
                             (cons (car in) out)
                             (cons (list (car in) (car out))
                                   (cdr out)))))
                (compile in-sub
                         (cdr in)
                         (if (null? out)
                             (cons (car in) out)
                             (cons (list #\* (car out)) out))))
                (let ((class (compile-class (cdr in) out (list #\[) #t)))
                  (if class
                      (apply compile in-sub class)
                (compile in-sub
                         (cdr in)
                         (cons (car in) out))))))))

(define (re-match cre s . opts)

  (define lazy        #f)
  (define all-matches #f)
  (define LP          #\()
  (define RP          #\))

  (define (match-char p c)
    (cond ((char? p)
            (char=? p c))
          ((char=? #\. (car p)))
          ((char=? #\[ (car p))
            (memv c (cdr p)))
          ((char=? #\] (car p))
            (not (memv c (cdr p))))

  (define (extend-match m n)
    (if (null? m)
        (list (list n 0))
        (cons (cons (+ 1 (caar m)) (cdar m)) (cdr m))))

  (define (make-choices cre s m)
    (if (or (null? s)
            (not (match-char (cadar cre) (car s))))
        (list (list s m))
        (cons (list s m)
              (make-choices cre (cdr s) (extend-match m 1)))))

  (define (begin-submatch m)
    (if (null? m)
        `((0 0) (0 0))
        `((,(caar m) ,(caar m)) ,(car m) ,@(cdr m))))

  (define (end-submatch m)
    (let ((sub  (car m))
          (main (cadr m)))
      `((,(car sub) ,(cadr main)) ,sub ,@(cddr m))))

  (define (match-cre cre s pos m)
      ((null? cre)
        (map reverse! m))
      ((null? s)
        (cond ((equal? cre '((#\$)))
                (match-cre '() '() pos m))
              ((equal? cre `((,RP)))
                (match-cre '() '() pos (end-submatch m)))
              ((and (pair? (car cre))
                    (char=? #\* (caar cre))
                    (null? (cdr cre)))
      ((pair? (car cre))
        (cond ((char=? LP (caar cre))
                (match-cre (cdr cre)
                           (begin-submatch m)))
              ((char=? RP (caar cre))
                (match-cre (cdr cre)
                           (end-submatch m)))
              ((char=? #\* (caar cre))
                (match-star cre s pos m))
              ((char=? #\? (caar cre))
                (if (match-char (cadar cre) (car s))
                    (match-cre (cdr cre)
                               (cdr s)
                               (+ 1 pos)
                               (extend-match m 1))
                    (match-cre (cdr cre) s pos m)))
              ((match-char (car cre) (car s))
                (match-cre (cdr cre)
                           (cdr s)
                           (+ 1 pos)
                           (extend-match m 1)))
      ((char=? (car cre) (car s))
        (match-cre (cdr cre)
                   (cdr s)
                   (+ 1 pos)
                   (extend-match m 1)))

  (define (match-star cre s pos m)
    (let try-choices ((c* (if lazy
                              (make-choices cre s m)
                              (reverse (make-choices cre s m)))))
      (and (not (null? c*))
           (let ((r (match-cre (cdr cre) (caar c*) pos (cadar c*))))
             (or r
                 (try-choices (cdr c*)))))))

  (define (adjust res pos)
    (and res
         (let ((res (map (lambda (range)
                           (map (lambda (x)
                                  (+ x pos))
           `(,(car res) ,@(reverse! (cdr res))))))

  (define (try-matches cre s pos)
    (cond ((null? s)
            (match-cre cre s pos '()))
            (let ((r (match-cre cre s pos '())))
              (if (or (not r)
                      (null? r)
                      (and (pair? r)
                           (null? (car r))))
                  (try-matches cre (cdr s) (+ 1 pos))
                  (adjust r pos))))))

  (define (collect-matches cre s pos)
    (let loop ((r  (try-matches cre s pos))
               (m* '()))
      (if (or (not r)
              (null? r))
          (reverse! m*)
          (let ((k (cadar r)))
            (loop (try-matches cre (list-tail s k) (+ k pos))
                  (cons r m*))))))

  (if (memq 'lazy opts)
      (set! lazy #t))

  (if (memq 'all opts)
      (set! all-matches #t))

  (cond ((equal? cre '((#\^)))
          (if all-matches
              `(((0 0)))
              `((0 0))))
        ((equal? cre '((#\$)))
          (let ((k (string-length s)))
            (if all-matches
                `(((,k ,k)))
                `((,k ,k)))))
        ((and (pair? cre)
              (equal? '(#\^) (car cre)))
          (let ((m (adjust (match-cre (cdr cre) (string->list s) 0 '()) 0)))
            (if all-matches
                (list m)
          (collect-matches cre (string->list s) 0))
          (try-matches cre (string->list s) 0))))

(define (replacement-string new m*)
       (lambda (m* i out)
         (let loop ((in  (string->list (list-ref m* i)))
                    (out out))
           (if (null? in)
               (loop (cdr in) (cons (car in) out)))))))
    (let ((k (string-length new))
          (n (length m*)))
      (let loop ((i   0)
                 (out '()))
        (cond ((>= i k)
                (list->string (reverse! out)))
              ((char=? #\\ (string-ref new i))
                (cond ((and-let* ((_ (< i (- k 1)))
                                  (c (string-ref new (+ 1 i)))
                                  (_ (char<=? #\0 c #\9))
                                  (j (- (char->integer c)
                                        (char->integer #\0)))
                                  (_ (<= 0 j n)))
                         => (lambda (j)
                              (loop (+ 2 i) (append-match m* j out))))
                       ((< i (- k 1))
                         (loop (+ 2 i) (cons (string-ref new (+ 1 i)) out)))
                         (loop (+ 1 i) out))))
                  (loop (+ 1 i) (cons (string-ref new i) out))))))))

(define re-subst
  (let ((replacement-string replacement-string))
    (lambda (cre old new . opts)
      (and-let* ((pos** (apply re-match cre old opts))
                 (_     (not (null? pos**)))
                 (pos** (if (memq 'all opts)
                            (list pos**)))
                 (m**   (map (lambda (pos*)
                               (map (lambda (pos)
                                      (substring old (car pos) (cadr pos)))
        (let ((repl* (map (lambda (m*)
                            (replacement-string new m*))
          (let loop ((pos*  (append '((0 0))
                                    (map car pos**)
                                    (list (list (string-length old)
                                                (string-length old)))))
                     (repl* repl*)
                     (out   '()))
            (cond ((null? (cdr pos*))
                    (apply string-append (reverse! out)))
                    (let ((s (substring old (cadar pos*) (caadr pos*)))
                          (r (if (null? repl*) "" (car repl*))))
                      (loop (cdr pos*)
                            (if (null? repl*) '() (cdr repl*))
                            (cons r (cons s out))))))))))))

contact  |  privacy