diff options
author | Dan Nicolaescu <done@ece.arizona.edu> | 1998-06-22 02:03:41 +0000 |
---|---|---|
committer | Dan Nicolaescu <done@ece.arizona.edu> | 1998-06-22 02:03:41 +0000 |
commit | fcaa702ed4f25a52f812a69c4972c276c4af48ac (patch) | |
tree | 3cc6c61e5055bbb79298faf5c2f0a48a8a23a7a3 /lisp/emacs-lisp | |
parent | 92e7b975b219f57f401787ff7b46c7173011e756 (diff) | |
download | emacs-fcaa702ed4f25a52f812a69c4972c276c4af48ac.tar.gz |
Initial revision
Diffstat (limited to 'lisp/emacs-lisp')
-rw-r--r-- | lisp/emacs-lisp/sregex.el | 965 |
1 files changed, 965 insertions, 0 deletions
diff --git a/lisp/emacs-lisp/sregex.el b/lisp/emacs-lisp/sregex.el new file mode 100644 index 00000000000..38f4326f3ab --- /dev/null +++ b/lisp/emacs-lisp/sregex.el @@ -0,0 +1,965 @@ +;;; sregex.el --- symbolic regular expressions + +;; Copyright (C) 1997, 1998 Free Software Foundation, Inc. + +;; Author: Bob Glickstein <bobg+sregex@zanshin.com> +;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com> + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs; see the file COPYING. If not, write to the +;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +;;; Commentary: + +;; This package allows you to write regular expressions using a +;; totally new, Lisp-like syntax. + +;; A "symbolic regular expression" (sregex for short) is a Lisp form +;; that, when evaluated, produces the string form of the specified +;; regular expression. Here's a simple example: + +;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert" + +;; As you can see, an sregex is specified by placing one or more +;; special clauses in a call to `sregexq'. The clause in this case is +;; the `or' of two strings (not to be confused with the Lisp function +;; `or'). The list of allowable clauses appears below. + +;; With sregex, it is never necessary to "escape" magic characters +;; that are meant to be taken literally; that happens automatically. +;; For example: + +;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H" + +;; It is also unnecessary to "group" parts of the expression together +;; to overcome operator precedence; that also happens automatically. +;; For example: + +;; (sregexq (opt (or "Bob" "Robert"))) => "\\(Bob\\|Robert\\)?" + +;; It *is* possible to group parts of the expression in order to refer +;; to them with numbered backreferences: + +;; (sregexq (group (or "Go" "Run")) +;; ", Spot, " +;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1" + +;; If `sregexq' needs to introduce its own grouping parentheses, it +;; will automatically renumber your backreferences: + +;; (sregexq (opt "resent-") +;; (group (or "to" "cc" "bcc")) +;; ": " +;; (backref 1)) => "\\(resent-\\)?\\(to\\|cc\\|bcc\\): \\2" + +;; `sregexq' is a macro. Each time it is used, it constructs a simple +;; Lisp expression that then invokes a moderately complex engine to +;; interpret the sregex and render the string form. Because of this, +;; I don't recommend sprinkling calls to `sregexq' throughout your +;; code, the way one normally does with string regexes (which are +;; cheap to evaluate). Instead, it's wiser to precompute the regexes +;; you need wherever possible instead of repeatedly constructing the +;; same ones over and over. Example: + +;; (let ((field-regex (sregexq (opt "resent-") +;; (or "to" "cc" "bcc")))) +;; ... +;; (while ... +;; ... +;; (re-search-forward field-regex ...) +;; ...)) + +;; The arguments to `sregexq' are automatically quoted, but the +;; flipside of this is that it is not straightforward to include +;; computed (i.e., non-constant) values in `sregexq' expressions. So +;; `sregex' is a function that is like `sregexq' but which does not +;; automatically quote its values. Literal sregex clauses must be +;; explicitly quoted like so: + +;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert" + +;; but computed clauses can be included easily, allowing for the reuse +;; of common clauses: + +;; (let ((dotstar '(0+ any)) +;; (whitespace '(1+ (syntax ?-))) +;; (digits '(1+ (char (?0 . ?9))))) +;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+" + +;; This package also provides sregex-specific versions of the Emacs +;; functions `replace-match', `match-string', +;; `match-string-no-properties', `match-beginning', `match-end', and +;; `match-data'. In each case, the sregex version's name begins with +;; `sregex-' and takes one additional optional parameter, an sregex +;; "info" object. Each of these functions is concerned with numbered +;; submatches. Since sregex may renumber submatches, alternate +;; versions of these functions are needed that know how to adjust the +;; supplied number. + +;; The sregex info object for the most recently evaluated sregex can +;; be obtained with `sregex-info'; so if you precompute your sregexes +;; and you plan to use `replace-match' or one of the others with it, +;; you need to record the info object for later use: + +;; (let* ((regex (sregexq (opt "resent-") +;; (group (or "to" "cc" "bcc")) +;; ":")) +;; (regex-info (sregex-info))) +;; ... +;; (if (re-search-forward regex ...) +;; (let ((which (sregex-match-string 1 nil regex-info))) +;; ...))) + +;; In this example, `regex' is "\\(resent-\\)?\\(to\\|cc\\|bcc\\):", +;; so the call to (sregex-match-string 1 ...) is automatically turned +;; into a call to (match-string 2 ...). + +;; If the sregex info argument to `sregex-replace-match', +;; `sregex-match-string', `sregex-match-string-no-properties', +;; `sregex-match-beginning', `sregex-match-end', or +;; `sregex-match-data' is omitted, the current value of (sregex-info) +;; is used. + +;; You can do your own sregex submatch renumbering with +;; `sregex-backref-num'. + +;; Finally, `sregex-save-match-data' is like `save-match-data' but +;; also saves and restores the information maintained by +;; `sregex-info'. + +;; To use this package in a Lisp program, simply (require 'sregex). + +;; Here are the clauses allowed in an `sregex' or `sregexq' +;; expression: + +;; - a string +;; This stands for the literal string. If it contains +;; metacharacters, they will be escaped in the resulting regex +;; (using `regexp-quote'). + +;; - the symbol `any' +;; This stands for ".", a regex matching any character except +;; newline. + +;; - the symbol `bol' +;; Stands for "^", matching the empty string at the beginning of a line + +;; - the symbol `eol' +;; Stands for "$", matching the empty string at the end of a line + +;; - (group CLAUSE ...) +;; Groups the given CLAUSEs using "\\(" and "\\)". + +;; - (sequence CLAUSE ...) + +;; Groups the given CLAUSEs; may or may not use "\\(" and "\\)". +;; Clauses groups by `sequence' do not count for purposes of +;; numbering backreferences. Use `sequence' in situations like +;; this: + +;; (sregexq (or "dog" "cat" +;; (sequence (opt "sea ") "monkey"))) +;; => "dog\\|cat\\|\\(sea \\)?monkey" + +;; where a single `or' alternate needs to contain multiple +;; subclauses. + +;; - (backref N) +;; Matches the same string previously matched by the Nth "group" in +;; the same sregex. N is a positive integer. In the resulting +;; regex, N may be adjusted to account for automatically introduced +;; groups. + +;; - (or CLAUSE ...) +;; Matches any one of the CLAUSEs by separating them with "\\|". + +;; - (0+ CLAUSE ...) +;; Concatenates the given CLAUSEs and matches zero or more +;; occurrences by appending "*". + +;; - (1+ CLAUSE ...) +;; Concatenates the given CLAUSEs and matches one or more +;; occurrences by appending "+". + +;; - (opt CLAUSE ...) +;; Concatenates the given CLAUSEs and matches zero or one occurrence +;; by appending "?". + +;; - (repeat MIN MAX CLAUSE ...) +;; Concatenates the given CLAUSEs and constructs a regex matching at +;; least MIN occurrences and at most MAX occurrences. MIN must be a +;; non-negative integer. MAX must be a non-negative integer greater +;; than or equal to MIN; or MAX can be nil to mean "infinity." + +;; - (char CHAR-CLAUSE ...) +;; Creates a "character class" matching one character from the given +;; set. See below for how to construct a CHAR-CLAUSE. + +;; - (not-char CHAR-CLAUSE ...) +;; Creates a "character class" matching any one character not in the +;; given set. See below for how to construct a CHAR-CLAUSE. + +;; - the symbol `bot' +;; Stands for "\\`", matching the empty string at the beginning of +;; text (beginning of a string or of a buffer). + +;; - the symbol `eot' +;; Stands for "\\'", matching the empty string at the end of text. + +;; - the symbol `point' +;; Stands for "\\=", matching the empty string at point. + +;; - the symbol `word-boundary' +;; Stands for "\\b", matching the empty string at the beginning or +;; end of a word. + +;; - the symbol `not-word-boundary' +;; Stands for "\\B", matching the empty string not at the beginning +;; or end of a word. + +;; - the symbol `bow' +;; Stands for "\\<", matching the empty string at the beginning of a +;; word. + +;; - the symbol `eow' +;; Stands for "\\>", matching the empty string at the end of a word. + +;; - the symbol `wordchar' +;; Stands for the regex "\\w", matching a word-constituent character +;; (as determined by the current syntax table) + +;; - the symbol `not-wordchar' +;; Stands for the regex "\\W", matching a non-word-constituent +;; character. + +;; - (syntax CODE) +;; Stands for the regex "\\sCODE", where CODE is a syntax table code +;; (a single character). Matches any character with the requested +;; syntax. + +;; - (not-syntax CODE) +;; Stands for the regex "\\SCODE", where CODE is a syntax table code +;; (a single character). Matches any character without the +;; requested syntax. + +;; - (regex REGEX) +;; This is a "trapdoor" for including ordinary regular expression +;; strings in the result. Some regular expressions are clearer when +;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for +;; instance. However, see the note under "Bugs," below. + +;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) +;; has one of the following forms: + +;; - a character +;; Adds that character to the set. + +;; - a string +;; Adds all the characters in the string to the set. + +;; - A pair (MIN . MAX) +;; Where MIN and MAX are characters, adds the range of characters +;; from MIN through MAX to the set. + +;;; To do: + +;; Make (sregexq (or "a" (sequence "b" "c"))) return "a\\|bc" instead +;; of "a\\|\\(bc\\)" + +;; An earlier version of this package could optionally translate the +;; symbolic regex into other languages' syntaxes, e.g. Perl. For +;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would +;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore +;; such a facility. + +;;; Bugs: + +;; The (regex REGEX) form can confuse the code that distinguishes +;; introduced groups from user-specified groups. Try to avoid using +;; grouping within a `regex' form. Failing that, try to avoid using +;; backrefs if you're using `regex'. + +;;; Code: + +(defsubst sregex--value-unitp (val) (nth 0 val)) +(defsubst sregex--value-groups (val) (nth 1 val)) +(defsubst sregex--value-tree (val) (nth 2 val)) + +(defun sregex--make-value (unitp groups tree) + (list unitp groups tree)) + +(defvar sregex--current-sregex nil + "Global state for `sregex-info'.") + +(defun sregex-info () + "Return extra information about the latest call to `sregex'. +This extra information is needed in order to adjust user-requested +backreference numbers to numbers suitable for the generated regexp. +See e.g. `sregex-match-string' and `sregex-backref-num'." + sregex--current-sregex) + +; (require 'advice) +; (defadvice save-match-data (around sregex-save-match-data protect) +; (let ((sregex--saved-sregex sregex--current-sregex)) +; (unwind-protect +; ad-do-it +; (setq sregex--current-sregex sregex--saved-sregex)))) +(defmacro sregex-save-match-data (&rest forms) + "Like `save-match-data', but also saves and restores `sregex-info' data." + `(let ((sregex--saved-sregex sregex--current-sregex)) + (unwind-protect + (save-match-data ,@forms) + (setq sregex--current-sregex sregex--saved-sregex)))) + +(defun sregex-replace-match (replacement + &optional fixedcase literal string subexp sregex) + "Like `replace-match', for a regexp made with `sregex'. +This takes one additional optional argument, the `sregex' info, which +can be obtained with `sregex-info'. The SUBEXP argument is adjusted +to allow for \"introduced groups\". If the extra argument is omitted +or nil, it defaults to the current value of (sregex-info)." + (replace-match replacement fixedcase literal string + (and subexp + (sregex-backref-num + subexp + (sregex--value-groups (or sregex + sregex--current-sregex)))))) + +(defun sregex-match-string (count &optional in-string sregex) + "Like `match-string', for a regexp made with `sregex'. +This takes one additional optional argument, the `sregex' info, which +can be obtained with `sregex-info'. The COUNT argument is adjusted to +allow for \"introduced groups\". If the extra argument is omitted or +nil, it defaults to the current value of (sregex-info)." + (match-string (and count + (sregex-backref-num + count + (sregex--value-groups (or sregex + sregex--current-sregex)))) + in-string)) + +(defun sregex-match-string-no-properties (count &optional in-string sregex) + "Like `match-string-no-properties', for a regexp made with `sregex'. +This takes one additional optional argument, the `sregex' info, which +can be obtained with `sregex-info'. The COUNT argument is adjusted to +allow for \"introduced groups\". If the extra argument is omitted or +nil, it defaults to the current value of (sregex-info)." + (match-string-no-properties + (and count + (sregex-backref-num + count + (sregex--value-groups (or sregex + sregex--current-sregex)))) + in-string)) + +(defun sregex-match-beginning (count &optional sregex) + "Like `match-beginning', for a regexp made with `sregex'. +This takes one additional optional argument, the `sregex' info, which +can be obtained with `sregex-info'. The COUNT argument is adjusted to +allow for \"introduced groups\". If the extra argument is omitted or +nil, it defaults to the current value of (sregex-info)." + (match-beginning (sregex-backref-num + count + (sregex--value-groups (or sregex + sregex--current-sregex))))) + +(defun sregex-match-end (count &optional sregex) + "Like `match-end', for a regexp made with `sregex'. +This takes one additional optional argument, the `sregex' info, which +can be obtained with `sregex-info'. The COUNT argument is adjusted to +allow for \"introduced groups\". If the extra argument is omitted or +nil, it defaults to the current value of (sregex-info)." + (match-end (sregex-backref-num + count + (sregex--value-groups (or sregex + sregex--current-sregex))))) + +(defun sregex-match-data (&optional sregex) + "Like `match-data', for a regexp made with `sregex'. +This takes one additional optional argument, the `sregex' info, which +can be obtained with `sregex-info'. \"Introduced groups\" are removed +from the result. If the extra argument is omitted or nil, it defaults +to the current value of (sregex-info)." + (let* ((data (match-data)) + (groups (sregex--value-groups (or sregex + sregex--current-sregex))) + (result (list (car (cdr data)) + (car data)))) + (setq data (cdr (cdr data))) + (while data + (if (car groups) + (setq result (append (list (car (cdr data)) + (car data)) + result))) + (setq groups (cdr groups) + data (cdr (cdr data)))) + (reverse result))) + +(defun sregex--render-tree (tree sregex) + (let ((key (car tree))) + (cond ((eq key 'str) + (cdr tree)) + ((eq key 'or) + (mapconcat '(lambda (x) + (sregex--render-tree x sregex)) + (cdr tree) + "\\|")) + ((eq key 'sequence) + (apply 'concat + (mapcar '(lambda (x) + (sregex--render-tree x sregex)) + (cdr tree)))) + ((eq key 'group) + (concat "\\(" + (sregex--render-tree (cdr tree) sregex) + "\\)")) + ((eq key 'opt) + (concat (sregex--render-tree (cdr tree) sregex) + "?")) + ((eq key '0+) + (concat (sregex--render-tree (cdr tree) sregex) + "*")) + ((eq key '1+) + (concat (sregex--render-tree (cdr tree) sregex) + "+")) + ((eq key 'backref) + (let ((num (sregex-backref-num (cdr tree) sregex))) + (if (> num 9) + (error "sregex: backref number %d too high after adjustment" + num) + (concat "\\" (int-to-string num))))) + (t (error "sregex internal error: unknown tree type %S" + key))))) + +(defun sregex (&rest exps) + "Symbolic regular expression interpreter. +This is exactly like `sregexq' (q.v.) except that it evaluates all its +arguments, so literal sregex clauses must be quoted. For example: + + (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" + +An argument-evaluating sregex interpreter lets you reuse sregex +subexpressions: + + (let ((dotstar '(0+ any)) + (whitespace '(1+ (syntax ?-))) + (digits '(1+ (char (?0 . ?9))))) + (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"" + (progn + (setq sregex--current-sregex (sregex--sequence exps nil)) + (sregex--render-tree (sregex--value-tree sregex--current-sregex) + sregex--current-sregex))) + +(defmacro sregexq (&rest exps) + "Symbolic regular expression interpreter. +This macro allows you to specify a regular expression (regexp) in +symbolic form, and converts it into the string form required by Emacs's +regex functions such as `re-search-forward' and `looking-at'. Here is +a simple example: + + (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" + +As you can see, an sregex is specified by placing one or more special +clauses in a call to `sregexq'. The clause in this case is the `or' +of two strings (not to be confused with the Lisp function `or'). The +list of allowable clauses appears below. + +With `sregex', it is never necessary to \"escape\" magic characters +that are meant to be taken literally; that happens automatically. +For example: + + (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\" + +It is also unnecessary to \"group\" parts of the expression together +to overcome operator precedence; that also happens automatically. +For example: + + (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\" + +It *is* possible to group parts of the expression in order to refer +to them with numbered backreferences: + + (sregexq (group (or \"Go\" \"Run\")) + \", Spot, \" + (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\" + +If `sregexq' needs to introduce its own grouping parentheses, it will +automatically renumber your backreferences: + + (sregexq (opt \"resent-\") + (group (or \"to\" \"cc\" \"bcc\")) + \": \" + (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\" + +`sregexq' is a macro. Each time it is used, it constructs a simple +Lisp expression that then invokes a moderately complex engine to +interpret the sregex and render the string form. Because of this, I +don't recommend sprinkling calls to `sregexq' throughout your code, +the way one normally does with string regexes (which are cheap to +evaluate). Instead, it's wiser to precompute the regexes you need +wherever possible instead of repeatedly constructing the same ones +over and over. Example: + + (let ((field-regex (sregexq (opt \"resent-\") + (or \"to\" \"cc\" \"bcc\")))) + ... + (while ... + ... + (re-search-forward field-regex ...) + ...)) + +The arguments to `sregexq' are automatically quoted, but the +flipside of this is that it is not straightforward to include +computed (i.e., non-constant) values in `sregexq' expressions. So +`sregex' is a function that is like `sregexq' but which does not +automatically quote its values. Literal sregex clauses must be +explicitly quoted like so: + + (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" + +but computed clauses can be included easily, allowing for the reuse +of common clauses: + + (let ((dotstar '(0+ any)) + (whitespace '(1+ (syntax ?-))) + (digits '(1+ (char (?0 . ?9))))) + (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\" + +Here are the clauses allowed in an `sregex' or `sregexq' expression: + +- a string + This stands for the literal string. If it contains + metacharacters, they will be escaped in the resulting regex + (using `regexp-quote'). + +- the symbol `any' + This stands for \".\", a regex matching any character except + newline. + +- the symbol `bol' + Stands for \"^\", matching the empty string at the beginning of a line + +- the symbol `eol' + Stands for \"$\", matching the empty string at the end of a line + +- (group CLAUSE ...) + Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\". + +- (sequence CLAUSE ...) + + Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\". + Clauses groups by `sequence' do not count for purposes of + numbering backreferences. Use `sequence' in situations like + this: + + (sregexq (or \"dog\" \"cat\" + (sequence (opt \"sea \") \"monkey\"))) + => \"dog\\\\|cat\\\\|\\\\(sea \\\\)?monkey\" + + where a single `or' alternate needs to contain multiple + subclauses. + +- (backref N) + Matches the same string previously matched by the Nth \"group\" in + the same sregex. N is a positive integer. In the resulting + regex, N may be adjusted to account for automatically introduced + groups. + +- (or CLAUSE ...) + Matches any one of the CLAUSEs by separating them with \"\\\\|\". + +- (0+ CLAUSE ...) + Concatenates the given CLAUSEs and matches zero or more + occurrences by appending \"*\". + +- (1+ CLAUSE ...) + Concatenates the given CLAUSEs and matches one or more + occurrences by appending \"+\". + +- (opt CLAUSE ...) + Concatenates the given CLAUSEs and matches zero or one occurrence + by appending \"?\". + +- (repeat MIN MAX CLAUSE ...) + Concatenates the given CLAUSEs and constructs a regex matching at + least MIN occurrences and at most MAX occurrences. MIN must be a + non-negative integer. MAX must be a non-negative integer greater + than or equal to MIN; or MAX can be nil to mean \"infinity.\" + +- (char CHAR-CLAUSE ...) + Creates a \"character class\" matching one character from the given + set. See below for how to construct a CHAR-CLAUSE. + +- (not-char CHAR-CLAUSE ...) + Creates a \"character class\" matching any one character not in the + given set. See below for how to construct a CHAR-CLAUSE. + +- the symbol `bot' + Stands for \"\\\\`\", matching the empty string at the beginning of + text (beginning of a string or of a buffer). + +- the symbol `eot' + Stands for \"\\\\'\", matching the empty string at the end of text. + +- the symbol `point' + Stands for \"\\\\=\", matching the empty string at point. + +- the symbol `word-boundary' + Stands for \"\\\\b\", matching the empty string at the beginning or + end of a word. + +- the symbol `not-word-boundary' + Stands for \"\\\\B\", matching the empty string not at the beginning + or end of a word. + +- the symbol `bow' + Stands for \"\\\\\\=<\", matching the empty string at the beginning of a + word. + +- the symbol `eow' + Stands for \"\\\\\\=>\", matching the empty string at the end of a word. + +- the symbol `wordchar' + Stands for the regex \"\\\\w\", matching a word-constituent character + (as determined by the current syntax table) + +- the symbol `not-wordchar' + Stands for the regex \"\\\\W\", matching a non-word-constituent + character. + +- (syntax CODE) + Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code + (a single character). Matches any character with the requested + syntax. + +- (not-syntax CODE) + Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code + (a single character). Matches any character without the + requested syntax. + +- (regex REGEX) + This is a \"trapdoor\" for including ordinary regular expression + strings in the result. Some regular expressions are clearer when + written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for + instance. However, using this can confuse the code that + distinguishes introduced groups from user-specified groups. Avoid + using grouping within a `regex' form. Failing that, avoid using + backrefs if you're using `regex'. + +Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) +has one of the following forms: + +- a character + Adds that character to the set. + +- a string + Adds all the characters in the string to the set. + +- A pair (MIN . MAX) + Where MIN and MAX are characters, adds the range of characters + from MIN through MAX to the set." + `(apply 'sregex ',exps)) + +(defun sregex--engine (exp combine) + (let* ((val (cond ((stringp exp) + (sregex--make-value (or (not (eq combine 'suffix)) + (= (length exp) 1)) + nil + (cons 'str + (regexp-quote exp)))) + ((symbolp exp) + (funcall (intern (concat "sregex--" + (symbol-name exp))) + combine)) + ((consp exp) + (funcall (intern (concat "sregex--" + (symbol-name (car exp)))) + (cdr exp) + combine)) + (t (error "Invalid expression: %s" exp)))) + (unitp (sregex--value-unitp val)) + (groups (sregex--value-groups val)) + (tree (sregex--value-tree val))) + (if (and combine (not unitp)) + (sregex--make-value t + (cons nil groups) + (cons 'group tree)) + (sregex--make-value unitp groups tree)))) + +(defun sregex--sequence (exps combine) + (if (= (length exps) 1) + (sregex--engine (car exps) combine) + (let ((groups nil) + (trees nil)) ;grows in reverse + (while exps + (let ((val (sregex--engine (car exps) 'concat))) + (setq groups (append groups + (sregex--value-groups val)) + trees (cons (sregex--value-tree val) trees) + exps (cdr exps)))) + (setq trees (nreverse trees)) + (if (eq combine 'suffix) + (sregex--make-value t + (cons nil groups) + (cons 'group + (cons 'sequence trees))) + (sregex--make-value (not (eq combine 'suffix)) + groups + (cons 'sequence trees)))))) + +(defun sregex--group (exps combine) + (let ((val (sregex--sequence exps nil))) + (sregex--make-value t + (cons t (sregex--value-groups val)) + (cons 'group (sregex--value-tree val))))) + +(defun sregex-backref-num (n &optional sregex) + "Adjust backreference number N according to SREGEX. +When `sregex' introduces parenthesized groups that the user didn't ask +for, the numbering of the groups that the user *did* ask for gets all +out of whack. This function accounts for introduced groups. Example: + + (sregexq (opt \"ab\") + (group (or \"c\" \"d\"))) => \"\\\\(ab\\\\)?\\\\(c\\\\|d\\\\)\" + (setq info (sregex-info)) + (sregex-backref-num 1 info) => 2 + +The SREGEX parameter is optional and defaults to the current value of +`sregex-info'." + (let ((groups (sregex--value-groups (or sregex + sregex--current-sregex))) + (result 0)) + (while (and groups (> n 0)) + (if (car groups) + (setq n (1- n))) + (setq result (1+ result) + groups (cdr groups))) + result)) + +(defun sregex--backref (exps combine) + (sregex--make-value t nil (cons 'backref (car exps)))) + +(defun sregex--any (combine) + (sregex--make-value t nil '(str . "."))) + +(defun sregex--opt (exps combine) + (let ((val (sregex--sequence exps 'suffix))) + (sregex--make-value t + (sregex--value-groups val) + (cons 'opt (sregex--value-tree val))))) + +(defun sregex--0+ (exps combine) + (let ((val (sregex--sequence exps 'suffix))) + (sregex--make-value t + (sregex--value-groups val) + (cons '0+ (sregex--value-tree val))))) +(defun sregex--1+ (exps combine) + (let ((val (sregex--sequence exps 'suffix))) + (sregex--make-value t + (sregex--value-groups val) + (cons '1+ (sregex--value-tree val))))) + +(defun sregex--repeat (exps combine) + (let ((min (or (car exps) 0)) + (max (car (cdr exps)))) + (setq exps (cdr (cdr exps))) + (cond ((zerop min) + (cond ((equal max 0) ;degenerate + (sregex--make-value t nil nil)) + ((equal max 1) + (sregex--opt exps combine)) + ((not max) + (sregex--0+ exps combine)) + (t (sregex--sequence (make-list max + (cons 'opt exps)) + combine)))) + ((= min 1) + (cond ((equal max 1) + (sregex--sequence exps combine)) + ((not max) + (sregex--1+ exps combine)) + (t (sregex--sequence (append exps + (make-list (1- max) + (cons 'opt exps))) + combine)))) + (t (sregex--sequence (append exps + (list (append (list 'repeat + (1- min) + (and max + (1- max))) + exps))) + combine))))) + +(defun sregex--or (exps combine) + (if (= (length exps) 1) + (sregex--engine (car exps) combine) + (let ((groups nil) + (trees nil)) + (while exps + (let ((val (sregex--engine (car exps) 'or))) + (setq groups (append groups + (sregex--value-groups val)) + trees (cons (sregex--value-tree val) trees) + exps (cdr exps)))) + (sregex--make-value (eq combine 'or) + groups + (cons 'or (nreverse trees)))))) + +(defmacro sregex--char-range-aux () + '(if start + (let (startc endc) + (if (and (<= 32 start) + (<= start 127)) + (setq startc (char-to-string start) + endc (char-to-string end)) + (setq startc (format "\\%03o" start) + endc (format "\\%03o" end))) + (if (> end start) + (if (> end (+ start 1)) + (setq class (concat class startc "-" endc)) + (setq class (concat class startc endc))) + (setq class (concat class startc)))))) + +(defmacro sregex--char-range (rstart rend) + `(let ((i ,rstart) + start end) + (while (<= i ,rend) + (if (aref chars i) + (progn + (if start + (setq end i) + (setq start i + end i)) + (aset chars i nil)) + (sregex--char-range-aux) + (setq start nil + end nil)) + (setq i (1+ i))) + (sregex--char-range-aux))) + +(defun sregex--char-aux (complement args) + (let ((chars (make-vector 256 nil))) + (while args + (let ((arg (car args))) + (cond ((integerp arg) + (aset chars arg t)) + ((stringp arg) + (mapcar (function + (lambda (c) + (aset chars c t))) + arg)) + ((consp arg) + (let ((start (car arg)) + (end (cdr arg))) + (if (> start end) + (let ((tmp start)) + (setq start end + end tmp))) + ;; now start <= end + (let ((i start)) + (while (<= i end) + (aset chars i t) + (setq i (1+ i)))))))) + (setq args (cdr args))) + ;; now chars is a map of the characters in the class + (let ((class "") + (caret (aref chars ?^))) + (aset chars ?^ nil) + (if (aref chars ?\]) + (progn + (setq class (concat class "]")) + (aset chars ?\] nil))) + (if (aref chars ?-) + (progn + (setq class (concat class "-")) + (aset chars ?- nil))) + (if (aref chars ?\\) + (progn + (setq class (concat class "\\\\")) + (aset chars ?\\ nil))) + + (sregex--char-range ?A ?Z) + (sregex--char-range ?a ?z) + (sregex--char-range ?0 ?9) + + (let ((i 32)) + (while (< i 128) + (if (aref chars i) + (progn + (setq class (concat class (char-to-string i))) + (aset chars i nil))) + (setq i (1+ i)))) + + (sregex--char-range 0 31) + (sregex--char-range 128 255) + + (let ((i 0)) + (while (< i 256) + (if (aref chars i) + (setq class (concat class (format "\\%03o" i)))) + (setq i (1+ i)))) + + (if caret + (setq class (concat class "^"))) + (concat "[" (if complement "^") class "]")))) + +(defun sregex--char (exps combine) + (sregex--make-value t nil (cons 'str (sregex--char-aux nil exps)))) +(defun sregex--not-char (exps combine) + (sregex--make-value t nil (cons 'str (sregex--char-aux t exps)))) + +(defun sregex--bol (combine) + (sregex--make-value t nil '(str . "^"))) +(defun sregex--eol (combine) + (sregex--make-value t nil '(str . "$"))) + +(defun sregex--wordchar (combine) + (sregex--make-value t nil '(str . "\\w"))) +(defun sregex--not-wordchar (combine) + (sregex--make-value t nil '(str . "\\W"))) + +(defun sregex--syntax (exps combine) + (sregex--make-value t nil (cons 'str (format "\\s%c" (car exps))))) +(defun sregex--not-syntax (exps combine) + (sregex--make-value t nil (cons 'str (format "\\S%c" (car exps))))) + +(defun sregex--bot (combine) + (sregex--make-value t nil (cons 'str "\\`"))) +(defun sregex--eot (combine) + (sregex--make-value t nil (cons 'str "\\'"))) + +(defun sregex--point (combine) + (sregex--make-value t nil '(str . "\\="))) + +(defun sregex--word-boundary (combine) + (sregex--make-value t nil '(str . "\\b"))) +(defun sregex--not-word-boundary (combine) + (sregex--make-value t nil '(str . "\\B"))) + +(defun sregex--bow (combine) + (sregex--make-value t nil '(str . "\\<"))) +(defun sregex--eow (combine) + (sregex--make-value t nil '(str . "\\>"))) + + +;; trapdoor - usage discouraged +(defun sregex--regex (exps combine) + (sregex--make-value nil nil (car exps))) + +(provide 'sregex) + +;;; sregex.el ends here + |