1
0
mirror of git://git.gnupg.org/gnupg.git synced 2025-01-25 15:27:03 +01:00
gnupg/regexp/jimregexp.c
NIIBE Yutaka efe993f4a6 regexp: Remove including Jim Tcl headers.
Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
2020-02-12 11:42:56 +09:00

1897 lines
45 KiB
C

/*
* vi:se ts=8:
*
* regcomp and regexec -- regsub and regerror are elsewhere
*
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
*
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* from defects in it.
*
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
*** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
*** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
*** to assist in implementing egrep.
*** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
*** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
*** as in BSD grep and ex.
*** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
*** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
*** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
*** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
*** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
*** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
*** was moved into regexp.h, and the include of regexp.h now uses "'s
*** to avoid conflicting with the system regexp.h. Const, bless its
*** soul, was removed so it can compile everywhere. The declaration
*** of strchr() was in conflict on AIX, so it was removed (as it is
*** happily defined in string.h).
*** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
*** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
*** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
*** seiwald@perforce.com, on 05 November 2002, to const string literals.
*
* THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
* on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
* This includes counted repetitions, UTF-8 support, character classes,
* shorthand character classes, increased number of parentheses to 100,
* backslash escape sequences. It also removes \n as an alternative to |.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*/
#if defined(JIM_REGEXP)
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "jimregexp.h"
#include "utf8.h"
/* An arbitrary limit, but this seems enough. Must be less than 1000.
 * Capture group n is encoded as opcode OPEN+n / CLOSE+n, and the OPEN and
 * CLOSE opcode ranges start only 1000 apart, so a larger limit would make
 * OPEN+n collide with the CLOSE range.
 */
#define REG_MAX_PAREN 100
/*
* Structure for regexp "program". This is essentially a linear encoding
* of a nondeterministic finite-state machine (aka syntax charts or
* "railroad normal form" in parsing technology). Each node is an opcode
* plus a "next" pointer, possibly plus an operand. "Next" pointers of
* all nodes except BRANCH implement concatenation; a "next" pointer with
* a BRANCH on both ends of it is connecting two alternatives. (Here we
* have one of the subtle syntax dependencies: an individual BRANCH (as
* opposed to a collection of them) is never concatenated with anything
* because of operator precedence.) The operand of some types of node is
* a literal string; for others, it is a node leading into a sub-FSM. In
* particular, the operand of a BRANCH node is the first node of the branch.
* (NB this is *not* a tree structure: the tail of the branch connects
* to the thing following the set of BRANCHes.) The opcodes are:
*/
/*
 * Opcode values. Each is stored in the first word of a program node.
 * The OPEN and CLOSE entries are range bases: reg() emits OPEN+n / CLOSE+n
 * for capture group n, and OPEN-1 / CLOSE-1 (OPENNC / CLOSENC) for a
 * non-capturing "(?:" group.
 */
/* definition number opnd? meaning */
#define END 0 /* no End of program. */
#define BOL 1 /* no Match "" at beginning of line. */
#define EOL 2 /* no Match "" at end of line. */
#define ANY 3 /* no Match any one character. */
#define ANYOF 4 /* str Match any character in this string. */
#define ANYBUT 5 /* str Match any character not in this string. */
#define BRANCH 6 /* node Match this alternative, or the next... */
#define BACK 7 /* no Match "", "next" ptr points backward. */
#define EXACTLY 8 /* str Match this string. */
#define NOTHING 9 /* no Match empty string. */
#define REP 10 /* max,min Match this (simple) thing [min,max] times. */
#define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
#define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
#define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
#define BOLX 14 /* no Match "" at beginning of input. */
#define EOLX 15 /* no Match "" at end of input. */
#define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
#define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
#define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
#define OPEN 1001 /* no Mark this point in input as start of #n. */
/* OPEN+1 is number 1, etc. */
/* must not be any other opts between OPEN and CLOSE */
#define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
#define CLOSE 2001 /* no Analogous to OPEN. */
#define CLOSE_END (CLOSE+REG_MAX_PAREN)
/*
* The first word of the regexp internal "program" is actually this magic
* number; the start node begins in the second word.
*
* regcomp() emits it before compiling anything, and regexec() refuses to
* run a program whose first word differs (REG_ERR_CORRUPTED).
*/
#define REG_MAGIC 0xFADED00D
/*
* Opcode notes:
*
* BRANCH The set of branches constituting a single choice are hooked
* together with their "next" pointers, since precedence prevents
* anything being concatenated to any individual branch. The
* "next" pointer of the last BRANCH in a choice points to the
* thing following the whole choice. This is also where the
* final "next" pointer of each individual branch points; each
* branch starts with the operand node of a BRANCH node.
*
* BACK Normal "next" pointers all implicitly point forward; BACK
* exists to make loop structures possible.
*
* REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
* as either simple repeats (REP) or complex repeats (REPX).
* These opcodes include a "min" and "max" count after the opcode.
* This is followed by a fourth "current count" word that is
* only used by REPX, as it implements a recursive match.
* REPMIN and REPXMIN are identical except they implement minimal repeats.
*
* OPEN,CLOSE ...are numbered at compile time.
*/
/*
* A node is one word of opcode followed by one word of "next" pointer.
* The "next" pointer value is a positive offset from the opcode of the node
* containing it.
* An operand, if any, simply follows the node. (Note that much of the
* code generation knows about this implicit relationship.)
*
* OP and NEXT index preg->program directly and do not parenthesize their
* 'preg' and 'p' arguments, so pass simple expressions only.
* OPERAND yields the offset of the first operand word of node 'p'.
*/
#define OP(preg, p) (preg->program[p])
#define NEXT(preg, p) (preg->program[p + 1])
#define OPERAND(p) ((p) + 2)
/*
* See regmagic.h for one further detail of program structure.
*/
/*
* Utility definitions.
*/
/* Record error code M in regex R and return it from the calling function.
 * Expands to a bare braced block (not do { } while (0)), so "FAIL(...);"
 * must not be followed directly by an 'else'.
 */
#define FAIL(R,M) { (R)->err = (M); return (M); }
/* True if 'c' starts a repetition operator. Evaluates 'c' several times. */
#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
/* Characters that end a run of literal characters in regatom(). */
#define META "^$.[()|?{+*"
/*
* Flags to be passed up and down.
*/
#define HASWIDTH 1 /* Known never to match null string. */
#define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
#define SPSTART 4 /* Starts with * or +. */
#define WORST 0 /* Worst case. */
#define MAX_REP_COUNT 1000000
/*
* Forward declarations for regcomp()'s friends.
*
* The int values passed and returned as 'p', 'opnd', 'val' or as a
* function result are word offsets into preg->program; 0 means
* "none"/failure.
*/
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
static int regpiece(regex_t *preg, int *flagp );
static int regbranch(regex_t *preg, int *flagp );
static int regatom(regex_t *preg, int *flagp );
static int regnode(regex_t *preg, int op );
static int regnext(regex_t *preg, int p );
static void regc(regex_t *preg, int b );
static int reginsert(regex_t *preg, int op, int size, int opnd );
static void regtail(regex_t *preg, int p, int val);
static void regoptail(regex_t *preg, int p, int val );
static int regopsize(regex_t *preg, int p );
static int reg_range_find(const int *string, int c);
static const char *str_find(const char *string, int c, int nocase);
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
/* Uncomment to compile in the debugging helpers declared below. */
/*#define DEBUG*/
#ifdef DEBUG
static int regnarrate = 0;
static void regdump(regex_t *preg);
static const char *regprop( int op );
#endif
/**
 * Counts the ints in 'seq' that precede its terminating 0.
 */
static int str_int_len(const int *seq)
{
    const int *cursor = seq;

    for (; *cursor != 0; cursor++) {
        /* just walk to the terminator */
    }
    return (int)(cursor - seq);
}
/*
 - regcomp - compile a regular expression into internal code
 *
 * The program is generated in a single pass into a heap buffer of ints.
 * The buffer is sized from the pattern length up front and enlarged on
 * demand by the emit helpers (via reg_grow()), so nodes are addressed by
 * word offset rather than by pointer.
 *
 * preg    is cleared and then filled in; it owns preg->program on success.
 * exp     is the pattern text; NULL yields REG_ERR_NULL_ARGUMENT.
 * cflags  is stored in preg->cflags and consulted during compile and match.
 *
 * Returns 0 on success, otherwise a REG_ERR_* code that is also stored in
 * preg->err. On failure the partially built program is released and
 * preg->program is left NULL, so nothing leaks if the caller does not call
 * regfree(), and a later free(preg->program) stays harmless.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
int regcomp(regex_t *preg, const char *exp, int cflags)
{
    int scan;
    int longest;
    unsigned len;
    int flags;

    memset(preg, 0, sizeof(*preg));

    if (exp == NULL)
        FAIL(preg, REG_ERR_NULL_ARGUMENT);

#ifdef DEBUG
    /* Only after the NULL check: "%s" must not be handed a NULL pointer. */
    fprintf(stderr, "Compiling: '%s'\n", exp);
#endif

    preg->cflags = cflags;
    preg->regparse = exp;

    /* Initial allocation; reg_grow() enlarges it when needed. */
    preg->proglen = (strlen(exp) + 1) * 5;
    preg->program = malloc(preg->proglen * sizeof(int));
    if (preg->program == NULL)
        FAIL(preg, REG_ERR_NOMEM);

    /* Note that since we store a magic value as the first item in the program,
     * program offsets will never be 0
     */
    regc(preg, REG_MAGIC);
    if (reg(preg, 0, &flags) == 0) {
        /* Compile error: preg->err was set by whoever detected it. */
        free(preg->program);
        preg->program = NULL;
        return preg->err;
    }

    /* Small enough for pointer-storage convention? */
    if (preg->re_nsub >= REG_MAX_PAREN) { /* Probably could be 65535L. */
        free(preg->program);
        preg->program = NULL;
        FAIL(preg, REG_ERR_TOO_BIG);
    }

    /* Dig out information for optimizations. */
    preg->regstart = 0; /* Worst-case defaults. */
    preg->reganch = 0;
    preg->regmust = 0;
    preg->regmlen = 0;
    scan = 1; /* First BRANCH. */
    if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
        scan = OPERAND(scan);

        /* Starting-point info. */
        if (OP(preg, scan) == EXACTLY) {
            preg->regstart = preg->program[OPERAND(scan)];
        }
        else if (OP(preg, scan) == BOL)
            preg->reganch++;

        /*
         * If there's something expensive in the r.e., find the
         * longest literal string that must appear and make it the
         * regmust. Resolve ties in favor of later strings, since
         * the regstart check works with the beginning of the r.e.
         * and avoiding duplication strengthens checking. Not a
         * strong reason, but sufficient in the absence of others.
         */
        if (flags&SPSTART) {
            longest = 0;
            len = 0;
            for (; scan != 0; scan = regnext(preg, scan)) {
                if (OP(preg, scan) == EXACTLY) {
                    int plen = str_int_len(preg->program + OPERAND(scan));
                    if (plen >= len) {
                        longest = OPERAND(scan);
                        len = plen;
                    }
                }
            }
            preg->regmust = longest;
            preg->regmlen = len;
        }
    }

#ifdef DEBUG
    regdump(preg);
#endif

    return 0;
}
/*
 - reg - regular expression, i.e. main body or parenthesized thing
 *
 * The caller has already consumed the opening parenthesis, if any.
 * 'paren' is non-zero for a parenthesized group; a leading "?:" makes the
 * group non-capturing (OPENNC/CLOSENC), otherwise it gets the next capture
 * number. Parses the '|'-separated branches, chains them together, and
 * hooks every branch tail to a closing node (CLOSE+n, or END at top level).
 *
 * Stores HASWIDTH/SPSTART information in *flagp.
 * Returns the offset of the first node, or 0 with preg->err set.
 */
static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
{
    int head = 0;       /* first node of the whole construct */
    int branch;
    int closer;
    int group = 0;      /* capture number, or -1 for "(?:" */
    int branch_flags;

    *flagp = HASWIDTH;  /* Tentatively. */

    /* Make an OPEN node, if parenthesized. */
    if (paren) {
        if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
            /* non-capturing paren */
            preg->regparse += 2;
            group = -1;
        }
        else {
            group = ++preg->re_nsub;
        }
        head = regnode(preg, OPEN+group);
    }

    /* Pick up the branches, linking them together. */
    for (;;) {
        branch = regbranch(preg, &branch_flags);
        if (branch == 0) {
            return 0;
        }
        if (head == 0) {
            /* Top level: the first branch is the start of the program. */
            head = branch;
        }
        else {
            /* OPEN -> first, or BRANCH -> BRANCH. */
            regtail(preg, head, branch);
        }
        if (!(branch_flags&HASWIDTH)) {
            *flagp &= ~HASWIDTH;
        }
        *flagp |= branch_flags&SPSTART;

        if (*preg->regparse != '|') {
            break;
        }
        preg->regparse++;
    }

    /* Make a closing node, and hook it on the end. */
    closer = regnode(preg, (paren) ? CLOSE+group : END);
    regtail(preg, head, closer);

    /* Hook the tails of the branches to the closing node. */
    branch = head;
    while (branch != 0) {
        regoptail(preg, branch, closer);
        branch = regnext(preg, branch);
    }

    /* Check for proper termination. */
    if (paren) {
        /* The character is consumed whether or not it is ')'. */
        if (*preg->regparse++ != ')') {
            preg->err = REG_ERR_UNMATCHED_PAREN;
            return 0;
        }
    }
    else if (*preg->regparse != '\0') {
        preg->err = (*preg->regparse == ')') ? REG_ERR_UNMATCHED_PAREN
                                             : REG_ERR_JUNK_ON_END;
        return 0;
    }

    return head;
}
/*
 - regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator: emits a BRANCH node and then
 * chains pieces until end of pattern, ')' or '|'. An empty alternative
 * gets a NOTHING node as its body.
 *
 * Stores the accumulated flags in *flagp (HASWIDTH if any piece has width,
 * SPSTART taken from the first piece only).
 * Returns the offset of the BRANCH node, or 0 if a piece failed to parse.
 */
static int regbranch(regex_t *preg, int *flagp )
{
    int head;
    int prev = 0;       /* previously parsed piece, 0 if none yet */
    int piece;
    int piece_flags;

    *flagp = WORST;     /* Tentatively. */

    head = regnode(preg, BRANCH);
    for (;;) {
        const char c = *preg->regparse;

        if (c == '\0' || c == ')' || c == '|') {
            break;
        }
        piece = regpiece(preg, &piece_flags);
        if (piece == 0) {
            return 0;
        }
        *flagp |= piece_flags&HASWIDTH;
        if (prev != 0) {
            regtail(preg, prev, piece);
        }
        else {
            /* First piece. */
            *flagp |= piece_flags&SPSTART;
        }
        prev = piece;
    }
    if (prev == 0) {
        /* Loop ran zero times. */
        (void) regnode(preg, NOTHING);
    }

    return head;
}
/*
- regpiece - something followed by possible [*+?]
*
* Note that the branching code sequences used for ? and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
*/
static int regpiece(regex_t *preg, int *flagp)
{
int ret;
char op;
int next;
int flags;
int min;
int max;
ret = regatom(preg, &flags);
if (ret == 0)
return 0;
op = *preg->regparse;
if (!ISMULT(op)) {
*flagp = flags;
return(ret);
}
if (!(flags&HASWIDTH) && op != '?') {
preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
return 0;
}
/* Handle braces (counted repetition) by expansion */
if (op == '{') {
char *end;
min = strtoul(preg->regparse + 1, &end, 10);
if (end == preg->regparse + 1) {
preg->err = REG_ERR_BAD_COUNT;
return 0;
}
if (*end == '}') {
max = min;
}
else if (*end == '\0') {
preg->err = REG_ERR_UNMATCHED_BRACES;
return 0;
}
else {
preg->regparse = end;
max = strtoul(preg->regparse + 1, &end, 10);
if (*end != '}') {
preg->err = REG_ERR_UNMATCHED_BRACES;
return 0;
}
}
if (end == preg->regparse + 1) {
max = MAX_REP_COUNT;
}
else if (max < min || max >= 100) {
preg->err = REG_ERR_BAD_COUNT;
return 0;
}
if (min >= 100) {
preg->err = REG_ERR_BAD_COUNT;
return 0;
}
preg->regparse = strchr(preg->regparse, '}');
}
else {
min = (op == '+');
max = (op == '?' ? 1 : MAX_REP_COUNT);
}
if (preg->regparse[1] == '?') {
preg->regparse++;
next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
}
else {
next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
}
preg->program[ret + 2] = max;
preg->program[ret + 3] = min;
preg->program[ret + 4] = 0;
*flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
if (!(flags & SIMPLE)) {
int back = regnode(preg, BACK);
regtail(preg, back, ret);
regtail(preg, next, back);
}
preg->regparse++;
if (ISMULT(*preg->regparse)) {
preg->err = REG_ERR_NESTED_COUNT;
return 0;
}
return ret;
}
/**
 * Add all characters in the inclusive range between lower and upper.
 *
 * The range is emitted into the program as two words: length, then start.
 *
 * Handles a swapped range (upper < lower) by exchanging the bounds, so
 * exactly one well-formed pair is emitted. (Previously the swapped call
 * fell through and also emitted the unswapped pair, whose length is zero
 * or negative; a zero length word, as for "[b-a]", reads as the set
 * terminator.)
 */
static void reg_addrange(regex_t *preg, int lower, int upper)
{
    if (lower > upper) {
        int tmp = lower;

        lower = upper;
        upper = tmp;
    }
    /* Add a range as length, start */
    regc(preg, upper - lower + 1);
    regc(preg, lower);
}
/**
 * Adds every char of the null-terminated string 'str' to the current set,
 * each as its own single-character range.
 */
static void reg_addrange_str(regex_t *preg, const char *str)
{
    const char *p;

    for (p = str; *p; p++) {
        reg_addrange(preg, *p, *p);
    }
}
/**
 * Decodes the next char at 's' with utf8_tounicode() into *uc.
 *
 * When 'upper' is non-zero the decoded char is passed through utf8_upper().
 *
 * Returns the length reported by utf8_tounicode().
 */
static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
{
    const int consumed = utf8_tounicode(s, uc);

    if (!upper) {
        return consumed;
    }
    *uc = utf8_upper(*uc);
    return consumed;
}
/**
 * Maps a hex digit character ('0'-'9', 'a'-'f', 'A'-'F') to its value 0-15.
 *
 * Returns -1 if 'c' is not a hex digit.
 */
static int hexdigitval(int c)
{
    int value = -1;

    if (c >= 'A' && c <= 'F') {
        value = 10 + (c - 'A');
    }
    else if (c >= 'a' && c <= 'f') {
        value = 10 + (c - 'a');
    }
    else if (c >= '0' && c <= '9') {
        value = c - '0';
    }
    return value;
}
/**
 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 *
 * Parsing stops at the first char that is not a hex digit.
 *
 * Returns the number of hex digits parsed.
 * If there are no hex digits, returns 0 and stores nothing.
 */
static int parse_hex(const char *s, int n, int *uc)
{
    /* Accumulate as unsigned: with 8 digits (as used for \U) the shift
     * would otherwise overflow a signed int, which is undefined behaviour.
     */
    unsigned int val = 0;
    int k;

    for (k = 0; k < n; k++) {
        int c = hexdigitval(*s++);
        if (c == -1) {
            break;
        }
        val = (val << 4) | (unsigned int)c;
    }
    if (k) {
        *uc = (int)val;
    }
    return k;
}
/**
 * Decodes the escape sequence that follows a backslash.
 *
 * 's' points at the char just after the backslash. The decoded char is
 * stored in *ch:
 *   - b e f n r t v  give the corresponding control char;
 *   - u{N...} (1 to 6 hex digits, value up to 0x1fffff), uNNNN, UNNNNNNNN
 *     and xNN give the hex value; if no hex digits follow, or the braced
 *     form is malformed, the letter itself is the result;
 *   - a terminating null gives a literal backslash and consumes nothing;
 *   - any other char stands for itself.
 *
 * Returns the number of bytes consumed.
 */
static int reg_decode_escape(const char *s, int *ch)
{
    const char *p = s + 1;      /* just past the escape letter */
    int width = 0;              /* max hex digits for the plain hex forms */
    int digits;

    *ch = s[0];
    switch (*ch) {
        case 'b': *ch = '\b'; break;
        case 'e': *ch = 27; break;
        case 'f': *ch = '\f'; break;
        case 'n': *ch = '\n'; break;
        case 'r': *ch = '\r'; break;
        case 't': *ch = '\t'; break;
        case 'v': *ch = '\v'; break;
        case 'u':
            if (*p == '{') {
                /* Expect \u{NNNN} */
                digits = parse_hex(p + 1, 6, ch);
                if (digits > 0 && p[digits + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
                    /* Skip the '{', the digits and the '}' */
                    p += digits + 2;
                }
                else {
                    /* Invalid, so just treat as an escaped 'u' */
                    *ch = 'u';
                }
                break;
            }
            width = 4;
            break;
        case 'U':
            width = 8;
            break;
        case 'x':
            width = 2;
            break;
        case '\0':
            /* Nothing follows the backslash */
            p = s;
            *ch = '\\';
            break;
    }
    if (width != 0) {
        /* Without any hex digit, *ch keeps the escape letter */
        digits = parse_hex(p, width, ch);
        if (digits > 0) {
            p += digits;
        }
    }
    return (int)(p - s);
}
/*
- regatom - the lowest level
*
* Parses one atom at preg->regparse and emits its node(s):
* an anchor (^ $ \A \Z \< \> \m \M), '.', a bracketed set, a parenthesized
* group (via reg()), a shorthand class (\d \D \w \W \s \S), or a run of
* literal characters.
*
* Optimization: gobbles an entire sequence of ordinary characters so that
* it can turn them into a single node, which is smaller to store and
* faster to run.
*
* Stores HASWIDTH/SIMPLE/SPSTART information in *flagp.
* Returns the offset of the emitted node, or 0 with preg->err set.
*/
static int regatom(regex_t *preg, int *flagp)
{
int ret;
int flags;
int nocase = (preg->cflags & REG_ICASE);
int ch;
/* Decode the first char; it is upper-cased when REG_ICASE is set */
int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
*flagp = WORST; /* Tentatively. */
preg->regparse += n;
switch (ch) {
/* FIXME: these chars only have meaning at beg/end of pat? */
case '^':
ret = regnode(preg, BOL);
break;
case '$':
ret = regnode(preg, EOL);
break;
case '.':
ret = regnode(preg, ANY);
*flagp |= HASWIDTH|SIMPLE;
break;
case '[': {
/* A set: ANYOF/ANYBUT node followed by (length, start) range pairs
 * and a terminating 0 word.
 */
const char *pattern = preg->regparse;
if (*pattern == '^') { /* Complement of range. */
ret = regnode(preg, ANYBUT);
pattern++;
} else
ret = regnode(preg, ANYOF);
/* Special case. If the first char is ']' or '-', it is part of the set */
if (*pattern == ']' || *pattern == '-') {
reg_addrange(preg, *pattern, *pattern);
pattern++;
}
/* NOTE(review): the loop also stops at end of pattern, so an
 * unterminated set is not reported as an error here.
 */
while (*pattern && *pattern != ']') {
/* Is this a range? a-z */
int start;
int end;
enum {
CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
CC_NUM
};
int cc;
pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
if (start == '\\') {
/* First check for class shorthand escapes */
switch (*pattern) {
case 's':
pattern++;
cc = CC_SPACE;
goto cc_switch;
case 'd':
pattern++;
cc = CC_DIGIT;
goto cc_switch;
case 'w':
pattern++;
/* \w is alnum plus underscore */
reg_addrange(preg, '_', '_');
cc = CC_ALNUM;
goto cc_switch;
}
/* Any other escape decodes to a single char */
pattern += reg_decode_escape(pattern, &start);
if (start == 0) {
preg->err = REG_ERR_NULL_CHAR;
return 0;
}
}
/* A '-' that is followed by something other than ']' makes a range */
if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
/* skip '-' */
pattern += utf8_tounicode(pattern, &end);
pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
if (end == '\\') {
pattern += reg_decode_escape(pattern, &end);
if (end == 0) {
preg->err = REG_ERR_NULL_CHAR;
return 0;
}
}
reg_addrange(preg, start, end);
continue;
}
if (start == '[' && pattern[0] == ':') {
/* Names are listed in the same order as the CC_* enum above */
static const char *character_class[] = {
":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
};
for (cc = 0; cc < CC_NUM; cc++) {
n = strlen(character_class[cc]);
if (strncmp(pattern, character_class[cc], n) == 0) {
/* Found a character class */
/* Skip the name plus one more char (presumably the
 * closing ']' of "[:name:]"; it is not checked)
 */
pattern += n + 1;
break;
}
}
if (cc != CC_NUM) {
cc_switch:
switch (cc) {
case CC_ALNUM:
reg_addrange(preg, '0', '9');
/* Fall through */
case CC_ALPHA:
/* With REG_ICASE only upper case is needed: the matcher
 * upper-cases input chars before comparing
 */
if ((preg->cflags & REG_ICASE) == 0) {
reg_addrange(preg, 'a', 'z');
}
reg_addrange(preg, 'A', 'Z');
break;
case CC_SPACE:
reg_addrange_str(preg, " \t\r\n\f\v");
break;
case CC_BLANK:
reg_addrange_str(preg, " \t");
break;
case CC_UPPER:
reg_addrange(preg, 'A', 'Z');
break;
case CC_LOWER:
/* NOTE(review): with REG_ICASE the input is upper-cased, so this
 * lower-case-only range looks unable to match - confirm
 */
reg_addrange(preg, 'a', 'z');
break;
case CC_XDIGIT:
reg_addrange(preg, 'a', 'f');
reg_addrange(preg, 'A', 'F');
/* Fall through */
case CC_DIGIT:
reg_addrange(preg, '0', '9');
break;
case CC_CNTRL:
reg_addrange(preg, 0, 31);
reg_addrange(preg, 127, 127);
break;
case CC_PRINT:
reg_addrange(preg, ' ', '~');
break;
case CC_GRAPH:
reg_addrange(preg, '!', '~');
break;
case CC_PUNCT:
reg_addrange(preg, '!', '/');
reg_addrange(preg, ':', '@');
reg_addrange(preg, '[', '`');
reg_addrange(preg, '{', '~');
break;
}
continue;
}
}
/* Not a range, so just add the char */
reg_addrange(preg, start, start);
}
/* Terminate the list of ranges */
regc(preg, '\0');
/* Step over the closing ']' if there is one */
if (*pattern) {
pattern++;
}
preg->regparse = pattern;
*flagp |= HASWIDTH|SIMPLE;
}
break;
case '(':
/* The '(' has been consumed; reg() parses up to and including ')' */
ret = reg(preg, 1, &flags);
if (ret == 0)
return 0;
*flagp |= flags&(HASWIDTH|SPSTART);
break;
case '\0':
case '|':
case ')':
preg->err = REG_ERR_INTERNAL;
return 0; /* Supposed to be caught earlier. */
case '?':
case '+':
case '*':
case '{':
preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
return 0;
case '\\':
/* Look at the byte after the backslash */
ch = *preg->regparse++;
switch (ch) {
case '\0':
preg->err = REG_ERR_TRAILING_BACKSLASH;
return 0;
case 'A':
ret = regnode(preg, BOLX);
break;
case 'Z':
ret = regnode(preg, EOLX);
break;
case '<':
case 'm':
ret = regnode(preg, WORDA);
break;
case '>':
case 'M':
ret = regnode(preg, WORDZ);
break;
case 'd':
case 'D':
ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
reg_addrange(preg, '0', '9');
regc(preg, '\0');
*flagp |= HASWIDTH|SIMPLE;
break;
case 'w':
case 'W':
ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
if ((preg->cflags & REG_ICASE) == 0) {
reg_addrange(preg, 'a', 'z');
}
reg_addrange(preg, 'A', 'Z');
reg_addrange(preg, '0', '9');
reg_addrange(preg, '_', '_');
regc(preg, '\0');
*flagp |= HASWIDTH|SIMPLE;
break;
case 's':
case 'S':
ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
reg_addrange_str(preg," \t\r\n\f\v");
regc(preg, '\0');
*flagp |= HASWIDTH|SIMPLE;
break;
/* FIXME: Someday handle \1, \2, ... */
default:
/* Handle general quoted chars in exact-match routine */
/* Back up to include the backslash */
preg->regparse--;
goto de_fault;
}
break;
de_fault:
default: {
/*
* Encode a string of characters to be matched exactly.
*/
int added = 0;
/* Back up to pick up the first char of interest */
/* ('n' is still the length of the char decoded at function entry) */
preg->regparse -= n;
ret = regnode(preg, EXACTLY);
/* Note that a META operator such as ? or * consumes the
* preceding char.
* Thus we must be careful to look ahead by 2 and add the
* last char as its own EXACTLY if necessary
*/
/* Until end of string or a META char is reached */
while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
if (ch == '\\' && preg->regparse[n]) {
/* Non-trailing backslash.
* Is this a special escape, or a regular escape?
*/
if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
/* A special escape. All done with EXACTLY */
break;
}
/* Decode it. Note that we add the length for the escape
* sequence to the length for the backslash so we can skip
* the entire sequence, or not as required.
*/
n += reg_decode_escape(preg->regparse + n, &ch);
if (ch == 0) {
preg->err = REG_ERR_NULL_CHAR;
return 0;
}
}
/* Now we have one char 'ch' of length 'n'.
* Check to see if the following char is a MULT
*/
if (ISMULT(preg->regparse[n])) {
/* Yes. But do we already have some EXACTLY chars? */
if (added) {
/* Yes, so return what we have and pick up the current char next time around */
break;
}
/* No, so add this single char and finish */
regc(preg, ch);
added++;
preg->regparse += n;
break;
}
/* No, so just add this char normally */
regc(preg, ch);
added++;
preg->regparse += n;
}
/* Terminate the literal string */
regc(preg, '\0');
*flagp |= HASWIDTH;
/* Only a single-char literal can be the operand of a simple repeat */
if (added == 1)
*flagp |= SIMPLE;
break;
}
break;
}
return(ret);
}
/*
 - reg_grow - make room for 'n' more words in the program buffer
 *
 * When the words would reach the current capacity, the buffer is
 * reallocated to twice the required size.
 *
 * NOTE(review): the realloc() result is stored unchecked, so on allocation
 * failure preg->program becomes NULL and the old buffer is lost. The void
 * interface gives no way to report this to the callers.
 */
static void reg_grow(regex_t *preg, int n)
{
    if (preg->p + n < preg->proglen) {
        /* Still fits */
        return;
    }
    preg->proglen = (preg->p + n) * 2;
    preg->program = realloc(preg->program, preg->proglen * sizeof(int));
}
/*
 - regnode - emit a node
 *
 * Appends a two-word node to the program: the opcode 'op' and a "next"
 * pointer that starts out as 0.
 *
 * Returns the offset of the new node.
 */
static int regnode(regex_t *preg, int op)
{
    int start;

    reg_grow(preg, 2);

    start = preg->p;
    preg->program[start] = op;      /* the OP ... */
    preg->program[start + 1] = 0;   /* ... followed by a next pointer */
    preg->p = start + 2;

    return start;
}
/*
 - regc - emit one word of code
 *
 * Appends 'b' at the current end of the program, growing the buffer first
 * if necessary.
 */
static void regc(regex_t *preg, int b )
{
    reg_grow(preg, 1);
    preg->program[preg->p] = b;
    preg->p += 1;
}
/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Opens a gap of 'size' words at offset 'opnd' by moving everything from
 * there to the end of the program upwards. The gap is zeroed and its first
 * word is set to 'op'.
 *
 * Returns the new location of the original operand.
 */
static int reginsert(regex_t *preg, int op, int size, int opnd )
{
    int *gap;

    reg_grow(preg, size);

    /* Take the address only now: growing may have moved the buffer */
    gap = preg->program + opnd;

    /* Move everything from opnd up */
    memmove(gap + size, gap, sizeof(int) * (preg->p - opnd));
    /* Zero out the new space */
    memset(gap, 0, sizeof(int) * size);

    gap[0] = op;
    preg->p += size;

    return opnd + size;
}
/*
 - regtail - set the next-pointer at the end of a node chain
 *
 * Follows the chain that starts at node 'p' to its last node and makes
 * that node's "next" refer to 'val'. The distance is stored as a positive
 * offset: forward for ordinary nodes, backward for a BACK node.
 */
static void regtail(regex_t *preg, int p, int val)
{
    int last = p;
    int following;

    /* Find last node. */
    while ((following = regnext(preg, last)) != 0) {
        last = following;
    }

    preg->program[last + 1] = (OP(preg, last) == BACK) ? last - val : val - last;
}
/*
 - regoptail - regtail on operand of first argument; nop if operandless
 *
 * Only a BRANCH node is treated as having an operand chain here; any other
 * node, and offset 0, is ignored.
 */
static void regoptail(regex_t *preg, int p, int val )
{
    /* "Operandless" and "op != BRANCH" are synonymous in practice. */
    if (p == 0) {
        return;
    }
    if (OP(preg, p) != BRANCH) {
        return;
    }
    regtail(preg, OPERAND(p), val);
}
/*
* regexec and friends
*/
/*
* Forwards.
*
* regtry() and regmatch() return 1 on success and 0 on failure.
*/
static int regtry(regex_t *preg, const char *string );
static int regmatch(regex_t *preg, int prog);
static int regrepeat(regex_t *preg, int p, int max);
/*
- regexec - match a regexp against a string
*
* preg    compiled expression; its match state fields are overwritten.
* string  null-terminated subject text.
* nmatch  number of entries available in pmatch[].
* pmatch  receives match offsets, relative to 'string'.
* eflags  stored in preg->eflags; REG_NOTBOL is examined here.
*
* Returns REG_NOERROR on a match, REG_NOMATCH if there is none,
* REG_ERR_NULL_ARGUMENT for a NULL preg, program or string, and
* REG_ERR_CORRUPTED if the program does not start with REG_MAGIC.
*/
int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
{
const char *s;
int scan;
/* Be paranoid... */
if (preg == NULL || preg->program == NULL || string == NULL) {
return REG_ERR_NULL_ARGUMENT;
}
/* Check validity of program. */
if (*preg->program != REG_MAGIC) {
return REG_ERR_CORRUPTED;
}
#ifdef DEBUG
fprintf(stderr, "regexec: %s\n", string);
regdump(preg);
#endif
preg->eflags = eflags;
preg->pmatch = pmatch;
preg->nmatch = nmatch;
preg->start = string; /* All offsets are computed from here */
/* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
/* (walks the nodes in storage order, starting at the operand of the
 * first BRANCH, until the END node)
 */
for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
int op = OP(preg, scan);
if (op == END)
break;
if (op == REPX || op == REPXMIN)
preg->program[scan + 4] = 0;
}
/* If there is a "must appear" string, look for it. */
if (preg->regmust != 0) {
s = string;
/* Find each occurrence of its first char, then compare the rest */
while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
break;
}
s++;
}
if (s == NULL) /* Not present. */
return REG_NOMATCH;
}
/* Mark beginning of line for ^ . */
preg->regbol = string;
/* Simplest case: anchored match need be tried only once (maybe per line). */
if (preg->reganch) {
if (eflags & REG_NOTBOL) {
/* This is an anchored search, but not at BOL, so possibly skip to the next line */
/* (jumps into the loop below, past the first match attempt) */
goto nextline;
}
while (1) {
if (regtry(preg, string)) {
return REG_NOERROR;
}
if (*string) {
nextline:
if (preg->cflags & REG_NEWLINE) {
/* Try the next anchor? */
string = strchr(string, '\n');
if (string) {
/* The char after the newline becomes the new line start */
preg->regbol = ++string;
continue;
}
}
}
return REG_NOMATCH;
}
}
/* Messy cases: unanchored match. */
s = string;
if (preg->regstart != '\0') {
/* We know what char it must start with. */
while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
if (regtry(preg, s))
return REG_NOERROR;
s++;
}
}
else
/* We don't -- general case. */
/* Try at every char position, including the terminating null */
while (1) {
if (regtry(preg, s))
return REG_NOERROR;
if (*s == '\0') {
break;
}
else {
int c;
s += utf8_tounicode(s, &c);
}
}
/* Failure. */
return REG_NOMATCH;
}
/*
 - regtry - try match at specific point
 *
 * Resets all preg->nmatch entries of preg->pmatch to -1 and runs the
 * program from its first node (offset 1, just past the magic word) with
 * the input positioned at 'string'. On success the overall match is
 * recorded in pmatch[0] as offsets from preg->start.
 *
 * pmatch[0] is written only when the caller provided room for it
 * (nmatch > 0); with nmatch == 0 the pmatch array may be NULL and is not
 * touched here.
 *
 * Returns 0 on failure, 1 on success.
 */
static int regtry( regex_t *preg, const char *string )
{
    int i;

    preg->reginput = string;

    for (i = 0; i < preg->nmatch; i++) {
        preg->pmatch[i].rm_so = -1;
        preg->pmatch[i].rm_eo = -1;
    }

    if (!regmatch(preg, 1)) {
        return 0;
    }

    if (preg->nmatch > 0) {
        preg->pmatch[0].rm_so = string - preg->start;
        preg->pmatch[0].rm_eo = preg->reginput - preg->start;
    }
    return 1;
}
/**
 * Checks whether the first 'proglen' program chars at 'prog' form a prefix
 * of 'string'.
 *
 * If 'nocase' is non-zero, each char of 'string' is upper-cased before it
 * is compared.
 *
 * Returns the number of bytes of 'string' that were matched, or -1 if the
 * chars differ or 'string' ends first.
 */
static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
{
    const char *cursor = string;
    int remaining;

    for (remaining = proglen; remaining != 0 && *cursor != '\0'; remaining--) {
        int ch;
        int n = reg_utf8_tounicode_case(cursor, &ch, nocase);

        if (ch != *prog++) {
            return -1;
        }
        cursor += n;
    }
    return (remaining == 0) ? (int)(cursor - string) : -1;
}
/**
 * Searches for 'c' in the set 'range'.
 *
 * The set is a sequence of (length, start) pairs that ends at the first
 * pair whose length word is 0.
 *
 * Returns 1 if found, or 0 if not.
 */
static int reg_range_find(const int *range, int c)
{
    const int *pair;

    for (pair = range; pair[0] != 0; pair += 2) {
        if (c < pair[1]) {
            /* Below the start of this range */
            continue;
        }
        if (c <= pair[0] + pair[1] - 1) {
            return 1;
        }
    }
    return 0;
}
/**
 * Searches for the character 'c' in the utf-8 string 'string'.
 *
 * If 'nocase' is set, both 'c' and each char decoded from 'string' are
 * converted to uppercase before they are compared.
 *
 * Returns a pointer to the position in the string where 'c' was found, or
 * NULL if not found.
 */
static const char *str_find(const char *string, int c, int nocase)
{
    const char *p = string;
    const int target = nocase ? utf8_upper(c) : c;

    while (*p != '\0') {
        int ch;
        int n = reg_utf8_tounicode_case(p, &ch, nocase);

        if (ch == target) {
            return p;
        }
        p += n;
    }
    return NULL;
}
/**
 * Returns true if 'ch' is an end-of-line char.
 *
 * A null char always counts; '\n' counts as well when the expression was
 * compiled with REG_NEWLINE.
 */
static int reg_iseol(regex_t *preg, int ch)
{
    if (ch == '\0') {
        return 1;
    }
    return (preg->cflags & REG_NEWLINE) && ch == '\n';
}
/**
 * Matches a simple repeat node (REP or REPMIN) at 'scan' and then the rest
 * of the program.
 *
 * regrepeat() first reports how many times the operand (at scan + 5) can
 * match, up to the node's max. The remainder of the program is then tried
 * with each candidate count: from the greatest down to min for a maximal
 * match, or from min upwards when 'matchmin' is set.
 *
 * Returns 1 if some count lets the rest of the program match, else 0.
 */
static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
{
    const int max = preg->program[scan + 2];
    const int min = preg->program[scan + 3];
    const int next = regnext(preg, scan);
    const int step = matchmin ? 1 : -1;
    const char *start;
    int nextch = '\0';
    int available;
    int count;
    int limit;

    /*
     * Lookahead to avoid useless match attempts
     * when we know what character comes next.
     */
    if (OP(preg, next) == EXACTLY) {
        nextch = preg->program[OPERAND(next)];
    }

    start = preg->reginput;
    available = regrepeat(preg, scan + 5, max);
    if (available < min) {
        return 0;
    }

    if (matchmin) {
        /* from min up to what is available */
        count = min;
        limit = available;
    }
    else {
        /* from what is available down to min */
        count = available;
        limit = min;
    }

    for (;; count += step) {
        int c;

        if (matchmin ? (count > limit) : (count < limit)) {
            break;
        }
        preg->reginput = start + utf8_index(start, count);
        reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
        /* If it could work, try it. */
        if ((reg_iseol(preg, nextch) || c == nextch) && regmatch(preg, next)) {
            return 1;
        }
        /* Couldn't or didn't -- move on to the next count */
    }
    return 0;
}
/**
 * Attempt one further iteration of the repeat node at 'scan'.
 *
 * Bumps the node's iteration counter (word 'scan + 4'), tries to match the
 * repeated body at 'scan + 5' and undoes the bump if that fails.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int regmatchrepeat_iterate(regex_t *preg, int scan)
{
	int *counter = preg->program + scan + 4;

	++*counter;
	if (regmatch(preg, scan + 5)) {
		return 1;
	}
	--*counter;
	return 0;
}

/**
 * Match a REPX/REPXMIN node: a counted repeat of a general sub-expression.
 *
 * The node stores the maximum count at 'scan + 2', the minimum count at
 * 'scan + 3' and the number of iterations matched so far at 'scan + 4'.
 * With 'matchmin' set the continuation after the repeat is preferred over
 * another iteration; otherwise another iteration is preferred.
 *
 * Returns non-zero on a match, 0 otherwise.
 */
static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
{
	const int *node = preg->program + scan;
	const int upper = node[2];
	const int lower = node[3];

	/* Below the minimum the only option is another iteration */
	if (node[4] < lower) {
		return regmatchrepeat_iterate(preg, scan);
	}
	if (node[4] > upper) {
		return 0;
	}

	if (matchmin) {
		/* Minimal: the continuation goes first, one more iteration second */
		if (regmatch(preg, regnext(preg, scan))) {
			return 1;
		}
		return regmatchrepeat_iterate(preg, scan);
	}

	/* Maximal: another iteration goes first, as long as max allows it */
	if (node[4] < upper && regmatchrepeat_iterate(preg, scan)) {
		return 1;
	}
	/* No further iteration worked, so try the continuation */
	return regmatch(preg, regnext(preg, scan));
}
/*
 - regmatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 *
 * 'prog' is the index in preg->program of the first node to match.
 * Matching starts at preg->reginput, which is advanced as input is
 * consumed. Capture offsets are recorded in preg->pmatch relative to
 * preg->start.
 *
 * Returns 0 on failure, non-zero on success.
 */
static int regmatch(regex_t *preg, int prog)
{
	int scan;	/* Current node. */
	int next;	/* Next node. */
	const char *save;

	scan = prog;

#ifdef DEBUG
	/* regprop() expects an opcode, not a node index */
	if (scan != 0 && regnarrate)
		fprintf(stderr, "%s(\n", regprop(OP(preg, scan)));
#endif
	while (scan != 0) {
		int n;
		int c;
#ifdef DEBUG
		if (regnarrate) {
			fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan)));	/* Where, what. */
		}
#endif
		next = regnext(preg, scan);
		/* Decode the character at the current input position once up front */
		n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));

		switch (OP(preg, scan)) {
			case BOLX:
				if ((preg->eflags & REG_NOTBOL)) {
					return(0);
				}
				/* Fall through */
			case BOL:
				if (preg->reginput != preg->regbol) {
					return(0);
				}
				break;
			case EOLX:
				if (c != 0) {
					/* For EOLX, only match real end of line, not newline */
					return 0;
				}
				break;
			case EOL:
				if (!reg_iseol(preg, c)) {
					return(0);
				}
				break;
			case WORDA:
				/* Must be looking at a letter, digit, or _ */
				if ((!isalnum(UCHAR(c))) && c != '_')
					return(0);
				/* Prev must be BOL or nonword */
				if (preg->reginput > preg->regbol &&
					(isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
					return(0);
				break;
			case WORDZ:
				/* Can't match at BOL */
				if (preg->reginput > preg->regbol) {
					/*
					 * Current must be EOL or nonword, i.e. neither
					 * alphanumeric nor '_'. (Testing
					 * "!isalnum(c) || c != '_'" would be true for every
					 * character and let \> match in the middle of a word.)
					 */
					if (reg_iseol(preg, c) || !(isalnum(UCHAR(c)) || c == '_')) {
						c = preg->reginput[-1];
						/* Previous must be word */
						if (isalnum(UCHAR(c)) || c == '_') {
							break;
						}
					}
				}
				/* No */
				return(0);

			case ANY:
				if (reg_iseol(preg, c))
					return 0;
				preg->reginput += n;
				break;
			case EXACTLY: {
					int opnd;
					int len;
					int slen;

					opnd = OPERAND(scan);
					len = str_int_len(preg->program + opnd);

					/* A negative result from prefix_cmp() means the literal did not match */
					slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
					if (slen < 0) {
						return(0);
					}
					preg->reginput += slen;
				}
				break;
			case ANYOF:
				if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
					return(0);
				}
				preg->reginput += n;
				break;
			case ANYBUT:
				if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
					return(0);
				}
				preg->reginput += n;
				break;
			case NOTHING:
				break;
			case BACK:
				break;
			case BRANCH:
				if (OP(preg, next) != BRANCH)		/* No choice. */
					next = OPERAND(scan);	/* Avoid recursion. */
				else {
					/* Try each alternative in turn, rewinding the input in between */
					do {
						save = preg->reginput;
						if (regmatch(preg, OPERAND(scan))) {
							return(1);
						}
						preg->reginput = save;
						scan = regnext(preg, scan);
					} while (scan != 0 && OP(preg, scan) == BRANCH);
					return(0);
					/* NOTREACHED */
				}
				break;
			case REP:
			case REPMIN:
				return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);

			case REPX:
			case REPXMIN:
				return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);

			case END:
				return 1;	/* Success! */

			case OPENNC:
			case CLOSENC:
				/* Non-capturing group markers: nothing to record */
				return regmatch(preg, next);

			default:
				if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
					save = preg->reginput;
					if (regmatch(preg, next)) {
						/*
						 * The rest matched, so record this group boundary
						 * unless a deeper recursion level already set it.
						 */
						if (OP(preg, scan) < CLOSE) {
							int no = OP(preg, scan) - OPEN;
							if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
								preg->pmatch[no].rm_so = save - preg->start;
							}
						}
						else {
							int no = OP(preg, scan) - CLOSE;
							if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
								preg->pmatch[no].rm_eo = save - preg->start;
							}
						}
						return(1);
					}
					/* Restore input position after failure */
					preg->reginput = save;
					return(0);
				}
				/*
				 * Unknown opcode.
				 * NOTE(review): REG_ERR_INTERNAL is presumably non-zero, so
				 * callers that only test for truth treat this as a match --
				 * confirm whether that is intended.
				 */
				return REG_ERR_INTERNAL;
		}

		scan = next;
	}

	/*
	 * We get here only if there's trouble -- normally "case END" is
	 * the terminating point.
	 */
	return REG_ERR_INTERNAL;
}
/*
 - regrepeat - repeatedly match something simple, report how many
 *
 * 'p' is the node index of a single-character operand (ANY, EXACTLY,
 * ANYOF or ANYBUT). Starting at preg->reginput, consecutive characters
 * are matched against it, at most 'max' times.
 *
 * preg->reginput is left just past the last character matched and the
 * number of characters matched is returned. For any other opcode
 * preg->err is set to REG_ERR_INTERNAL and 0 is returned.
 */
static int regrepeat(regex_t *preg, int p, int max)
{
	const char *cursor = preg->reginput;
	const int opnd = OPERAND(p);
	const int opcode = OP(preg, p);
	int count = 0;
	int decoded;
	int width;

	switch (opcode) {
	case ANY:
		/* Anything up to the end of the line */
		for (; count < max && !reg_iseol(preg, *cursor); count++) {
			cursor += utf8_charlen(*cursor);
		}
		break;

	case EXACTLY:
		/* Only the first character of the literal is repeated */
		for (; count < max; count++) {
			width = reg_utf8_tounicode_case(cursor, &decoded, preg->cflags & REG_ICASE);
			if (decoded != preg->program[opnd]) {
				break;
			}
			cursor += width;
		}
		break;

	case ANYOF:
	case ANYBUT: {
		/* ANYOF wants the character inside the set, ANYBUT outside it */
		const int want_member = (opcode == ANYOF);

		for (; count < max; count++) {
			width = reg_utf8_tounicode_case(cursor, &decoded, preg->cflags & REG_ICASE);
			if (reg_iseol(preg, decoded)) {
				break;
			}
			if ((reg_range_find(preg->program + opnd, decoded) != 0) != want_member) {
				break;
			}
			cursor += width;
		}
		break;
	}

	default:		/* Oh dear.  Called inappropriately. */
		preg->err = REG_ERR_INTERNAL;
		count = 0;	/* Best compromise. */
		break;
	}

	preg->reginput = cursor;
	return count;
}
/*
 - regnext - dig the "next" pointer out of a node
 *
 * The "next" field of node 'p' holds a relative offset. It is applied
 * backwards for BACK nodes and forwards for every other node.
 *
 * Returns the index of the following node, or 0 when the offset is 0.
 */
static int regnext(regex_t *preg, int p )
{
	const int delta = NEXT(preg, p);

	if (delta == 0) {
		return 0;
	}
	return (OP(preg, p) == BACK) ? (p - delta) : (p + delta);
}
/*
 - regopsize - returns the size of opcode + operands at 'p' in words
 *
 * Repeat nodes occupy 5 words. ANYOF, ANYBUT and EXACTLY nodes carry a
 * zero-terminated operand after their 2-word header, and the terminator
 * is counted. Every other node occupies 2 words.
 */
static int regopsize(regex_t *preg, int p )
{
	int end;

	switch (OP(preg, p)) {
	case REP:
	case REPMIN:
	case REPX:
	case REPXMIN:
		return 5;

	case ANYOF:
	case ANYBUT:
	case EXACTLY:
		/* Find the zero word that terminates the operand */
		end = p + 2;
		while (preg->program[end] != 0) {
			end++;
		}
		/* +1 so that the terminator itself is included */
		return end + 1 - p;

	default:
		/* Almost all opcodes are 2 words */
		return 2;
	}
}
#if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
/*
 - regdump - dump a regexp onto stdout in vaguely comprehensible form
 *
 * Prints a hex dump of the program words first (low byte of each word
 * only), then one line per node with its index, opcode name, next-node
 * index and operands, and finally the header fields (start character,
 * anchoring, "must have" string) when the listing ended on END.
 */
static void regdump(regex_t *preg)
{
	char text[MAX_UTF8_LEN + 1];
	int op = EXACTLY;	/* Arbitrary non-END op. */
	int pos;
	int idx;

	/* Raw dump, 16 words per row */
	for (idx = 1; idx < preg->p; idx++) {
		printf("%02x ", (unsigned char)preg->program[idx]);
		if (idx % 16 == 0) {
			printf("\n");
		}
	}
	printf("\n");

	/* Node by node listing, until END has been printed */
	for (pos = 1; op != END && pos < preg->p; ) {
		op = OP(preg, pos);
		printf("%3d: %s", pos, regprop(op));	/* Where, what. */
		printf("(%d)", regnext(preg, pos));	/* Next ptr. */
		pos += 2;

		switch (op) {
		case REP:
		case REPMIN:
		case REPX:
		case REPXMIN: {
			const int upper = preg->program[pos];
			const int lower = preg->program[pos + 1];

			/* 65535 is shown as an open-ended upper bound */
			if (upper == 65535) {
				printf("{%d,*}", lower);
			}
			else {
				printf("{%d,%d}", lower, upper);
			}
			printf(" %d", preg->program[pos + 2]);
			pos += 3;
			break;
		}

		case ANYOF:
		case ANYBUT:
			/* set of ranges, stored as (length, first char) pairs */
			while (preg->program[pos]) {
				const int span = preg->program[pos++];
				const int lo = preg->program[pos++];

				text[utf8_getchars(text, lo)] = 0;
				printf("%s", text);
				if (span > 1) {
					text[utf8_getchars(text, lo + span - 1)] = 0;
					printf("-%s", text);
				}
			}
			pos++;
			break;

		case EXACTLY:
			/* Literal string, where present. */
			for (; preg->program[pos]; pos++) {
				text[utf8_getchars(text, preg->program[pos])] = 0;
				printf("%s", text);
			}
			pos++;
			break;

		default:
			break;
		}
		putchar('\n');
	}

	if (op == END) {
		/* Header fields of interest. */
		if (preg->regstart) {
			text[utf8_getchars(text, preg->regstart)] = 0;
			printf("start '%s' ", text);
		}
		if (preg->reganch) {
			printf("anchored ");
		}
		if (preg->regmust != 0) {
			printf("must have:");
			for (idx = 0; idx < preg->regmlen; idx++) {
				putchar(preg->program[preg->regmust + idx]);
			}
			putchar('\n');
		}
	}
	printf("\n");
}
/*
 - regprop - printable representation of opcode
 *
 * Returns a string literal for the fixed opcodes. Numbered OPEN/CLOSE
 * opcodes and unknown values are formatted into a static buffer, so that
 * result is overwritten by the next such call.
 */
static const char *regprop( int op )
{
	static char buf[50];
	const char *name = NULL;

	switch (op) {
	case BOL:	name = "BOL";		break;
	case EOL:	name = "EOL";		break;
	case BOLX:	name = "BOLX";		break;
	case EOLX:	name = "EOLX";		break;
	case ANY:	name = "ANY";		break;
	case ANYOF:	name = "ANYOF";		break;
	case ANYBUT:	name = "ANYBUT";	break;
	case BRANCH:	name = "BRANCH";	break;
	case EXACTLY:	name = "EXACTLY";	break;
	case NOTHING:	name = "NOTHING";	break;
	case BACK:	name = "BACK";		break;
	case END:	name = "END";		break;
	case REP:	name = "REP";		break;
	case REPMIN:	name = "REPMIN";	break;
	case REPX:	name = "REPX";		break;
	case REPXMIN:	name = "REPXMIN";	break;
	case WORDA:	name = "WORDA";		break;
	case WORDZ:	name = "WORDZ";		break;
	case OPENNC:	name = "OPEN";		break;
	case CLOSENC:	name = "CLOSE";		break;
	default:
		break;
	}
	if (name != NULL) {
		return name;
	}

	/* Not a fixed opcode: a numbered group marker or something unknown */
	if (op >= OPEN && op < CLOSE) {
		snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
	}
	else if (op >= CLOSE && op < CLOSE_END) {
		snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
	}
	else {
		snprintf(buf, sizeof(buf), "?%d?\n", op);
	}
	return buf;
}
#endif /* JIM_BOOTSTRAP */
/**
 * Translate a regcomp()/regexec() error code into a message.
 *
 * errcode:     the code to describe; values without a table entry
 *              yield "Bad error code".
 * preg:        unused.
 * errbuf:      receives the message, truncated if necessary and always
 *              NUL-terminated when errbuf_size is non-zero.
 * errbuf_size: capacity of errbuf in bytes; may be 0 to query the size.
 *
 * Returns the number of bytes needed to hold the complete message
 * including its terminating NUL, as POSIX specifies for regerror(),
 * or 0 if formatting failed.
 */
size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
{
	static const char *const error_strings[] = {
		"success",
		"no match",
		"bad pattern",
		"null argument",
		"unknown error",
		"too big",
		"out of memory",
		"too many ()",
		"parentheses () not balanced",
		"braces {} not balanced",
		"invalid repetition count(s)",
		"extra characters",
		"*+ of empty atom",
		"nested count",
		"internal error",
		"count follows nothing",
		"trailing backslash",
		"corrupted program",
		"contains null char",
	};
	const int known = (int)(sizeof(error_strings) / sizeof(error_strings[0]));
	const char *err;
	int len;

	(void)preg;

	/*
	 * Bound the lookup by the table itself as well as by REG_ERR_NUM so
	 * that the two getting out of step can never read past the array.
	 */
	if (errcode < 0 || errcode >= REG_ERR_NUM || errcode >= known) {
		err = "Bad error code";
	}
	else {
		err = error_strings[errcode];
	}

	/* snprintf reports the length without the NUL; callers need it included */
	len = snprintf(errbuf, errbuf_size, "%s", err);
	if (len < 0) {
		return 0;
	}
	return (size_t)len + 1;
}
/**
 * Release the compiled program owned by 'preg'.
 *
 * The program pointer is cleared afterwards, so calling regfree() again on
 * the same object is harmless instead of a double free. A NULL 'preg' is
 * ignored. The regex_t object itself is not freed.
 */
void regfree(regex_t *preg)
{
	if (preg == NULL) {
		return;
	}
	free(preg->program);
	preg->program = NULL;
}
#endif