regexp: Add regexp module from Jim Tcl.

-- From Jim Tcl (version 0.79+) master commit of: 7a101ca903c44be10a692e7264b3160115edf7cf Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
2025-07-14 21:47:19 +02:00 · 2020-02-12 11:05:38 +09:00 · 2020-02-12 11:05:38 +09:00 · c2d1511f0b
commit c2d1511f0b
parent 9c719c9c1f
5 changed files with 2465 additions and 0 deletions
--- a/regexp/LICENSE
+++ b/regexp/LICENSE
@ -0,0 +1,45 @@
+Unless explicitly stated, all files within Jim repository are released
+under following license:
+
+/* Jim - A small embeddable Tcl interpreter
+ *
+ * Copyright 2005 Salvatore Sanfilippo <antirez@invece.org>
+ * Copyright 2005 Clemens Hintze <c.hintze@gmx.net>
+ * Copyright 2005 patthoyts - Pat Thoyts <patthoyts@users.sf.net>
+ * Copyright 2008 oharboe - Øyvind Harboe - oyvind.harboe@zylin.com
+ * Copyright 2008 Andrew Lunn <andrew@lunn.ch>
+ * Copyright 2008 Duane Ellis <openocd@duaneellis.com>
+ * Copyright 2008 Uwe Klein <uklein@klein-messgeraete.de>
+ * Copyright 2008 Steve Bennett <steveb@workware.net.au>
+ * Copyright 2009 Nico Coesel <ncoesel@dealogic.nl>
+ * Copyright 2009 Zachary T Welch zw@superlucidity.net
+ * Copyright 2009 David Brownell
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation
+ * are those of the authors and should not be interpreted as representing
+ * official policies, either expressed or implied, of the Jim Tcl Project.
+ */
--- a/regexp/jimregexp.c
+++ b/regexp/jimregexp.c
--- a/regexp/jimregexp.h
+++ b/regexp/jimregexp.h
@ -0,0 +1,109 @@
+#ifndef JIMREGEXP_H
+#define JIMREGEXP_H
+
+/** regexp(3)-compatible regular expression implementation for Jim.
+ *
+ * See jimregexp.c for details
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+
+typedef struct {
+	int rm_so;	/* NOTE(review): presumably the start offset of a (sub)match, as in POSIX regmatch_t -- confirm in jimregexp.c */
+	int rm_eo;	/* NOTE(review): presumably the end offset of a (sub)match -- confirm in jimregexp.c */
+} regmatch_t;
+
+/*
+ * The "internal use only" fields in regexp.h are present to pass info from
+ * compile to execute that permits the execute phase to run lots faster on
+ * simple cases.  They are:
+ *
+ * regstart	char that must begin a match; '\0' if none obvious
+ * reganch	is the match anchored (at beginning-of-line only)?
+ * regmust	string (pointer into program) that match must include, or NULL
+ * regmlen	length of regmust string
+ *
+ * Regstart and reganch permit very fast decisions on suitable starting points
+ * for a match, cutting down the work a lot.  Regmust permits fast rejection
+ * of lines that cannot possibly match.  The regmust tests are costly enough
+ * that regcomp() supplies a regmust only if the r.e. contains something
+ * potentially expensive (at present, the only such thing detected is * or +
+ * at the start of the r.e., which can involve a lot of backup).  Regmlen is
+ * supplied because the test in regexec() needs it and regcomp() is computing
+ * it anyway.
+ */
+
+typedef struct regexp {
+	/* -- public -- */
+	int re_nsub;		/* number of parenthesized subexpressions */
+
+	/* -- private -- */
+	int cflags;			/* Flags used when compiling */
+	int err;			/* Any error which occurred during compile */
+	int regstart;		/* Internal use only: char that must begin a match; '\0' if none obvious */
+	int reganch;		/* Internal use only: is the match anchored (at beginning-of-line only)? */
+	int regmust;		/* Internal use only: string the match must include. NOTE(review): stored as an int, so presumably an offset into program rather than a pointer -- confirm in jimregexp.c */
+	int regmlen;		/* Internal use only: length of the regmust string */
+	int *program;		/* Allocated */
+
+	/* working state - compile */
+	const char *regparse;		/* Input-scan pointer. */
+	int p;				/* Current output pos in program */
+	int proglen;		/* Allocated program size */
+
+	/* working state - exec */
+	int eflags;				/* Flags used when executing */
+	const char *start;		/* Initial string pointer. */
+	const char *reginput;	/* Current input pointer. */
+	const char *regbol;		/* Beginning of input, for ^ check. */
+
+	/* Input to regexec() */
+	regmatch_t *pmatch;		/* submatches will be stored here */
+	int nmatch;				/* size of pmatch[] */
+} regexp;
+
+typedef regexp regex_t;
+
+#define REG_EXTENDED 0 /* value 0: sets no bit, so it cannot be detected by masking a flags word */
+#define REG_NEWLINE 1
+#define REG_ICASE 2
+
+#define REG_NOTBOL 16 /* NOTE(review): kept apart from the flags above; presumably an exec-time (eflags) flag -- confirm */
+
+enum {
+	REG_NOERROR,      /* Success.  */
+	REG_NOMATCH,      /* Didn't find a match (for regexec).  */
+	REG_BADPAT,		  /* >= REG_BADPAT is an error */
+	REG_ERR_NULL_ARGUMENT,
+	REG_ERR_UNKNOWN,
+	REG_ERR_TOO_BIG,
+	REG_ERR_NOMEM,
+	REG_ERR_TOO_MANY_PAREN,
+	REG_ERR_UNMATCHED_PAREN,
+	REG_ERR_UNMATCHED_BRACES,
+	REG_ERR_BAD_COUNT,
+	REG_ERR_JUNK_ON_END,
+	REG_ERR_OPERAND_COULD_BE_EMPTY,
+	REG_ERR_NESTED_COUNT,
+	REG_ERR_INTERNAL,
+	REG_ERR_COUNT_FOLLOWS_NOTHING,
+	REG_ERR_TRAILING_BACKSLASH,
+	REG_ERR_CORRUPTED,
+	REG_ERR_NULL_CHAR,
+	REG_ERR_NUM /* last enumerator: equals the number of codes listed above it; keep it last */
+};
+
+int regcomp(regex_t *preg, const char *regex, int cflags);
+int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
+size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size);
+void regfree(regex_t *preg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/regexp/utf8.c
+++ b/regexp/utf8.c
@ -0,0 +1,262 @@
+/**
+ * UTF-8 utility functions
+ *
+ * (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
+ *
+ * See LICENSE for licence details.
+ */
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utf8.h"
+
+/* This one is always implemented */
+int utf8_fromunicode(char *p, unsigned uc)
+{
+    /* Encode 'uc' at 'p' using the shortest form that holds the value and
+     * return the number of bytes written (1-4). No terminator is added. */
+    if (uc > 0xffff) {
+        /* Only the low 21 bits (0x1fffff) are encoded; higher bits are dropped */
+        p[0] = 0xf0 | ((uc >> 18) & 0x07);
+        p[1] = 0x80 | ((uc >> 12) & 0x3f);
+        p[2] = 0x80 | ((uc >> 6) & 0x3f);
+        p[3] = 0x80 | (uc & 0x3f);
+        return 4;
+    }
+    if (uc > 0x7ff) {
+        p[0] = 0xe0 | ((uc >> 12) & 0x0f);
+        p[1] = 0x80 | ((uc >> 6) & 0x3f);
+        p[2] = 0x80 | (uc & 0x3f);
+        return 3;
+    }
+    if (uc > 0x7f) {
+        p[0] = 0xc0 | ((uc >> 6) & 0x1f);
+        p[1] = 0x80 | (uc & 0x3f);
+        return 2;
+    }
+    /* Plain ASCII is stored as-is */
+    p[0] = uc;
+    return 1;
+}
+
+#if defined(USE_UTF8) && !defined(JIM_BOOTSTRAP)
+int utf8_charlen(int c)
+{
+    /* Classify a lead byte by its high bits. Anything that is not a 2-, 3-
+     * or 4-byte lead (ASCII, continuation bytes, 0xf8 and above) counts as
+     * a single byte. */
+    if ((c & 0xf8) == 0xf0) {
+        return 4;
+    }
+    if ((c & 0xf0) == 0xe0) {
+        return 3;
+    }
+    if ((c & 0xe0) == 0xc0) {
+        return 2;
+    }
+    return 1;
+}
+
+int utf8_strlen(const char *str, int bytelen)
+{
+    int remaining = bytelen;
+    int count;
+
+    /* A negative byte length means "use the whole nul-terminated string" */
+    if (remaining < 0) {
+        remaining = strlen(str);
+    }
+    /* Decode one character per iteration until the byte budget is used up */
+    for (count = 0; remaining > 0; count++) {
+        int codepoint;
+        int used = utf8_tounicode(str, &codepoint);
+        str += used;
+        remaining -= used;
+    }
+    return count;
+}
+
+int utf8_strwidth(const char *str, int charlen)
+{
+    int total = 0;
+
+    /* Add up the display width of each of the first 'charlen' characters */
+    for (; charlen; charlen--) {
+        int codepoint;
+        int used = utf8_tounicode(str, &codepoint);
+        total += utf8_width(codepoint);
+        str += used;
+    }
+    return total;
+}
+
+int utf8_index(const char *str, int index)
+{
+    const char *pos;
+
+    /* Step over 'index' characters, trusting each lead byte's length;
+     * the loop itself does not look for the terminating nul. */
+    for (pos = str; index != 0; index--) {
+        pos += utf8_charlen(*pos);
+    }
+    return pos - str;
+}
+
+int utf8_prev_len(const char *str, int len)
+{
+    int back;
+
+    assert(len > 0);
+
+    /* Walk backwards from str[-1], stopping at the first byte that is not
+     * a continuation byte (10xxxxxx), i.e. an ASCII byte or a lead byte.
+     * At most len-1 bytes are examined; if none of them starts a character
+     * the result is len. */
+    for (back = 1; back < len; back++) {
+        if ((str[-back] & 0xc0) != 0x80) {
+            break;
+        }
+    }
+    return back;
+}
+
+int utf8_tounicode(const char *str, int *uc)
+{
+    /* Decode one code point from 'str' into *uc and return the number of
+     * bytes consumed. Truncated, overlong or otherwise malformed sequences
+     * yield the first byte as-is with a length of 1. Continuation bytes are
+     * tested left to right with &&, so a terminating nul (never a
+     * continuation byte) stops the decoder from reading past it. */
+    const unsigned char *b = (const unsigned char *)str;
+    const int lead = b[0];
+    int cp;
+
+    if (lead < 0xc0) {
+        /* ASCII, or a stray continuation byte */
+        *uc = lead;
+        return 1;
+    }
+
+    if (lead < 0xe0) {
+        /* 110xxxxx 10xxxxxx */
+        if ((b[1] & 0xc0) == 0x80) {
+            cp = ((lead & 0x1f) << 6) | (b[1] & 0x3f);
+            if (cp >= 0x80) {
+                *uc = cp;
+                return 2;
+            }
+            /* overlong encoding: fall through to the single-byte result */
+        }
+    }
+    else if (lead < 0xf0) {
+        /* 1110xxxx 10xxxxxx 10xxxxxx */
+        if ((b[1] & 0xc0) == 0x80 && (b[2] & 0xc0) == 0x80) {
+            cp = ((lead & 0x0f) << 12) | ((b[1] & 0x3f) << 6) | (b[2] & 0x3f);
+            if (cp >= 0x800) {
+                *uc = cp;
+                return 3;
+            }
+        }
+    }
+    else if (lead < 0xf8) {
+        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+        if ((b[1] & 0xc0) == 0x80 && (b[2] & 0xc0) == 0x80 && (b[3] & 0xc0) == 0x80) {
+            cp = ((lead & 0x07) << 18) | ((b[1] & 0x3f) << 12) | ((b[2] & 0x3f) << 6) | (b[3] & 0x3f);
+            if (cp >= 0x10000) {
+                *uc = cp;
+                return 4;
+            }
+        }
+    }
+
+    /* Invalid sequence: hand back the raw byte */
+    *uc = lead;
+    return 1;
+}
+
+struct casemap {
+    unsigned short code;        /* code point being looked up (the key compared by cmp_casemap) */
+    unsigned short altcode;     /* alternate case code point, returned by utf8_map_case() on a hit */
+};
+
+struct utf8range {
+    unsigned lower;     /* first code point of the range (inclusive) */
+    unsigned upper;     /* one past the last code point of the range (exclusive) */
+};
+
+
+/* Generated mapping tables */
+#include "_unicode_mapping.c"
+
+/* Number of elements in the array A (A must be a real array, not a pointer).
+ * The whole expansion is parenthesized so it can be used inside larger
+ * expressions such as `n / ARRAYSIZE(a)` without changing their meaning. */
+#define ARRAYSIZE(A) (sizeof(A) / sizeof(*(A)))
+
+/* bsearch() comparator: 'key' points to the int code point being looked up,
+ * 'cm' to a struct casemap entry. Entries are ordered by their 'code'. */
+static int cmp_casemap(const void *key, const void *cm)
+{
+    const struct casemap *entry = cm;
+    int wanted = *(const int *)key;
+
+    return wanted - (int)entry->code;
+}
+
+/* Search the 'num'-entry table 'mapping' for 'ch' with bsearch() and return
+ * the entry's alternate-case code point, or 'ch' itself when there is none. */
+static int utf8_map_case(const struct casemap *mapping, int num, int ch)
+{
+    const struct casemap *hit;
+
+    /* Only 16 bit case mapping is supported; larger values pass through */
+    if (ch > 0xffff) {
+        return ch;
+    }
+
+    hit = bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap);
+    return hit ? hit->altcode : ch;
+}
+
+/* bsearch() comparator: orders the unsigned code point at 'key' against the
+ * half-open range [lower, upper) at 'cm'. Returns 0 when it lies inside. */
+static int cmp_range(const void *key, const void *cm)
+{
+    const struct utf8range *r = cm;
+    unsigned cp = *(unsigned *)key;
+
+    if (cp >= r->lower && cp < r->upper) {
+        return 0;
+    }
+    return (cp < r->lower) ? -1 : 1;
+}
+
+/* Returns 1 if 'ch' falls inside any of the 'num' ranges in 'range'
+ * (searched with bsearch()), otherwise 0. */
+static int utf8_in_range(const struct utf8range *range, int num, int ch)
+{
+    return bsearch(&ch, range, num, sizeof(*range), cmp_range) != NULL;
+}
+
+int utf8_upper(int ch)
+{
+    /* ASCII goes through the C library; everything else uses the generated table */
+    return isascii(ch)
+        ? toupper(ch)
+        : utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch);
+}
+
+int utf8_lower(int ch)
+{
+    /* ASCII goes through the C library; everything else uses the generated table */
+    return isascii(ch)
+        ? tolower(ch)
+        : utf8_map_case(unicode_case_mapping_lower, ARRAYSIZE(unicode_case_mapping_lower), ch);
+}
+
+int utf8_title(int ch)
+{
+    int mapped;
+
+    /* ASCII has no separate title case */
+    if (isascii(ch)) {
+        return utf8_upper(ch);
+    }
+
+    mapped = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
+    if (mapped == ch) {
+        /* No title-case entry (or it maps to itself): use upper case */
+        return utf8_upper(ch);
+    }
+    /* A table entry of 0 leaves the character unchanged */
+    return mapped ? mapped : ch;
+}
+
+int utf8_width(int ch)
+{
+    /* ASCII is always one column wide */
+    if (isascii(ch)) {
+        return 1;
+    }
+    /* Combining characters take no column; the combining table is checked first */
+    if (utf8_in_range(unicode_range_combining, ARRAYSIZE(unicode_range_combining), ch)) {
+        return 0;
+    }
+    /* Wide characters take two columns */
+    if (utf8_in_range(unicode_range_wide, ARRAYSIZE(unicode_range_wide), ch)) {
+        return 2;
+    }
+    return 1;
+}
+
+#endif /* USE_UTF8 && !JIM_BOOTSTRAP */
--- a/regexp/utf8.h
+++ b/regexp/utf8.h
@ -0,0 +1,150 @@
+#ifndef UTF8_UTIL_H
+#define UTF8_UTIL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * UTF-8 utility functions
+ *
+ * (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
+ *
+ * See LICENSE for licence details.
+ */
+#include <jim-config.h>
+
+/* Currently we support unicode points up to 2^21-1 (0x1fffff) */
+#define MAX_UTF8_LEN 4
+
+/**
+ * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
+ * and stores the result at 'p'.
+ *
+ * Returns the number of bytes written (up to MAX_UTF8_LEN).
+ */
+int utf8_fromunicode(char *p, unsigned uc);
+
+#ifndef JIM_UTF8
+#include <ctype.h>
+#include <string.h> /* strlen(), used by the utf8_strlen() macro below */
+
+/*
+ * No utf-8 support. 1 byte = 1 char
+ *
+ * Every helper collapses to a trivial macro:
+ * - lengths and indexes are plain byte counts
+ * - utf8_tounicode() and utf8_getchars() copy a single byte and evaluate to 1
+ * - case conversion goes straight to toupper()/tolower()
+ *
+ * Several macros ignore some of their arguments, so side effects in those
+ * arguments are not evaluated.
+ */
+#define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B))
+#define utf8_strwidth(S, B) utf8_strlen((S), (B))
+#define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
+#define utf8_getchars(CP, C) (*(CP) = (C), 1)
+#define utf8_upper(C) toupper(C)
+#define utf8_title(C) toupper(C)
+#define utf8_lower(C) tolower(C)
+#define utf8_index(C, I) (I)
+#define utf8_charlen(C) 1
+#define utf8_prev_len(S, L) 1
+#define utf8_width(C) 1
+
+#else
+#if !defined(JIM_BOOTSTRAP)
+
+#define utf8_getchars utf8_fromunicode
+
+/**
+ * Returns the length of the utf-8 sequence starting with 'c'.
+ *
+ * Returns 1-4.
+ * If 'c' is not a valid start byte, returns 1.
+ */
+int utf8_charlen(int c);
+
+/**
+ * Returns the number of characters in the utf-8
+ * string of the given byte length.
+ *
+ * Any bytes which are not part of a valid utf-8
+ * sequence are treated as individual characters.
+ *
+ * The string *must* be null terminated.
+ *
+ * Does not support unicode code points > \u1fffff
+ */
+int utf8_strlen(const char *str, int bytelen);
+
+/**
+ * Calculates the display width of the first 'charlen' characters in 'str'.
+ * See utf8_width()
+ */
+int utf8_strwidth(const char *str, int charlen);
+
+/**
+ * Returns the byte index of the given character in the utf-8 string.
+ *
+ * The string *must* be null terminated.
+ *
+ * This will return the byte length of a utf-8 string
+ * if given the char length.
+ */
+int utf8_index(const char *str, int charindex);
+
+/**
+ * Returns the unicode codepoint corresponding to the
+ * utf-8 sequence 'str'.
+ *
+ * Stores the result in *uc and returns the number of bytes
+ * consumed.
+ *
+ * If 'str' is null terminated, then an invalid utf-8 sequence
+ * at the end of the string will be returned as individual bytes.
+ *
+ * If it is not null terminated, the length *must* be checked first.
+ *
+ * Does not support unicode code points > \u1fffff
+ */
+int utf8_tounicode(const char *str, int *uc);
+
+/**
+ * Returns the number of bytes before 'str' that the previous
+ * utf-8 character sequence starts (which may be the middle of a sequence).
+ *
+ * Looks back at most 'len' bytes backwards, which must be > 0.
+ * If no start char is found within that distance, returns len.
+ */
+int utf8_prev_len(const char *str, int len);
+
+/**
+ * Returns the upper-case variant of the given unicode codepoint.
+ *
+ * Unicode code points > \uffff are returned unchanged.
+ */
+int utf8_upper(int uc);
+
+/**
+ * Returns the title-case variant of the given unicode codepoint.
+ *
+ * If none, returns utf8_upper().
+ *
+ * Unicode code points > \uffff are returned unchanged.
+ */
+int utf8_title(int uc);
+
+/**
+ * Returns the lower-case variant of the given unicode codepoint.
+ *
+ * NOTE: Use utf8_upper() in preference for case-insensitive matching.
+ *
+ * Unicode code points > \uffff are returned unchanged.
+ */
+int utf8_lower(int uc);
+
+/**
+ * Returns the width (in characters) of the given unicode codepoint.
+ * This is 1 for normal letters and 0 for combining characters and 2 for wide characters.
+ */
+int utf8_width(int ch);
+
+#endif /* JIM_BOOTSTRAP */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif