1
0
mirror of git://git.gnupg.org/gnupg.git synced 2024-06-29 02:22:45 +02:00

regexp: Add regexp module from Jim Tcl.

--

From Jim Tcl (version 0.79+) master commit of:

	7a101ca903c44be10a692e7264b3160115edf7cf

Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
This commit is contained in:
NIIBE Yutaka 2020-02-12 11:05:38 +09:00
parent 9c719c9c1f
commit c2d1511f0b
5 changed files with 2465 additions and 0 deletions

45
regexp/LICENSE Normal file
View File

@ -0,0 +1,45 @@
Unless explicitly stated, all files within Jim repository are released
under following license:
/* Jim - A small embeddable Tcl interpreter
*
* Copyright 2005 Salvatore Sanfilippo <antirez@invece.org>
* Copyright 2005 Clemens Hintze <c.hintze@gmx.net>
* Copyright 2005 patthoyts - Pat Thoyts <patthoyts@users.sf.net>
* Copyright 2008 oharboe - Øyvind Harboe - oyvind.harboe@zylin.com
* Copyright 2008 Andrew Lunn <andrew@lunn.ch>
* Copyright 2008 Duane Ellis <openocd@duaneellis.com>
* Copyright 2008 Uwe Klein <uklein@klein-messgeraete.de>
* Copyright 2008 Steve Bennett <steveb@workware.net.au>
* Copyright 2009 Nico Coesel <ncoesel@dealogic.nl>
* Copyright 2009 Zachary T Welch zw@superlucidity.net
* Copyright 2009 David Brownell
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the Jim Tcl Project.
*/

1899
regexp/jimregexp.c Normal file

File diff suppressed because it is too large Load Diff

109
regexp/jimregexp.h Normal file
View File

@ -0,0 +1,109 @@
#ifndef JIMREGEXP_H
#define JIMREGEXP_H
/** regexp(3)-compatible regular expression implementation for Jim.
*
* See jimregexp.c for details
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
/*
 * Start/end offsets of one match or sub-match, reported through the
 * pmatch[] argument of regexec().
 * NOTE(review): presumably follows the POSIX regmatch_t convention
 * (rm_so = offset of first matched byte, rm_eo = offset just past the
 * match) - confirm against jimregexp.c.
 */
typedef struct {
int rm_so;
int rm_eo;
} regmatch_t;
/*
* The "internal use only" fields in regexp.h are present to pass info from
* compile to execute that permits the execute phase to run lots faster on
* simple cases. They are:
*
* regstart char that must begin a match; '\0' if none obvious
* reganch is the match anchored (at beginning-of-line only)?
* regmust string (pointer into program) that match must include, or NULL
* regmlen length of regmust string
*
* Regstart and reganch permit very fast decisions on suitable starting points
* for a match, cutting down the work a lot. Regmust permits fast rejection
* of lines that cannot possibly match. The regmust tests are costly enough
* that regcomp() supplies a regmust only if the r.e. contains something
* potentially expensive (at present, the only such thing detected is * or +
* at the start of the r.e., which can involve a lot of backup). Regmlen is
* supplied because the test in regexec() needs it and regcomp() is computing
* it anyway.
*/
/*
 * A compiled regular expression together with the scratch state used while
 * compiling and executing it.  Only re_nsub is marked public; every other
 * field is implementation detail.
 *
 * NOTE(review): regmust is declared as an int here, so it is presumably an
 * index into program[] rather than a real pointer - confirm in jimregexp.c.
 */
typedef struct regexp {
/* -- public -- */
int re_nsub; /* number of parenthesized subexpressions */
/* -- private -- */
int cflags; /* Flags used when compiling */
int err; /* Any error which occurred during compile */
int regstart; /* Internal use only. */
int reganch; /* Internal use only. */
int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
int *program; /* Allocated */
/* working state - compile */
const char *regparse; /* Input-scan pointer. */
int p; /* Current output pos in program */
int proglen; /* Allocated program size */
/* working state - exec */
int eflags; /* Flags used when executing */
const char *start; /* Initial string pointer. */
const char *reginput; /* Current input pointer. */
const char *regbol; /* Beginning of input, for ^ check. */
/* Input to regexec() */
regmatch_t *pmatch; /* submatches will be stored here */
int nmatch; /* size of pmatch[] */
} regexp;
/* POSIX-style name for the same type; this is the name the API prototypes use. */
typedef regexp regex_t;
/*
 * Flag bits for the cflags/eflags arguments.
 * REG_EXTENDED is 0: it contributes no bit, so it can be OR-ed in for source
 * compatibility but cannot be tested for with '&'.
 * NOTE(review): presumably REG_NEWLINE and REG_ICASE are compile-time flags
 * for regcomp() and REG_NOTBOL is an execution flag for regexec(), as in
 * POSIX - confirm against jimregexp.c.
 */
#define REG_EXTENDED 0
#define REG_NEWLINE 1
#define REG_ICASE 2
#define REG_NOTBOL 16
/*
 * Status codes.  The enumerators are numbered consecutively from
 * REG_NOERROR == 0; every value >= REG_BADPAT denotes an error.
 * REG_ERR_NUM comes last, so it equals the number of codes defined and is
 * not itself a status.
 */
enum {
REG_NOERROR, /* Success. */
REG_NOMATCH, /* Didn't find a match (for regexec). */
REG_BADPAT, /* >= REG_BADPAT is an error */
REG_ERR_NULL_ARGUMENT,
REG_ERR_UNKNOWN,
REG_ERR_TOO_BIG,
REG_ERR_NOMEM,
REG_ERR_TOO_MANY_PAREN,
REG_ERR_UNMATCHED_PAREN,
REG_ERR_UNMATCHED_BRACES,
REG_ERR_BAD_COUNT,
REG_ERR_JUNK_ON_END,
REG_ERR_OPERAND_COULD_BE_EMPTY,
REG_ERR_NESTED_COUNT,
REG_ERR_INTERNAL,
REG_ERR_COUNT_FOLLOWS_NOTHING,
REG_ERR_TRAILING_BACKSLASH,
REG_ERR_CORRUPTED,
REG_ERR_NULL_CHAR,
REG_ERR_NUM
};
/*
 * Public API, named after the POSIX regex functions.
 * NOTE(review): the implementations are in jimregexp.c; the one-line
 * summaries below are inferred from the signatures and the declarations in
 * this header, so confirm the details there before relying on them.
 */
/* Compile the pattern 'regex' into *preg using the 'cflags' bits; returns a status code. */
int regcomp(regex_t *preg, const char *regex, int cflags);
/* Match 'string' against *preg; up to 'nmatch' entries of pmatch[] receive the (sub)match offsets. */
int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
/* Produce a message for 'errcode' in errbuf (capacity errbuf_size). */
size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);
/* Release whatever regcomp() allocated for *preg (the struct has an allocated 'program' member). */
void regfree(regex_t *preg);
#ifdef __cplusplus
}
#endif
#endif

262
regexp/utf8.c Normal file
View File

@ -0,0 +1,262 @@
/**
* UTF-8 utility functions
*
* (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
*
* See LICENCE for licence details.
*/
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include "utf8.h"
/*
 * Encode code point 'uc' as UTF-8 at 'p' and return the number of bytes
 * stored (1-4).  No terminator is written.  This function is compiled
 * regardless of the UTF-8 configuration.
 */
int utf8_fromunicode(char *p, unsigned uc)
{
    /* Indexed by sequence length: marker bits and payload mask of the lead byte. */
    static const unsigned char lead_mark[5] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
    static const unsigned char lead_mask[5] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
    int nbytes;
    int i;

    if (uc <= 0x7f) {
        nbytes = 1;
    }
    else if (uc <= 0x7ff) {
        nbytes = 2;
    }
    else if (uc <= 0xffff) {
        nbytes = 3;
    }
    else {
        /* Anything above 21 bits (0x1fffff) is silently truncated by lead_mask. */
        nbytes = 4;
    }

    /* Fill the continuation bytes from the tail, six payload bits at a time. */
    for (i = nbytes - 1; i > 0; i--) {
        p[i] = (char)(0x80 | (uc & 0x3f));
        uc >>= 6;
    }
    /* What is left of uc is the lead byte's payload. */
    p[0] = (char)(lead_mark[nbytes] | (uc & lead_mask[nbytes]));
    return nbytes;
}
#if defined(USE_UTF8) && !defined(JIM_BOOTSTRAP)
/*
 * Length in bytes (1-4) of the UTF-8 sequence introduced by lead byte 'c'.
 * A byte that cannot start a multi-byte sequence counts as length 1.
 * Only the low eight bits of 'c' are inspected.
 */
int utf8_charlen(int c)
{
    const int top5 = c & 0xf8;

    if (top5 == 0xf0) {
        /* 11110xxx */
        return 4;
    }
    if ((top5 & 0xf0) == 0xe0) {
        /* 1110xxxx */
        return 3;
    }
    if ((top5 & 0xe0) == 0xc0) {
        /* 110xxxxx */
        return 2;
    }
    /* ASCII, a stray continuation byte, or an invalid lead byte */
    return 1;
}
/*
 * Count the characters in the first 'bytelen' bytes of 'str'.
 * A negative 'bytelen' means "use strlen(str)".  Each call to
 * utf8_tounicode() consumes one character, so invalid bytes are counted
 * one at a time.
 */
int utf8_strlen(const char *str, int bytelen)
{
    int count;
    int remaining = (bytelen < 0) ? (int)strlen(str) : bytelen;

    for (count = 0; remaining > 0; count++) {
        int codepoint;
        const int consumed = utf8_tounicode(str, &codepoint);

        str += consumed;
        remaining -= consumed;
    }
    return count;
}
/*
 * Sum of utf8_width() over the first 'charlen' characters of 'str'.
 * The loop is driven purely by 'charlen'; it does not look for a terminator.
 */
int utf8_strwidth(const char *str, int charlen)
{
    int total = 0;

    for (; charlen != 0; charlen--) {
        int codepoint;
        const int consumed = utf8_tounicode(str, &codepoint);

        total += utf8_width(codepoint);
        str += consumed;
    }
    return total;
}
/*
 * Byte offset of character number 'index' in 'str', found by stepping over
 * 'index' characters using the length implied by each lead byte.
 */
int utf8_index(const char *str, int index)
{
    const char *cursor;

    for (cursor = str; index != 0; index--) {
        cursor += utf8_charlen(*cursor);
    }
    return (int)(cursor - str);
}
/*
 * How many bytes before 'str' the previous character starts.
 *
 * Walks backwards from str[-1], inspecting at most len-1 bytes, and stops at
 * the first byte that is not a continuation byte (10xxxxxx).  If none is
 * found in that window the result is 'len'.  'len' must be > 0.
 */
int utf8_prev_len(const char *str, int len)
{
    int back = 1;
    int remaining;

    assert(len > 0);

    for (remaining = len - 1; remaining != 0; remaining--) {
        /* ASCII (0xxxxxxx) and lead bytes (11xxxxxx) both start a character. */
        if ((str[-back] & 0xc0) != 0x80) {
            break;
        }
        back++;
    }
    return back;
}
/*
 * Decode one UTF-8 sequence starting at 'str'.
 *
 * Stores the code point in *uc and returns the number of bytes consumed
 * (1-4).  A sequence that is truncated, has a bad continuation byte, or is
 * an overlong encoding is not decoded: the first byte alone is returned as
 * the code point and 1 is returned.
 *
 * Continuation bytes are tested left to right with short-circuiting, so a
 * null terminator stops the look-ahead.
 */
int utf8_tounicode(const char *str, int *uc)
{
    const unsigned char *b = (const unsigned char *)str;
    const int lead = b[0];
    int cp;

    if (lead < 0xc0) {
        /* ASCII or a stray continuation byte: falls through to the single-byte result */
    }
    else if (lead < 0xe0) {
        /* 110xxxxx 10xxxxxx */
        if ((b[1] & 0xc0) == 0x80) {
            cp = ((lead & 0x3f) << 6) | (b[1] & 0x3f);
            if (cp >= 0x80) {
                *uc = cp;
                return 2;
            }
        }
    }
    else if (lead < 0xf0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        if ((b[1] & 0xc0) == 0x80 && (b[2] & 0xc0) == 0x80) {
            cp = ((lead & 0x1f) << 12) | ((b[1] & 0x3f) << 6) | (b[2] & 0x3f);
            if (cp >= 0x800) {
                *uc = cp;
                return 3;
            }
        }
    }
    else if (lead < 0xf8) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        if ((b[1] & 0xc0) == 0x80 && (b[2] & 0xc0) == 0x80 && (b[3] & 0xc0) == 0x80) {
            cp = ((lead & 0x0f) << 18) | ((b[1] & 0x3f) << 12) | ((b[2] & 0x3f) << 6) | (b[3] & 0x3f);
            if (cp >= 0x10000) {
                *uc = cp;
                return 4;
            }
        }
    }

    /* Not a valid sequence: hand back the raw byte */
    *uc = lead;
    return 1;
}
/*
 * One entry of a generated case-mapping table: 'code' maps to 'altcode'.
 * The tables are searched with bsearch() keyed on 'code' (see cmp_casemap),
 * so they are presumably emitted sorted by 'code'.  Both fields are
 * unsigned short, which is why only 16-bit code points can be mapped.
 */
struct casemap {
unsigned short code; /* code point */
unsigned short altcode; /* alternate case code point */
};
/*
 * Half-open code point interval [lower, upper) used by the generated range
 * tables.  cmp_range() treats a code point as inside the range when
 * lower <= ch < upper.
 */
struct utf8range {
unsigned lower; /* lower inclusive */
unsigned upper; /* upper exclusive */
};
/* Generated mapping tables */
#include "_unicode_mapping.c"
/*
 * Number of elements in the array A (A must be a real array, not a pointer).
 * The whole expansion is parenthesized so the macro composes correctly inside
 * larger expressions such as `x / ARRAYSIZE(a)` or `i % ARRAYSIZE(a)`.
 */
#define ARRAYSIZE(A) (sizeof(A) / sizeof(*(A)))
/*
 * bsearch() comparator for case-mapping tables.
 * 'key' points to the int code point being looked up, 'cm' to a table entry.
 * Returns <0, 0 or >0 as the key sorts before, at or after the entry's code.
 */
static int cmp_casemap(const void *key, const void *cm)
{
    const int wanted = *(const int *)key;
    const struct casemap *entry = (const struct casemap *)cm;

    return wanted - (int)entry->code;
}
/*
 * Look 'ch' up in the 'num'-entry table 'mapping' and return its alternate
 * case code point.  Returns 'ch' unchanged when it has no entry or lies
 * above 0xffff (the tables only hold 16-bit code points).
 */
static int utf8_map_case(const struct casemap *mapping, int num, int ch)
{
    const struct casemap *hit;

    if (ch > 0xffff) {
        /* We only support 16 bit case mapping */
        return ch;
    }

    hit = bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap);
    return hit ? hit->altcode : ch;
}
/*
 * bsearch() comparator for range tables.
 * 'key' points to the code point, 'cm' to a struct utf8range.
 * Returns -1 below the range, 1 at or above its exclusive upper bound,
 * and 0 when the code point lies inside [lower, upper).
 */
static int cmp_range(const void *key, const void *cm)
{
    const unsigned codepoint = *(unsigned *)key;
    const struct utf8range *span = (const struct utf8range *)cm;

    if (codepoint < span->lower) {
        return -1;
    }
    return (codepoint >= span->upper) ? 1 : 0;
}
/*
 * Returns 1 if 'ch' falls inside any of the 'num' ranges in the table
 * 'range' (searched with bsearch), otherwise 0.
 */
static int utf8_in_range(const struct utf8range *range, int num, int ch)
{
    return bsearch(&ch, range, num, sizeof(*range), cmp_range) != NULL;
}
/*
 * Upper-case variant of code point 'ch'.
 * ASCII goes through the C library's toupper(); everything else is looked
 * up in the generated upper-case table and returned unchanged if absent.
 */
int utf8_upper(int ch)
{
    return isascii(ch)
        ? toupper(ch)
        : utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch);
}
/*
 * Lower-case variant of code point 'ch'.
 * ASCII goes through the C library's tolower(); everything else is looked
 * up in the generated lower-case table and returned unchanged if absent.
 */
int utf8_lower(int ch)
{
    return isascii(ch)
        ? tolower(ch)
        : utf8_map_case(unicode_case_mapping_lower, ARRAYSIZE(unicode_case_mapping_lower), ch);
}
/*
 * Title-case variant of code point 'ch'.
 *
 * ASCII, and any code point the title-case table leaves unchanged, is
 * delegated to utf8_upper().  When the table does supply a different value,
 * that value is returned - except that a mapped value of 0 yields 'ch'.
 */
int utf8_title(int ch)
{
    int mapped;

    if (isascii(ch)) {
        return utf8_upper(ch);
    }

    mapped = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
    if (mapped == ch) {
        /* No dedicated title-case form */
        return utf8_upper(ch);
    }
    return (mapped != 0) ? mapped : ch;
}
/*
 * Display width of code point 'ch': 0 for combining characters, 2 for wide
 * characters, 1 for everything else (including all of ASCII).  The combining
 * table is consulted before the wide table.
 */
int utf8_width(int ch)
{
    if (isascii(ch)) {
        return 1;
    }
    if (utf8_in_range(unicode_range_combining, ARRAYSIZE(unicode_range_combining), ch)) {
        return 0;
    }
    return utf8_in_range(unicode_range_wide, ARRAYSIZE(unicode_range_wide), ch) ? 2 : 1;
}
#endif /* JIM_BOOTSTRAP */

150
regexp/utf8.h Normal file
View File

@ -0,0 +1,150 @@
#ifndef UTF8_UTIL_H
#define UTF8_UTIL_H
#ifdef __cplusplus
extern "C" {
#endif
/**
* UTF-8 utility functions
*
* (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
*
* See LICENCE for licence details.
*/
#include <jim-config.h>
/* Currently we support unicode code points up to 2^21-1 (0x1fffff) */
#define MAX_UTF8_LEN 4
/**
 * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
 * and stores the result at 'p'.
 *
 * 'p' must have room for up to MAX_UTF8_LEN bytes; no null terminator
 * is written. Codepoints above 0x1fffff are silently truncated to 21 bits.
 *
 * Returns the number of bytes stored (1 to MAX_UTF8_LEN).
 */
int utf8_fromunicode(char *p, unsigned uc);
#ifndef JIM_UTF8
#include <ctype.h>
/* No utf-8 support. 1 byte = 1 char
 *
 * In this configuration the utf8_* names are plain macros with the same
 * call shape as the functions declared in the utf-8 build.
 * Because they are macros, arguments may be evaluated more than once
 * (utf8_strlen evaluates B twice) or not at all (utf8_charlen, utf8_prev_len
 * and utf8_width ignore their arguments; utf8_index ignores C), so do not
 * pass expressions with side effects.
 * utf8_strlen expands to a strlen() call; presumably the including file is
 * expected to have <string.h> in scope.
 */
#define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B))
#define utf8_strwidth(S, B) utf8_strlen((S), (B))
/* Stores the byte at S (as unsigned char) in *CP; the expression's value is 1 */
#define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
/* Stores C in *CP; the expression's value is 1 */
#define utf8_getchars(CP, C) (*(CP) = (C), 1)
#define utf8_upper(C) toupper(C)
#define utf8_title(C) toupper(C)
#define utf8_lower(C) tolower(C)
#define utf8_index(C, I) (I)
#define utf8_charlen(C) 1
#define utf8_prev_len(S, L) 1
#define utf8_width(C) 1
#else
#if !defined(JIM_BOOTSTRAP)
#define utf8_getchars utf8_fromunicode
/**
* Returns the length of the utf-8 sequence starting with 'c'.
*
* Returns 1-4.
* If 'c' is not a valid start byte, returns 1.
*/
int utf8_charlen(int c);
/**
* Returns the number of characters in the utf-8
* string of the given byte length.
*
* Any bytes which are not part of a valid utf-8
* sequence are treated as individual characters.
*
* The string *must* be null terminated.
*
* Does not support unicode code points > \u1fffff
*/
int utf8_strlen(const char *str, int bytelen);
/**
 * Calculates the display width of the first 'charlen' characters in 'str'.
 * See utf8_width()
 *
 * The scan is driven only by 'charlen' and does not stop at a null
 * terminator, so 'charlen' must not exceed the number of characters
 * in 'str'.
 */
int utf8_strwidth(const char *str, int charlen);
/**
 * Returns the byte index of the given character in the utf-8 string.
 *
 * The string *must* be null terminated.
 *
 * The scan advances by the length implied by each lead byte and does not
 * itself check for the terminator, so 'charindex' must not exceed the
 * number of characters in 'str'.
 *
 * This will return the byte length of a utf-8 string
 * if given the char length.
 */
int utf8_index(const char *str, int charindex);
/**
* Returns the unicode codepoint corresponding to the
* utf-8 sequence 'str'.
*
* Stores the result in *uc and returns the number of bytes
* consumed.
*
* If 'str' is null terminated, then an invalid utf-8 sequence
* at the end of the string will be returned as individual bytes.
*
* If it is not null terminated, the length *must* be checked first.
*
* Does not support unicode code points > \u1fffff
*/
int utf8_tounicode(const char *str, int *uc);
/**
 * Returns the number of bytes before 'str' at which the previous
 * utf-8 character sequence starts.
 *
 * Scans backwards from str[-1] for a start-of-character byte (an ASCII
 * byte or a multi-byte lead byte), examining at most len-1 bytes.
 * 'len' must be > 0.
 *
 * The result is always in the range 1..len. If no start byte is found
 * within the examined bytes, 'len' itself is returned, in which case the
 * position may be in the middle of a sequence.
 */
int utf8_prev_len(const char *str, int len);
/**
* Returns the upper-case variant of the given unicode codepoint.
*
* Unicode code points > \uffff are returned unchanged.
*/
int utf8_upper(int uc);
/**
* Returns the title-case variant of the given unicode codepoint.
*
* If none, returns utf8_upper().
*
* Unicode code points > \uffff are returned unchanged.
*/
int utf8_title(int uc);
/**
* Returns the lower-case variant of the given unicode codepoint.
*
* NOTE: Use utf8_upper() in preference for case-insensitive matching.
*
* Unicode code points > \uffff are returned unchanged.
*/
int utf8_lower(int uc);
/**
* Returns the width (in characters) of the given unicode codepoint.
* This is 1 for normal letters and 0 for combining characters and 2 for wide characters.
*/
int utf8_width(int ch);
#endif /* JIM_BOOTSTRAP */
#endif
#ifdef __cplusplus
}
#endif
#endif