1
0
Fork 0
mirror of git://git.gnupg.org/gnupg.git synced 2025-07-02 22:46:30 +02:00

gpg: Add regular expression support.

* AUTHORS, COPYING.other: Update.
* Makefile.am (SUBDIRS): Add regexp sub directory.
* configure.ac (DISABLE_REGEX): Remove.
* g10/Makefile.am (needed_libs): Add libregexp.a.
* g10/trustdb.c: Remove DISABLE_REGEX support.
* regexp/LICENSE, regexp/jimregexp.c, regexp/jimregexp.h,
  regexp/utf8.c, regexp/utf8.h: New from Jim Tcl.
* regexp/UnicodeData.txt: New from Unicode.
* regexp/Makefile.am, regexp/parse-unidata.awk: New.
* tests/openpgp/Makefile.am: Remove DISABLE_REGEX support.
* tools/Makefile.am: Remove DISABLE_REGEX support.

GnuPG-bug-id: 4843
Signed-off-by: NIIBE Yutaka <gniibe@fsij.org>
This commit is contained in:
NIIBE Yutaka 2020-04-03 15:30:08 +09:00
parent 61c5b0767f
commit ba247a114c
17 changed files with 36313 additions and 94 deletions

45
regexp/LICENSE Normal file
View file

@ -0,0 +1,45 @@
Unless explicitly stated, all files within Jim repository are released
under following license:
/* Jim - A small embeddable Tcl interpreter
*
* Copyright 2005 Salvatore Sanfilippo <antirez@invece.org>
* Copyright 2005 Clemens Hintze <c.hintze@gmx.net>
* Copyright 2005 patthoyts - Pat Thoyts <patthoyts@users.sf.net>
* Copyright 2008 oharboe - Øyvind Harboe - oyvind.harboe@zylin.com
* Copyright 2008 Andrew Lunn <andrew@lunn.ch>
* Copyright 2008 Duane Ellis <openocd@duaneellis.com>
* Copyright 2008 Uwe Klein <uklein@klein-messgeraete.de>
* Copyright 2008 Steve Bennett <steveb@workware.net.au>
* Copyright 2009 Nico Coesel <ncoesel@dealogic.nl>
* Copyright 2009 Zachary T Welch zw@superlucidity.net
* Copyright 2009 David Brownell
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the Jim Tcl Project.
*/

38
regexp/Makefile.am Normal file
View file

@ -0,0 +1,38 @@
# Makefile for the regexp library (libregexp.a)
# Copyright (C) 2020 g10 Code GmbH
#
# This file is part of GnuPG.
#
# GnuPG is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GnuPG is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <https://www.gnu.org/licenses/>.
## Process this file with automake to produce Makefile.in
# Static convenience library: "noinst" means it is built but not installed.
noinst_LIBRARIES = libregexp.a

# utf8.c is gated on USE_UTF8 and utf8.h on JIM_UTF8, so both are defined.
AM_CPPFLAGS = -DJIM_REGEXP -DJIM_UTF8 -DUSE_UTF8
AM_CFLAGS =

libregexp_a_SOURCES = jimregexp.h utf8.h jimregexp.c utf8.c
libregexp_a_CFLAGS = $(AM_CFLAGS)

# _unicode_mapping.c is #included by utf8.c.  It is distributed
# pre-generated, together with the inputs needed to regenerate it.
EXTRA_DIST = parse-unidata.awk UnicodeData.txt _unicode_mapping.c

# The case-mapping table is only regenerated in maintainer mode.
if MAINTAINER_MODE
BUILT_SOURCES = _unicode_mapping.c
MAINTAINERCLEANFILES = _unicode_mapping.c

# The recipe line below must start with a TAB character.
_unicode_mapping.c: parse-unidata.awk UnicodeData.txt
	$(AWK) -f $(srcdir)/parse-unidata.awk $(srcdir)/UnicodeData.txt >$@
endif

33797
regexp/UnicodeData.txt Normal file

File diff suppressed because it is too large Load diff

1909
regexp/jimregexp.c Normal file

File diff suppressed because it is too large Load diff

109
regexp/jimregexp.h Normal file
View file

@ -0,0 +1,109 @@
#ifndef JIMREGEXP_H
#define JIMREGEXP_H
/** regexp(3)-compatible regular expression implementation for Jim.
*
* See jimregexp.c for details
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
/*
 * Span of one (sub)match, named after the POSIX regmatch_t members.
 * NOTE(review): presumably rm_so is the start offset and rm_eo the offset
 * just past the end of the match, as in POSIX regexec() -- confirm in
 * jimregexp.c.
 */
typedef struct {
int rm_so;
int rm_eo;
} regmatch_t;
/*
* The "internal use only" fields in regexp.h are present to pass info from
* compile to execute that permits the execute phase to run lots faster on
* simple cases. They are:
*
* regstart char that must begin a match; '\0' if none obvious
* reganch is the match anchored (at beginning-of-line only)?
* regmust string (pointer into program) that match must include, or NULL
* regmlen length of regmust string
*
* Regstart and reganch permit very fast decisions on suitable starting points
* for a match, cutting down the work a lot. Regmust permits fast rejection
* of lines that cannot possibly match. The regmust tests are costly enough
* that regcomp() supplies a regmust only if the r.e. contains something
* potentially expensive (at present, the only such thing detected is * or +
* at the start of the r.e., which can involve a lot of backup). Regmlen is
* supplied because the test in regexec() needs it and regcomp() is computing
* it anyway.
*/
/*
 * Compiled regular expression object (exposed to callers as regex_t).
 *
 * Besides the compiled program it also carries the scratch state of the
 * compile phase and of the exec phase; see the section markers below.
 *
 * NOTE(review): regmust is declared as an int here although the notes
 * preceding this struct describe it as a pointer into the program --
 * presumably it is an index into program[]; confirm in jimregexp.c.
 */
struct regexp {
/* -- public -- */
int re_nsub; /* number of parenthesized subexpressions */
/* -- private -- */
int cflags; /* Flags used when compiling */
int err; /* Any error which occurred during compile */
int regstart; /* Internal use only. */
int reganch; /* Internal use only. */
int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
int *program; /* Allocated */
/* working state - compile */
const char *regparse; /* Input-scan pointer. */
int p; /* Current output pos in program */
int proglen; /* Allocated program size */
/* working state - exec */
int eflags; /* Flags used when executing */
const char *start; /* Initial string pointer. */
const char *reginput; /* Current input pointer. */
const char *regbol; /* Beginning of input, for ^ check. */
/* Input to regexec() */
regmatch_t *pmatch; /* submatches will be stored here */
int nmatch; /* size of pmatch[] */
};
/* POSIX-style name for the object above. */
typedef struct regexp regex_t;
/*
 * Flag values named after their POSIX <regex.h> counterparts.
 * REG_EXTENDED is 0, so OR-ing it into a flag word changes nothing.
 * NOTE(review): presumably REG_NEWLINE and REG_ICASE are compile flags
 * (cflags) and REG_NOTBOL is an exec flag (eflags), as in POSIX -- confirm
 * in jimregexp.c.
 */
#define REG_EXTENDED 0
#define REG_NEWLINE 1
#define REG_ICASE 2
#define REG_NOTBOL 16
/*
 * Result and error codes.
 *
 * No enumerator has an explicit value, so they number sequentially from 0
 * in declaration order: inserting or reordering entries changes the value
 * of everything after that point.  REG_ERR_NUM is last and therefore equals
 * the number of codes declared before it.
 */
enum {
REG_NOERROR, /* Success. */
REG_NOMATCH, /* Didn't find a match (for regexec). */
REG_BADPAT, /* >= REG_BADPAT is an error */
REG_ERR_NULL_ARGUMENT,
REG_ERR_UNKNOWN,
REG_ERR_TOO_BIG,
REG_ERR_NOMEM,
REG_ERR_TOO_MANY_PAREN,
REG_ERR_UNMATCHED_PAREN,
REG_ERR_UNMATCHED_BRACES,
REG_ERR_BAD_COUNT,
REG_ERR_JUNK_ON_END,
REG_ERR_OPERAND_COULD_BE_EMPTY,
REG_ERR_NESTED_COUNT,
REG_ERR_INTERNAL,
REG_ERR_COUNT_FOLLOWS_NOTHING,
REG_ERR_TRAILING_BACKSLASH,
REG_ERR_CORRUPTED,
REG_ERR_NULL_CHAR,
REG_ERR_NUM /* sentinel: count of the codes above */
};
/*
 * Entry points with the names and parameter shapes of regcomp(3) and
 * friends, operating on regex_t (struct regexp); implemented in jimregexp.c.
 *
 * Unlike regerror() and regfree()'s POSIX siblings, regexec() here takes a
 * non-const preg, which is consistent with struct regexp holding exec-time
 * working state.
 *
 * NOTE(review): the int results are presumably the REG_* codes from the
 * enum in this header -- confirm in jimregexp.c.
 */
int regcomp(regex_t *preg, const char *regex, int cflags);
int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);
void regfree(regex_t *preg);
#ifdef __cplusplus
}
#endif
#endif

62
regexp/parse-unidata.awk Normal file
View file

@ -0,0 +1,62 @@
#
# parse-unidata.awk - generate a table (unicode_case_mapping_upper)
#
# Copyright (C) 2020 g10 Code GmbH
#
# This file is part of GnuPG.
#
# GnuPG is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GnuPG is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
# Parse the unicode data from:
# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
# to generate case mapping table
# Runs once before any input: configure field splitting, reset the entry
# counter and emit the opening lines of the generated C table.
BEGIN {
    # UnicodeData.txt records are semicolon-separated.
    FS = ";"
    # Number of table entries printed so far (drives line wrapping).
    count = 0

    print "/* Generated from UnicodeData.txt */"
    print ""
    print "static const struct casemap unicode_case_mapping_upper[] = {"
}
# Per-record rule: print one "{ code, uppercase }," initializer for every
# letter in the range 0x80..0xffff that has a simple uppercase mapping.
# Entries are printed four to a line.
{
    # Field 1 is the code point in hex (strtonum() is a gawk extension).
    code = strtonum("0x" $1)
    # Field 3 is the general category; letters start with "L".
    class = $3
    # Field 13 is the simple uppercase mapping (empty when there is none).
    upper = $13

    # Skip ASCII, code points that do not fit in 16 bits, and non-letters.
    if (code <= 0x7f || code > 0xffff || class !~ /^L/) {
        next
    }

    if (upper != "") {
        # Pass the data as arguments, never as part of the format string.
        printf("\t{ 0x%s, 0x%s },", tolower($1), tolower(upper))
        count++
        if ((count % 4) == 0) {
            print("")
        }
    }
}
# Runs once after the last record: finish the current output line and
# close the array initializer.
END {
    print "\n};"
}

150
regexp/utf8.c Normal file
View file

@ -0,0 +1,150 @@
/**
* UTF-8 utility functions
*
* (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
*
* See LICENSE for licence details.
*/
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include "utf8.h"
/* This one is always implemented */
/**
 * Encode the code point 'uc' as UTF-8 into the buffer at 'p'.
 *
 * Stores between 1 and 4 bytes (no terminating NUL is written) and
 * returns how many bytes were stored.  Only the low 21 bits of 'uc' are
 * encoded; any higher bits are silently discarded.
 */
int utf8_fromunicode(char *p, unsigned uc)
{
    if (uc > 0xffff) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        p[0] = 0xf0 | ((uc >> 18) & 0x07);
        p[1] = 0x80 | ((uc >> 12) & 0x3f);
        p[2] = 0x80 | ((uc >> 6) & 0x3f);
        p[3] = 0x80 | (uc & 0x3f);
        return 4;
    }

    if (uc > 0x7ff) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        p[0] = 0xe0 | ((uc >> 12) & 0x0f);
        p[1] = 0x80 | ((uc >> 6) & 0x3f);
        p[2] = 0x80 | (uc & 0x3f);
        return 3;
    }

    if (uc > 0x7f) {
        /* 110xxxxx 10xxxxxx */
        p[0] = 0xc0 | ((uc >> 6) & 0x1f);
        p[1] = 0x80 | (uc & 0x3f);
        return 2;
    }

    /* Plain 7-bit value: stored as is. */
    p[0] = uc;
    return 1;
}
#if defined(USE_UTF8) && !defined(JIM_BOOTSTRAP)
/**
 * Byte length of the UTF-8 sequence introduced by the lead byte 'c'.
 *
 * Only the low eight bits of 'c' are inspected, so a negative value
 * obtained from a signed char gives the same answer as the unsigned byte.
 * Bytes that cannot start a sequence (continuation bytes and 0xf8..0xff)
 * are counted as a single byte.
 */
int utf8_charlen(int c)
{
    const unsigned bits = (unsigned)c;

    switch (bits & 0xf0) {
    case 0xc0:
    case 0xd0:
        /* 110xxxxx */
        return 2;
    case 0xe0:
        /* 1110xxxx */
        return 3;
    case 0xf0:
        /* 11110xxx is a four byte lead; 11111xxx is invalid */
        return (bits & 0x08) ? 1 : 4;
    default:
        /* ASCII, or a stray continuation byte */
        return 1;
    }
}
/**
 * Convert a character index into a byte offset within 'str'.
 *
 * Steps over 'index' characters, sizing each one from its lead byte via
 * utf8_charlen(), and returns the number of bytes covered.  The walk does
 * not look for the terminating NUL, so 'index' must not exceed the number
 * of characters in 'str'.
 */
int utf8_index(const char *str, int index)
{
    int offset = 0;

    for (; index != 0; index--) {
        offset += utf8_charlen(str[offset]);
    }
    return offset;
}
/**
 * Decode a single code point from the UTF-8 text at 'str'.
 *
 * On success the code point is stored in *uc and the length of the
 * sequence (1 to 4) is returned.  When the bytes do not form a valid
 * sequence -- stray continuation byte, missing or bad continuation,
 * overlong encoding, or a lead byte of 0xf8 and above -- the first byte
 * itself is stored in *uc and 1 is returned.
 *
 * Continuation bytes are examined strictly left to right and the scan
 * stops at the first one that does not fit, so a NUL terminator is never
 * stepped over.
 */
int utf8_tounicode(const char *str, int *uc)
{
    const unsigned char *b = (const unsigned char *)str;
    const int lead = b[0];
    int cp;

    /* ASCII or a stray continuation byte: hand back the byte itself. */
    if (lead < 0xc0) {
        *uc = lead;
        return 1;
    }

    if (lead < 0xe0) {
        /* 110xxxxx 10xxxxxx */
        if ((b[1] & 0xc0) == 0x80) {
            cp = ((lead & 0x1f) << 6) | (b[1] & 0x3f);
            /* Values below 0x80 here would be an overlong encoding. */
            if (cp >= 0x80) {
                *uc = cp;
                return 2;
            }
        }
    }
    else if (lead < 0xf0) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        if ((b[1] & 0xc0) == 0x80 && (b[2] & 0xc0) == 0x80) {
            cp = ((lead & 0x0f) << 12) | ((b[1] & 0x3f) << 6) | (b[2] & 0x3f);
            if (cp >= 0x800) {
                *uc = cp;
                return 3;
            }
        }
    }
    else if (lead < 0xf8) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        if ((b[1] & 0xc0) == 0x80 && (b[2] & 0xc0) == 0x80 && (b[3] & 0xc0) == 0x80) {
            cp = ((lead & 0x07) << 18) | ((b[1] & 0x3f) << 12) | ((b[2] & 0x3f) << 6) | (b[3] & 0x3f);
            if (cp >= 0x10000) {
                *uc = cp;
                return 4;
            }
        }
    }

    /* Anything else is invalid: report the lead byte on its own. */
    *uc = lead;
    return 1;
}
/*
 * One entry of a case-mapping table: 'code' is the code point looked up,
 * 'altcode' its other-case counterpart.  Both members are unsigned short,
 * which is why only 16-bit code points are supported by the lookup.
 *
 * Member order matters: parse-unidata.awk emits positional
 * "{ code, altcode }" initializers for the table included below.
 */
struct casemap {
unsigned short code; /* code point */
unsigned short altcode; /* alternate case code point */
};
/* Generated mapping tables */
#include "_unicode_mapping.c"
/*
 * Number of elements in the array A.  A must be a real array, not a
 * pointer.  The expansion is wrapped in parentheses so that it stays one
 * operand when used inside a larger expression such as x / ARRAYSIZE(a).
 */
#define ARRAYSIZE(A) (sizeof(A) / sizeof(*(A)))
/*
 * bsearch() comparator for a table of struct casemap.
 *
 * 'key' points to the int code point being searched for, 'cm' to the
 * table entry under test.  Returns a negative value, zero or a positive
 * value when the key sorts before, equal to or after the entry's code.
 */
static int cmp_casemap(const void *key, const void *cm)
{
    /* Keep the const qualifier of both arguments when converting. */
    const int wanted = *(const int *)key;
    const int have = ((const struct casemap *)cm)->code;

    /* Compare instead of subtracting so that no key value can overflow. */
    return (wanted > have) - (wanted < have);
}
/*
 * Look 'ch' up in the case-mapping table 'mapping' of 'num' entries.
 *
 * The table is searched with bsearch() on the 'code' member.  Returns the
 * entry's 'altcode' when 'ch' is found; otherwise, and for every code
 * point above 0xffff, returns 'ch' unchanged.
 */
static int utf8_map_case(const struct casemap *mapping, int num, int ch)
{
    const struct casemap *hit;

    /* We only support 16 bit case mapping */
    if (ch > 0xffff) {
        return ch;
    }

    hit = bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap);
    return hit ? hit->altcode : ch;
}
/**
 * Return the upper-case form of the code point 'ch'.
 *
 * 7-bit values are handled by toupper(); everything else is looked up in
 * unicode_case_mapping_upper.  Code points without an entry in the table,
 * and all code points above 0xffff, are returned unchanged.
 */
int utf8_upper(int ch)
{
    /*
     * Explicit range test rather than isascii(): that function is not
     * part of ISO C and is marked obsolescent by POSIX, so it may have no
     * declaration in strict standard modes.
     */
    if (ch >= 0 && ch <= 0x7f) {
        return toupper(ch);
    }
    return utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch);
}
#endif /* JIM_BOOTSTRAP */

90
regexp/utf8.h Normal file
View file

@ -0,0 +1,90 @@
#ifndef UTF8_UTIL_H
#define UTF8_UTIL_H
#ifdef __cplusplus
extern "C" {
#endif
/**
* UTF-8 utility functions
*
* (c) 2010-2016 Steve Bennett <steveb@workware.net.au>
*
* See LICENSE for licence details.
*/
/* Longest encoding handled, in bytes. Four bytes carry 21 payload bits,
 * i.e. unicode points up to 2^21-1 (0x1fffff). */
#define MAX_UTF8_LEN 4
/**
 * Converts the given unicode codepoint (0 - 0x1fffff) to utf-8
 * and stores the result at 'p'. No terminating NUL is stored.
 * Bits above 0x1fffff are silently discarded.
 *
 * Returns the number of bytes stored (1 up to MAX_UTF8_LEN).
 */
int utf8_fromunicode(char *p, unsigned uc);
#ifndef JIM_UTF8
#include <ctype.h>
/* No utf-8 support. 1 byte = 1 char.
 * Every macro below treats one byte as one character and, where a length
 * is produced, that length is always 1. */
/* Store the byte at S (as unsigned char) in *CP; evaluates to 1. */
#define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
/* Store the value C in *CP; evaluates to 1. */
#define utf8_getchars(CP, C) (*(CP) = (C), 1)
/* Upper-casing is plain toupper(). */
#define utf8_upper(C) toupper(C)
/* Character index and byte index coincide; the string C is not examined. */
#define utf8_index(C, I) (I)
/* Every character is one byte long; C is not examined. */
#define utf8_charlen(C) 1
#else
#if !defined(JIM_BOOTSTRAP)
/* With utf-8 support, utf8_getchars is simply another name for
 * utf8_fromunicode(). */
#define utf8_getchars utf8_fromunicode
/**
 * Returns the length of the utf-8 sequence starting with 'c'.
 *
 * Returns 1-4.
 * If 'c' is not a valid start byte, returns 1.
 */
int utf8_charlen(int c);
/**
 * Returns the byte index of the given character in the utf-8 string.
 *
 * The string *must* be null terminated, and 'charindex' must not be
 * larger than the number of characters in it: the scan advances by the
 * length implied by each lead byte and does not stop at the terminator.
 *
 * This will return the byte length of a utf-8 string
 * if given the char length.
 */
int utf8_index(const char *str, int charindex);
/**
 * Returns the unicode codepoint corresponding to the
 * utf-8 sequence 'str'.
 *
 * Stores the result in *uc and returns the number of bytes
 * consumed.
 *
 * An invalid sequence (including an overlong encoding) is returned
 * as its first byte, with a consumed length of 1.
 *
 * If 'str' is null terminated, then an invalid utf-8 sequence
 * at the end of the string will be returned as individual bytes.
 *
 * If it is not null terminated, the length *must* be checked first.
 *
 * Does not support unicode code points > \u1fffff
 */
int utf8_tounicode(const char *str, int *uc);
/**
 * Returns the upper-case variant of the given unicode codepoint.
 *
 * Code points with no upper-case mapping are returned unchanged.
 * Unicode code points > \uffff are returned unchanged.
 */
int utf8_upper(int uc);
#endif /* JIM_BOOTSTRAP */
#endif
#ifdef __cplusplus
}
#endif
#endif