1
0
mirror of git://git.gnupg.org/gnupg.git synced 2024-12-31 11:41:32 +01:00
gnupg/jnlib/utf8conv.c
Werner Koch c0c2c58054 Finished the bulk of changes for gnupg 1.9. This included switching
to libgcrypt functions, using shared error codes from libgpg-error,
replacing the old functions we used to have in ../util by those in
../jnlib and ../common, renaming the malloc functions and a couple of
types.  Note, that not all changes are listed below becuause they are
too similar and done at far too many places.  As of today the code
builds using the current libgcrypt from CVS but it is very unlikely
that it actually works.
2003-06-18 19:56:13 +00:00

449 lines
10 KiB
C

/* utf8conf.c - UTF8 character set conversion
* Copyright (C) 1994, 1998, 1999, 2000, 2001,
* 2003 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
* GnuPG is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* GnuPG is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*/
#include <config.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <ctype.h>
#ifdef HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif
#include "libjnlib-config.h"
#include "stringhelp.h"
#include "utf8conv.h"
static ushort koi8_unicode[128] = {
0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
};
static ushort latin2_unicode[128] = {
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
};
static const char *active_charset_name = "iso-8859-1";
static ushort *active_charset = NULL;
static int no_translation = 0;
int
set_native_charset (const char *newset)
{
if (!newset)
#ifdef HAVE_LANGINFO_CODESET
newset = nl_langinfo (CODESET);
#else
newset = "8859-1";
#endif
if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
{
newset += 3;
if (*newset == '-' || *newset == '_')
newset++;
}
if (!*newset
|| !ascii_strcasecmp (newset, "8859-1")
|| !ascii_strcasecmp (newset, "8859-15"))
{
active_charset_name = "iso-8859-1";
no_translation = 0;
active_charset = NULL;
}
else if (!ascii_strcasecmp (newset, "8859-2"))
{
active_charset_name = "iso-8859-2";
no_translation = 0;
active_charset = latin2_unicode;
}
else if (!ascii_strcasecmp (newset, "koi8-r"))
{
active_charset_name = "koi8-r";
no_translation = 0;
active_charset = koi8_unicode;
}
else if (!ascii_strcasecmp (newset, "utf8")
|| !ascii_strcasecmp (newset, "utf-8"))
{
active_charset_name = "utf-8";
no_translation = 1;
active_charset = NULL;
}
else
return -1;
return 0;
}
const char *
get_native_charset ()
{
return active_charset_name;
}
/****************
* Convert string, which is in native encoding to UTF8 and return the
* new allocated UTF8 string.
*/
char *
native_to_utf8 (const char *string)
{
const byte *s;
char *buffer;
byte *p;
size_t length = 0;
if (no_translation)
{
buffer = jnlib_xstrdup (string);
}
else if (active_charset)
{
for (s = string; *s; s++)
{
length++;
if (*s & 0x80)
length += 2; /* we may need 3 bytes */
}
buffer = jnlib_xmalloc (length + 1);
for (p = buffer, s = string; *s; s++)
{
if ((*s & 0x80))
{
ushort val = active_charset[*s & 0x7f];
if (val < 0x0800)
{
*p++ = 0xc0 | ((val >> 6) & 0x1f);
*p++ = 0x80 | (val & 0x3f);
}
else
{
*p++ = 0xe0 | ((val >> 12) & 0x0f);
*p++ = 0x80 | ((val >> 6) & 0x3f);
*p++ = 0x80 | (val & 0x3f);
}
}
else
*p++ = *s;
}
*p = 0;
}
else
{
for (s = string; *s; s++)
{
length++;
if (*s & 0x80)
length++;
}
buffer = jnlib_xmalloc (length + 1);
for (p = buffer, s = string; *s; s++)
{
if (*s & 0x80)
{
*p++ = 0xc0 | ((*s >> 6) & 3);
*p++ = 0x80 | (*s & 0x3f);
}
else
*p++ = *s;
}
*p = 0;
}
return buffer;
}
/* Convert string, which is in UTF8 to native encoding. Replace
* illegal encodings by some "\xnn" and quote all control
* characters. A character with value DELIM will always be quoted, it
* must be a vanilla ASCII character. */
char *
utf8_to_native (const char *string, size_t length, int delim)
{
int nleft;
int i;
byte encbuf[8];
int encidx;
const byte *s;
size_t n;
byte *buffer = NULL, *p = NULL;
unsigned long val = 0;
size_t slen;
int resync = 0;
/* 1. pass (p==NULL): count the extended utf-8 characters */
/* 2. pass (p!=NULL): create string */
for (;;)
{
for (slen = length, nleft = encidx = 0, n = 0, s = string; slen;
s++, slen--)
{
if (resync)
{
if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
{
/* still invalid */
if (p)
{
sprintf (p, "\\x%02x", *s);
p += 4;
}
n += 4;
continue;
}
resync = 0;
}
if (!nleft)
{
if (!(*s & 0x80))
{ /* plain ascii */
if (*s < 0x20 || *s == 0x7f || *s == delim ||
(delim && *s == '\\'))
{
n++;
if (p)
*p++ = '\\';
switch (*s)
{
case '\n':
n++;
if (p)
*p++ = 'n';
break;
case '\r':
n++;
if (p)
*p++ = 'r';
break;
case '\f':
n++;
if (p)
*p++ = 'f';
break;
case '\v':
n++;
if (p)
*p++ = 'v';
break;
case '\b':
n++;
if (p)
*p++ = 'b';
break;
case 0:
n++;
if (p)
*p++ = '0';
break;
default:
n += 3;
if (p)
{
sprintf (p, "x%02x", *s);
p += 3;
}
break;
}
}
else
{
if (p)
*p++ = *s;
n++;
}
}
else if ((*s & 0xe0) == 0xc0)
{ /* 110x xxxx */
val = *s & 0x1f;
nleft = 1;
encidx = 0;
encbuf[encidx++] = *s;
}
else if ((*s & 0xf0) == 0xe0)
{ /* 1110 xxxx */
val = *s & 0x0f;
nleft = 2;
encidx = 0;
encbuf[encidx++] = *s;
}
else if ((*s & 0xf8) == 0xf0)
{ /* 1111 0xxx */
val = *s & 0x07;
nleft = 3;
encidx = 0;
encbuf[encidx++] = *s;
}
else if ((*s & 0xfc) == 0xf8)
{ /* 1111 10xx */
val = *s & 0x03;
nleft = 4;
encidx = 0;
encbuf[encidx++] = *s;
}
else if ((*s & 0xfe) == 0xfc)
{ /* 1111 110x */
val = *s & 0x01;
nleft = 5;
encidx = 0;
encbuf[encidx++] = *s;
}
else
{ /* invalid encoding: print as \xnn */
if (p)
{
sprintf (p, "\\x%02x", *s);
p += 4;
}
n += 4;
resync = 1;
}
}
else if (*s < 0x80 || *s >= 0xc0)
{ /* invalid */
if (p)
{
for (i = 0; i < encidx; i++)
{
sprintf (p, "\\x%02x", encbuf[i]);
p += 4;
}
sprintf (p, "\\x%02x", *s);
p += 4;
}
n += 4 + 4 * encidx;
nleft = 0;
encidx = 0;
resync = 1;
}
else
{
encbuf[encidx++] = *s;
val <<= 6;
val |= *s & 0x3f;
if (!--nleft)
{ /* ready */
if (no_translation)
{
if (p)
{
for (i = 0; i < encidx; i++)
*p++ = encbuf[i];
}
n += encidx;
encidx = 0;
}
else if (active_charset)
{ /* table lookup */
for (i = 0; i < 128; i++)
{
if (active_charset[i] == val)
break;
}
if (i < 128)
{ /* we can print this one */
if (p)
*p++ = i + 128;
n++;
}
else
{ /* we do not have a translation: print utf8 */
if (p)
{
for (i = 0; i < encidx; i++)
{
sprintf (p, "\\x%02x", encbuf[i]);
p += 4;
}
}
n += encidx * 4;
encidx = 0;
}
}
else
{ /* native set */
if (val >= 0x80 && val < 256)
{
n++; /* we can simply print this character */
if (p)
*p++ = val;
}
else
{ /* we do not have a translation: print utf8 */
if (p)
{
for (i = 0; i < encidx; i++)
{
sprintf (p, "\\x%02x", encbuf[i]);
p += 4;
}
}
n += encidx * 4;
encidx = 0;
}
}
}
}
}
if (!buffer)
{ /* allocate the buffer after the first pass */
buffer = p = jnlib_xmalloc (n + 1);
}
else
{
*p = 0; /* make a string */
return buffer;
}
}
}