mirror of
git://git.gnupg.org/gnupg.git
synced 2025-07-03 22:56:33 +02:00
Added iconv support and doc cleanups.
This commit is contained in:
parent
c9f8a69f0f
commit
2db8df0ba3
12 changed files with 487 additions and 480 deletions
|
@ -1,3 +1,9 @@
|
|||
2006-09-22 Werner Koch <wk@g10code.com>
|
||||
|
||||
* utf8conv.c: Reworked to match the gnupg 1.4.5 code. This now
|
||||
requires iconv support but this is reasonable for all modern
|
||||
systems.
|
||||
|
||||
2006-08-29 Werner Koch <wk@g10code.com>
|
||||
|
||||
* logging.c (do_logv): Emit a missing LF for fatal errors.
|
||||
|
|
576
jnlib/utf8conv.c
576
jnlib/utf8conv.c
|
@ -28,101 +28,225 @@
|
|||
#ifdef HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif
#include <errno.h>
#include <iconv.h>
#ifdef HAVE_W32_SYSTEM
#include <windows.h>
#endif

#include "libjnlib-config.h"
#include "stringhelp.h"
#include "utf8conv.h"
|
||||
|
||||
|
||||
static ushort koi8_unicode[128] = {
|
||||
0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
|
||||
0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
|
||||
0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
|
||||
0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
|
||||
0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
|
||||
0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
|
||||
0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
|
||||
0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
|
||||
0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
|
||||
0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
|
||||
0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
|
||||
0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
|
||||
0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
|
||||
0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
|
||||
0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
|
||||
0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
|
||||
};
|
||||
|
||||
static ushort latin2_unicode[128] = {
|
||||
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
|
||||
0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
|
||||
0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
|
||||
0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
|
||||
0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
|
||||
0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
|
||||
0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
|
||||
0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
|
||||
0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
|
||||
0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
|
||||
0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
|
||||
0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
|
||||
0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
|
||||
0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
|
||||
};
|
||||
|
||||
#ifndef MB_LEN_MAX
|
||||
#define MB_LEN_MAX 16
|
||||
#endif
|
||||
|
||||
static const char *active_charset_name = "iso-8859-1";
|
||||
static ushort *active_charset = NULL;
|
||||
static int no_translation = 0;
|
||||
static unsigned short *active_charset;
|
||||
static int no_translation; /* Set to true if we let simply pass through. */
|
||||
static int use_iconv; /* True if the iconv conversion functions are required. */
|
||||
|
||||
|
||||
|
||||
/* Error handler for iconv failures. This is needed to not clutter the
|
||||
output with repeated diagnostics about a missing conversion. */
|
||||
static void
|
||||
handle_iconv_error (const char *to, const char *from, int use_fallback)
|
||||
{
|
||||
if (errno == EINVAL)
|
||||
{
|
||||
static int shown1, shown2;
|
||||
int x;
|
||||
|
||||
if (to && !strcmp (to, "utf-8"))
|
||||
{
|
||||
x = shown1;
|
||||
shown1 = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
x = shown2;
|
||||
shown2 = 1;
|
||||
}
|
||||
|
||||
if (!x)
|
||||
log_info (_("conversion from `%s' to `%s' not available\n"),
|
||||
from, to);
|
||||
}
|
||||
else
|
||||
{
|
||||
static int shown;
|
||||
|
||||
if (!shown)
|
||||
log_info (_("iconv_open failed: %s\n"), strerror (errno));
|
||||
shown = 1;
|
||||
}
|
||||
|
||||
if (use_fallback)
|
||||
{
|
||||
/* To avoid further error messages we fallback to Latin-1 for the
|
||||
native encoding. This is justified as one can expect that on a
|
||||
utf-8 enabled system nl_langinfo() will work and thus we won't
|
||||
never get to here. Thus Latin-1 seems to be a reasonable
|
||||
default. */
|
||||
active_charset_name = "iso-8859-1";
|
||||
no_translation = 0;
|
||||
active_charset = NULL;
|
||||
use_iconv = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
set_native_charset (const char *newset)
|
||||
{
|
||||
if (!newset)
|
||||
#ifdef HAVE_LANGINFO_CODESET
|
||||
newset = nl_langinfo (CODESET);
|
||||
#else
|
||||
newset = "8859-1";
|
||||
#endif
|
||||
const char *full_newset;
|
||||
|
||||
if (!newset)
|
||||
{
|
||||
#ifdef HABE_W32_SYSTEM
|
||||
static char codepage[30];
|
||||
unsigned int cpno;
|
||||
const char *aliases;
|
||||
|
||||
/* We are a console program thus we need to use the
|
||||
GetConsoleOutputCP function and not the the GetACP which
|
||||
would give the codepage for a GUI program. Note this is not
|
||||
a bulletproof detection because GetConsoleCP might return a
|
||||
different one for console input. Not sure how to cope with
|
||||
that. If the console Code page is not known we fall back to
|
||||
the system code page. */
|
||||
cpno = GetConsoleOutputCP ();
|
||||
if (!cpno)
|
||||
cpno = GetACP ();
|
||||
sprintf (codepage, "CP%u", cpno );
|
||||
/* Resolve alias. We use a long string string and not the usual
|
||||
array to optimize if the code is taken to a DSO. Taken from
|
||||
libiconv 1.9.2. */
|
||||
newset = codepage;
|
||||
for (aliases = ("CP936" "\0" "GBK" "\0"
|
||||
"CP1361" "\0" "JOHAB" "\0"
|
||||
"CP20127" "\0" "ASCII" "\0"
|
||||
"CP20866" "\0" "KOI8-R" "\0"
|
||||
"CP21866" "\0" "KOI8-RU" "\0"
|
||||
"CP28591" "\0" "ISO-8859-1" "\0"
|
||||
"CP28592" "\0" "ISO-8859-2" "\0"
|
||||
"CP28593" "\0" "ISO-8859-3" "\0"
|
||||
"CP28594" "\0" "ISO-8859-4" "\0"
|
||||
"CP28595" "\0" "ISO-8859-5" "\0"
|
||||
"CP28596" "\0" "ISO-8859-6" "\0"
|
||||
"CP28597" "\0" "ISO-8859-7" "\0"
|
||||
"CP28598" "\0" "ISO-8859-8" "\0"
|
||||
"CP28599" "\0" "ISO-8859-9" "\0"
|
||||
"CP28605" "\0" "ISO-8859-15" "\0"
|
||||
"CP65001" "\0" "UTF-8" "\0");
|
||||
*aliases;
|
||||
aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
|
||||
{
|
||||
if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
|
||||
{
|
||||
newset = aliases + strlen (aliases) + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#else /*!HAVE_W32_SYSTEM*/
|
||||
|
||||
#ifdef HAVE_LANGINFO_CODESET
|
||||
newset = nl_langinfo (CODESET);
|
||||
#else /*!HAVE_LANGINFO_CODESET*/
|
||||
/* Try to get the used charset from environment variables. */
|
||||
static char codepage[30];
|
||||
const char *lc, *dot, *mod;
|
||||
|
||||
strcpy (codepage, "iso-8859-1");
|
||||
lc = getenv ("LC_ALL");
|
||||
if (!lc || !*lc)
|
||||
{
|
||||
lc = getenv ("LC_CTYPE");
|
||||
if (!lc || !*lc)
|
||||
lc = getenv ("LANG");
|
||||
}
|
||||
if (lc && *lc)
|
||||
{
|
||||
dot = strchr (lc, '.');
|
||||
if (dot)
|
||||
{
|
||||
mod = strchr (++dot, '@');
|
||||
if (!mod)
|
||||
mod = dot + strlen (dot);
|
||||
if (mod - dot < sizeof codepage && dot != mod)
|
||||
{
|
||||
memcpy (codepage, dot, mod - dot);
|
||||
codepage [mod - dot] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
newset = codepage;
|
||||
#endif /*!HAVE_LANGINFO_CODESET*/
|
||||
#endif /*!HAVE_W32_SYSTEM*/
|
||||
}
|
||||
|
||||
full_newset = newset;
|
||||
if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
|
||||
{
|
||||
newset += 3;
|
||||
if (*newset == '-' || *newset == '_')
|
||||
newset++;
|
||||
newset++;
|
||||
}
|
||||
|
||||
if (!*newset
|
||||
|| !ascii_strcasecmp (newset, "8859-1")
|
||||
|| !ascii_strcasecmp (newset, "8859-15"))
|
||||
/* Note that we silently assume that plain ASCII is actually meant
|
||||
as Latin-1. This makes sense because many Unix system don't have
|
||||
their locale set up properly and thus would get annoying error
|
||||
messages and we have to handle all the "bug" reports. Latin-1 has
|
||||
always been the character set used for 8 bit characters on Unix
|
||||
systems. */
|
||||
if ( !*newset
|
||||
|| !ascii_strcasecmp (newset, "8859-1" )
|
||||
|| !ascii_strcasecmp (newset, "646" )
|
||||
|| !ascii_strcasecmp (newset, "ASCII" )
|
||||
|| !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
|
||||
)
|
||||
{
|
||||
active_charset_name = "iso-8859-1";
|
||||
no_translation = 0;
|
||||
active_charset = NULL;
|
||||
use_iconv = 0;
|
||||
}
|
||||
else if (!ascii_strcasecmp (newset, "8859-2"))
|
||||
{
|
||||
active_charset_name = "iso-8859-2";
|
||||
no_translation = 0;
|
||||
active_charset = latin2_unicode;
|
||||
}
|
||||
else if (!ascii_strcasecmp (newset, "koi8-r"))
|
||||
{
|
||||
active_charset_name = "koi8-r";
|
||||
no_translation = 0;
|
||||
active_charset = koi8_unicode;
|
||||
}
|
||||
else if (!ascii_strcasecmp (newset, "utf8")
|
||||
|| !ascii_strcasecmp (newset, "utf-8"))
|
||||
else if ( !ascii_strcasecmp (newset, "utf8" )
|
||||
|| !ascii_strcasecmp(newset, "utf-8") )
|
||||
{
|
||||
active_charset_name = "utf-8";
|
||||
no_translation = 1;
|
||||
active_charset = NULL;
|
||||
use_iconv = 0;
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
{
|
||||
iconv_t cd;
|
||||
|
||||
#ifdef HAVE_W32_SYSTEM
|
||||
if (load_libiconv ())
|
||||
return -1;
|
||||
#endif /*HAVE_W32_SYSTEM*/
|
||||
|
||||
cd = iconv_open (full_newset, "utf-8");
|
||||
if (cd == (iconv_t)-1)
|
||||
{
|
||||
handle_iconv_error (full_newset, "utf-8", 0);
|
||||
return -1;
|
||||
}
|
||||
iconv_close (cd);
|
||||
cd = iconv_open ("utf-8", full_newset);
|
||||
if (cd == (iconv_t)-1)
|
||||
{
|
||||
handle_iconv_error ("utf-8", full_newset, 0);
|
||||
return -1;
|
||||
}
|
||||
iconv_close (cd);
|
||||
active_charset_name = full_newset;
|
||||
no_translation = 0;
|
||||
active_charset = NULL;
|
||||
use_iconv = 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -132,10 +256,9 @@ get_native_charset ()
|
|||
return active_charset_name;
|
||||
}
|
||||
|
||||
/****************
|
||||
* Convert string, which is in native encoding to UTF8 and return the
|
||||
* new allocated UTF8 string.
|
||||
*/
|
||||
|
||||
/* Convert string, which is in native encoding to UTF8 and return a
|
||||
new allocated UTF-8 string. */
|
||||
char *
|
||||
native_to_utf8 (const char *orig_string)
|
||||
{
|
||||
|
@ -147,41 +270,12 @@ native_to_utf8 (const char *orig_string)
|
|||
|
||||
if (no_translation)
|
||||
{
|
||||
/* Already utf-8 encoded. */
|
||||
buffer = jnlib_xstrdup (orig_string);
|
||||
}
|
||||
else if (active_charset)
|
||||
{
|
||||
for (s = string; *s; s++)
|
||||
{
|
||||
length++;
|
||||
if (*s & 0x80)
|
||||
length += 2; /* we may need 3 bytes */
|
||||
}
|
||||
buffer = jnlib_xmalloc (length + 1);
|
||||
for (p = (unsigned char *)buffer, s = string; *s; s++)
|
||||
{
|
||||
if ((*s & 0x80))
|
||||
{
|
||||
ushort val = active_charset[*s & 0x7f];
|
||||
if (val < 0x0800)
|
||||
{
|
||||
*p++ = 0xc0 | ((val >> 6) & 0x1f);
|
||||
*p++ = 0x80 | (val & 0x3f);
|
||||
}
|
||||
else
|
||||
{
|
||||
*p++ = 0xe0 | ((val >> 12) & 0x0f);
|
||||
*p++ = 0x80 | ((val >> 6) & 0x3f);
|
||||
*p++ = 0x80 | (val & 0x3f);
|
||||
}
|
||||
}
|
||||
else
|
||||
*p++ = *s;
|
||||
}
|
||||
*p = 0;
|
||||
}
|
||||
else
|
||||
else if (!active_charset && !use_iconv)
|
||||
{
|
||||
/* For Latin-1 we can avoid the iconv overhead. */
|
||||
for (s = string; *s; s++)
|
||||
{
|
||||
length++;
|
||||
|
@ -191,7 +285,7 @@ native_to_utf8 (const char *orig_string)
|
|||
buffer = jnlib_xmalloc (length + 1);
|
||||
for (p = (unsigned char *)buffer, s = string; *s; s++)
|
||||
{
|
||||
if (*s & 0x80)
|
||||
if ( (*s & 0x80 ))
|
||||
{
|
||||
*p++ = 0xc0 | ((*s >> 6) & 3);
|
||||
*p++ = 0x80 | (*s & 0x3f);
|
||||
|
@ -201,22 +295,68 @@ native_to_utf8 (const char *orig_string)
|
|||
}
|
||||
*p = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Need to use iconv. */
|
||||
iconv_t cd;
|
||||
const char *inptr;
|
||||
char *outptr;
|
||||
size_t inbytes, outbytes;
|
||||
|
||||
cd = iconv_open ("utf-8", active_charset_name);
|
||||
if (cd == (iconv_t)-1)
|
||||
{
|
||||
handle_iconv_error ("utf-8", active_charset_name, 1);
|
||||
return native_to_utf8 (string);
|
||||
}
|
||||
|
||||
for (s=string; *s; s++ )
|
||||
{
|
||||
length++;
|
||||
if ((*s & 0x80))
|
||||
length += 5; /* We may need up to 6 bytes for the utf8 output. */
|
||||
}
|
||||
buffer = jnlib_xmalloc (length + 1);
|
||||
|
||||
inptr = string;
|
||||
inbytes = strlen (string);
|
||||
outptr = buffer;
|
||||
outbytes = length;
|
||||
if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
|
||||
&outptr, &outbytes) == (size_t)-1)
|
||||
{
|
||||
static int shown;
|
||||
|
||||
if (!shown)
|
||||
log_info (_("conversion from `%s' to `%s' failed: %s\n"),
|
||||
active_charset_name, "utf-8", strerror (errno));
|
||||
shown = 1;
|
||||
/* We don't do any conversion at all but use the strings as is. */
|
||||
strcpy (buffer, string);
|
||||
}
|
||||
else /* Success. */
|
||||
{
|
||||
*outptr = 0;
|
||||
/* We could realloc the buffer now but I doubt that it makes
|
||||
much sense given that it will get freed anyway soon
|
||||
after. */
|
||||
}
|
||||
iconv_close (cd);
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
/* Convert string, which is in UTF8 to native encoding. Replace
|
||||
* illegal encodings by some "\xnn" and quote all control
|
||||
* characters. A character with value DELIM will always be quoted, it
|
||||
* must be a vanilla ASCII character. */
|
||||
char *
|
||||
utf8_to_native (const char *string, size_t length, int delim)
|
||||
|
||||
static char *
|
||||
do_utf8_to_native (const char *string, size_t length, int delim,
|
||||
int with_iconv)
|
||||
{
|
||||
int nleft;
|
||||
int i;
|
||||
unsigned char encbuf[8];
|
||||
int encidx;
|
||||
const byte *s;
|
||||
const unsigned char *s;
|
||||
size_t n;
|
||||
char *buffer = NULL;
|
||||
char *p = NULL;
|
||||
|
@ -224,19 +364,20 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
size_t slen;
|
||||
int resync = 0;
|
||||
|
||||
/* 1. pass (p==NULL): count the extended utf-8 characters */
|
||||
/* 2. pass (p!=NULL): create string */
|
||||
/* First pass (p==NULL): count the extended utf-8 characters. */
|
||||
/* Second pass (p!=NULL): create string. */
|
||||
for (;;)
|
||||
{
|
||||
for (slen = length, nleft = encidx = 0, n = 0,
|
||||
s = (const unsigned char *)string; slen;
|
||||
s = (const unsigned char *)string;
|
||||
slen;
|
||||
s++, slen--)
|
||||
{
|
||||
if (resync)
|
||||
{
|
||||
if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
|
||||
{
|
||||
/* still invalid */
|
||||
/* Still invalid. */
|
||||
if (p)
|
||||
{
|
||||
sprintf (p, "\\x%02x", *s);
|
||||
|
@ -250,45 +391,23 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
if (!nleft)
|
||||
{
|
||||
if (!(*s & 0x80))
|
||||
{ /* plain ascii */
|
||||
if (*s < 0x20 || *s == 0x7f || *s == delim ||
|
||||
(delim && *s == '\\'))
|
||||
{
|
||||
/* Plain ascii. */
|
||||
if ( delim != -1
|
||||
&& (*s < 0x20 || *s == 0x7f || *s == delim
|
||||
|| (delim && *s == '\\')))
|
||||
{
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = '\\';
|
||||
switch (*s)
|
||||
{
|
||||
case '\n':
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = 'n';
|
||||
break;
|
||||
case '\r':
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = 'r';
|
||||
break;
|
||||
case '\f':
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = 'f';
|
||||
break;
|
||||
case '\v':
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = 'v';
|
||||
break;
|
||||
case '\b':
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = 'b';
|
||||
break;
|
||||
case 0:
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = '0';
|
||||
break;
|
||||
case '\n': n++; if ( p ) *p++ = 'n'; break;
|
||||
case '\r': n++; if ( p ) *p++ = 'r'; break;
|
||||
case '\f': n++; if ( p ) *p++ = 'f'; break;
|
||||
case '\v': n++; if ( p ) *p++ = 'v'; break;
|
||||
case '\b': n++; if ( p ) *p++ = 'b'; break;
|
||||
case 0: n++; if ( p ) *p++ = '0'; break;
|
||||
default:
|
||||
n += 3;
|
||||
if (p)
|
||||
|
@ -306,43 +425,43 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
n++;
|
||||
}
|
||||
}
|
||||
else if ((*s & 0xe0) == 0xc0)
|
||||
{ /* 110x xxxx */
|
||||
else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
|
||||
{
|
||||
val = *s & 0x1f;
|
||||
nleft = 1;
|
||||
encidx = 0;
|
||||
encbuf[encidx++] = *s;
|
||||
}
|
||||
else if ((*s & 0xf0) == 0xe0)
|
||||
{ /* 1110 xxxx */
|
||||
else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
|
||||
{
|
||||
val = *s & 0x0f;
|
||||
nleft = 2;
|
||||
encidx = 0;
|
||||
encbuf[encidx++] = *s;
|
||||
}
|
||||
else if ((*s & 0xf8) == 0xf0)
|
||||
{ /* 1111 0xxx */
|
||||
else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
|
||||
{
|
||||
val = *s & 0x07;
|
||||
nleft = 3;
|
||||
encidx = 0;
|
||||
encbuf[encidx++] = *s;
|
||||
}
|
||||
else if ((*s & 0xfc) == 0xf8)
|
||||
{ /* 1111 10xx */
|
||||
else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
|
||||
{
|
||||
val = *s & 0x03;
|
||||
nleft = 4;
|
||||
encidx = 0;
|
||||
encbuf[encidx++] = *s;
|
||||
}
|
||||
else if ((*s & 0xfe) == 0xfc)
|
||||
{ /* 1111 110x */
|
||||
else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
|
||||
{
|
||||
val = *s & 0x01;
|
||||
nleft = 5;
|
||||
encidx = 0;
|
||||
encbuf[encidx++] = *s;
|
||||
}
|
||||
else
|
||||
{ /* invalid encoding: print as \xnn */
|
||||
else /* Invalid encoding: print as \xNN. */
|
||||
{
|
||||
if (p)
|
||||
{
|
||||
sprintf (p, "\\x%02x", *s);
|
||||
|
@ -352,8 +471,8 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
resync = 1;
|
||||
}
|
||||
}
|
||||
else if (*s < 0x80 || *s >= 0xc0)
|
||||
{ /* invalid */
|
||||
else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
|
||||
{
|
||||
if (p)
|
||||
{
|
||||
for (i = 0; i < encidx; i++)
|
||||
|
@ -374,8 +493,8 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
encbuf[encidx++] = *s;
|
||||
val <<= 6;
|
||||
val |= *s & 0x3f;
|
||||
if (!--nleft)
|
||||
{ /* ready */
|
||||
if (!--nleft) /* Ready. */
|
||||
{
|
||||
if (no_translation)
|
||||
{
|
||||
if (p)
|
||||
|
@ -386,43 +505,41 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
n += encidx;
|
||||
encidx = 0;
|
||||
}
|
||||
else if (active_charset)
|
||||
{ /* table lookup */
|
||||
for (i = 0; i < 128; i++)
|
||||
{
|
||||
if (active_charset[i] == val)
|
||||
break;
|
||||
}
|
||||
if (i < 128)
|
||||
{ /* we can print this one */
|
||||
if (p)
|
||||
*p++ = i + 128;
|
||||
n++;
|
||||
}
|
||||
else
|
||||
{ /* we do not have a translation: print utf8 */
|
||||
if (p)
|
||||
{
|
||||
for (i = 0; i < encidx; i++)
|
||||
{
|
||||
sprintf (p, "\\x%02x", encbuf[i]);
|
||||
p += 4;
|
||||
}
|
||||
}
|
||||
n += encidx * 4;
|
||||
encidx = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{ /* native set */
|
||||
else if (with_iconv)
|
||||
{
|
||||
/* Our strategy for using iconv is a bit strange
|
||||
but it better keeps compatibility with
|
||||
previous versions in regard to how invalid
|
||||
encodings are displayed. What we do is to
|
||||
keep the utf-8 as is and have the real
|
||||
translation step then at the end. Yes, I
|
||||
know that this is ugly. However we are short
|
||||
of the 1.4 release and for this branch we
|
||||
should not mess too much around with iconv
|
||||
things. One reason for this is that we don't
|
||||
know enough about non-GNU iconv
|
||||
implementation and want to minimize the risk
|
||||
of breaking the code on too many platforms. */
|
||||
if ( p )
|
||||
{
|
||||
for (i=0; i < encidx; i++ )
|
||||
*p++ = encbuf[i];
|
||||
}
|
||||
n += encidx;
|
||||
encidx = 0;
|
||||
}
|
||||
else /* Latin-1 case. */
|
||||
{
|
||||
if (val >= 0x80 && val < 256)
|
||||
{
|
||||
n++; /* we can simply print this character */
|
||||
/* We can simply print this character */
|
||||
n++;
|
||||
if (p)
|
||||
*p++ = val;
|
||||
}
|
||||
else
|
||||
{ /* we do not have a translation: print utf8 */
|
||||
{
|
||||
/* We do not have a translation: print utf8. */
|
||||
if (p)
|
||||
{
|
||||
for (i = 0; i < encidx; i++)
|
||||
|
@ -440,13 +557,78 @@ utf8_to_native (const char *string, size_t length, int delim)
|
|||
}
|
||||
}
|
||||
if (!buffer)
|
||||
{ /* allocate the buffer after the first pass */
|
||||
{
|
||||
/* Allocate the buffer after the first pass. */
|
||||
buffer = p = jnlib_xmalloc (n + 1);
|
||||
}
|
||||
else
|
||||
else if (with_iconv)
|
||||
{
|
||||
/* Note: See above for comments. */
|
||||
iconv_t cd;
|
||||
const char *inptr;
|
||||
char *outbuf, *outptr;
|
||||
size_t inbytes, outbytes;
|
||||
|
||||
*p = 0; /* Terminate the buffer. */
|
||||
|
||||
cd = iconv_open (active_charset_name, "utf-8");
|
||||
if (cd == (iconv_t)-1)
|
||||
{
|
||||
handle_iconv_error (active_charset_name, "utf-8", 1);
|
||||
jnlib_free (buffer);
|
||||
return utf8_to_native (string, length, delim);
|
||||
}
|
||||
|
||||
/* Allocate a new buffer large enough to hold all possible
|
||||
encodings. */
|
||||
n = p - buffer + 1;
|
||||
inbytes = n - 1;;
|
||||
inptr = buffer;
|
||||
outbytes = n * MB_LEN_MAX;
|
||||
if (outbytes / MB_LEN_MAX != n)
|
||||
BUG (); /* Actually an overflow. */
|
||||
outbuf = outptr = jnlib_xmalloc (outbytes);
|
||||
if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
|
||||
&outptr, &outbytes) == (size_t)-1)
|
||||
{
|
||||
static int shown;
|
||||
|
||||
if (!shown)
|
||||
log_info (_("conversion from `%s' to `%s' failed: %s\n"),
|
||||
"utf-8", active_charset_name, strerror (errno));
|
||||
shown = 1;
|
||||
/* Didn't work out. Try again but without iconv. */
|
||||
jnlib_free (buffer);
|
||||
buffer = NULL;
|
||||
jnlib_free (outbuf);
|
||||
outbuf = do_utf8_to_native (string, length, delim, 0);
|
||||
}
|
||||
else /* Success. */
|
||||
{
|
||||
*outptr = 0; /* Make sure it is a string. */
|
||||
/* We could realloc the buffer now but I doubt that it
|
||||
makes much sense given that it will get freed
|
||||
anyway soon after. */
|
||||
jnlib_free (buffer);
|
||||
}
|
||||
iconv_close (cd);
|
||||
return outbuf;
|
||||
}
|
||||
else /* Not using iconv. */
|
||||
{
|
||||
*p = 0; /* make a string */
|
||||
*p = 0; /* Make sure it is a string. */
|
||||
return buffer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert string, which is in UTF-8 to native encoding. Replace
|
||||
illegal encodings by some "\xnn" and quote all control
|
||||
characters. A character with value DELIM will always be quoted, it
|
||||
must be a vanilla ASCII character. A DELIM value of -1 is special:
|
||||
it disables all quoting of control characters. */
|
||||
char *
|
||||
utf8_to_native (const char *string, size_t length, int delim)
|
||||
{
|
||||
return do_utf8_to_native (string, length, delim, use_iconv);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue