From 00310b1aa868cc06cf486fcda6852e9750aa3564 Mon Sep 17 00:00:00 2001 From: David Shaw Date: Wed, 12 Aug 2009 05:01:08 +0000 Subject: [PATCH] Try and detect mis-coded Latin1 and convert it to UTF8. Whether the heuristics succeed or not, the resulting string must be valid UTF8 as LDAP requires that. This is bug 1055. --- g10/ChangeLog | 7 +++++ g10/keyserver.c | 39 +++++++++++++++++++++----- include/ChangeLog | 4 +++ include/util.h | 2 +- util/ChangeLog | 5 ++++ util/strgutil.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 120 insertions(+), 8 deletions(-) diff --git a/g10/ChangeLog b/g10/ChangeLog index 2f099afec..dc52ab990 100644 --- a/g10/ChangeLog +++ b/g10/ChangeLog @@ -1,3 +1,10 @@ +2009-08-11 David Shaw + + * keyserver.c (keyserver_spawn): Try and detect mis-coded Latin1 + and convert it to UTF8. Whether the heuristics succeed or not, + the resulting string must be valid UTF8 as LDAP requires that. + This is bug 1055. + 2009-08-03 Werner Koch * card-util.c (generate_card_keys): Ask for off-card keys only if diff --git a/g10/keyserver.c b/g10/keyserver.c index b3945bb04..0c67f9b64 100644 --- a/g10/keyserver.c +++ b/g10/keyserver.c @@ -1270,24 +1270,49 @@ keyserver_spawn(enum ks_action action,STRLIST list,KEYDB_SEARCH_DESC *desc, { PKT_user_id *uid=node->pkt->pkt.user_id; int r; + char *uidstr1,*uidstr2,*uidstr3; + size_t uidstrlen; if(uid->attrib_data) continue; fprintf(spawn->tochild,"uid:"); - /* Quote ':', '%', and any 8-bit - characters */ - for(r=0;r<uid->len;r++) + /* Make sure it's real UTF8. What happens + here is that first we heuristically try + and convert the string (which may be + mis-coded) into UTF8. We then bring it + to native and then back to UTF8. For + true UTF8, this whole process should be + lossless. For the common Latin-1 + mis-encoding, it will become UTF8. For + other encodings, it will become UTF8 but + with unknown characters quoted. 
This + preserves the notion that anything in the + stream to the keyserver handler program + is UTF8. */ + uidstr1=string_to_utf8(uid->name); + uidstr2=utf8_to_native(uidstr1,strlen(uidstr1),-1); + uidstr3=native_to_utf8(uidstr2); + + uidstrlen=strlen(uidstr3); + + /* Quote ':', '%', and anything not + printable ASCII */ + for(r=0;r<uidstrlen;r++) { - if(uid->name[r]==':' || uid->name[r]=='%' - || uid->name[r]&0x80) + if(uidstr3[r]==':' || uidstr3[r]=='%' + || uidstr3[r]<' ' || uidstr3[r]>'~') fprintf(spawn->tochild,"%%%02X", - (byte)uid->name[r]); + (byte)uidstr3[r]); else - fprintf(spawn->tochild,"%c",uid->name[r]); + fprintf(spawn->tochild,"%c",uidstr3[r]); } + xfree(uidstr1); + xfree(uidstr2); + xfree(uidstr3); + fprintf(spawn->tochild,":%u:%u:", uid->created,uid->expiredate); diff --git a/include/ChangeLog b/include/ChangeLog index 572830ccb..c709688b4 100644 --- a/include/ChangeLog +++ b/include/ChangeLog @@ -1,3 +1,7 @@ +2009-08-11 David Shaw + + * util.h: Add string_to_utf8() from GPA. + 2009-07-21 Werner Koch * estream-printf.h: New. Taken from libestream.x diff --git a/include/util.h b/include/util.h index b15181eed..72e3959e2 100644 --- a/include/util.h +++ b/include/util.h @@ -190,7 +190,7 @@ int set_native_charset( const char *newset ); const char* get_native_charset(void); char *native_to_utf8( const char *string ); char *utf8_to_native( const char *string, size_t length, int delim); -int check_utf8_string( const char *string ); +char *string_to_utf8 (const char *string); int ascii_isupper (int c); int ascii_islower (int c); diff --git a/util/ChangeLog b/util/ChangeLog index 81466c710..dcb21678c 100644 --- a/util/ChangeLog +++ b/util/ChangeLog @@ -1,3 +1,8 @@ +2009-08-11 David Shaw + + * strgutil.c (string_to_utf8): New function to convert a Latin-1 + string to UTF8. From GPA. + 2009-07-23 David Shaw * srv.c (getsrv): Fix type-punning warning. 
diff --git a/util/strgutil.c b/util/strgutil.c
index 0791dbbfa..5193df0f8 100644
--- a/util/strgutil.c
+++ b/util/strgutil.c
@@ -1048,6 +1048,77 @@ utf8_to_native( const char *string, size_t length, int delim )
     }
 }
 
+/* Return a newly allocated (xstrdup/xmalloc) string holding STRING as
+   UTF-8 on a best-effort basis; STRING may or may not already be
+   UTF-8.  Returns NULL if STRING is NULL.  Similar to native_to_utf8
+   but for any input.  This code is mostly borrowed from GPA. */
+
+char *
+string_to_utf8 (const char *string)
+{
+  const char *s;
+
+  if (!string)
+    return NULL;
+
+  /* Due to a bug in old and not so old PGP versions user IDs have
+     been copied verbatim into the key.  Thus many users with Umlauts
+     et al. in their name will see their names garbled.  Although this
+     is not an issue for me (;-)), I have a couple of friends with
+     Umlauts in their name, so let's try to make their life easier by
+     detecting invalid encodings and converting them from Latin-1 to
+     UTF-8.  We use this even for X.509 because it may help given all
+     the invalid encodings often found in X.509 certificates.  */
+  for (s = string; *s && !(*s & 0x80); s++)
+    ; /* Advance S to the first byte with bit 7 set, or to the NUL. */
+  if (*s && ((s[1] & 0xc0) == 0x80) && ( ((*s & 0xe0) == 0xc0) /* 110xxxxx */
+                                         || ((*s & 0xf0) == 0xe0) /* 1110xxxx */
+                                         || ((*s & 0xf8) == 0xf0) /* 11110xxx */
+                                         || ((*s & 0xfc) == 0xf8) /* 111110xx */
+                                         || ((*s & 0xfe) == 0xfc)) ) /* 1111110x */
+    {
+      /* Possible utf-8 lead byte followed by a continuation byte
+         (10xxxxxx).  Although this might still be Latin-1 we better
+         assume that it is valid utf-8 and return an unmodified copy. */
+      return xstrdup (string);
+    }
+  else if (*s && !strchr (string, 0xc3))
+    {
+      size_t length=0;
+      char *buffer,*p;
+
+      /* No 0xC3 (UTF-8 lead byte of U+00C0..U+00FF, presumably why it
+         is the hint) in the string; assume that it is Latin-1.  */
+      for(s=string; *s; s++ )
+        {
+          length++;
+          if( *s & 0x80 )
+            length++; /* Bytes >= 0x80 need two output bytes. */
+        }
+      buffer = xmalloc( length + 1 );
+      for(p=buffer, s=string; *s; s++ )
+        {
+          if( *s & 0x80 )
+            {
+              *p++ = 0xc0 | ((*s >> 6) & 3); /* 110000xx: top two bits */
+              *p++ = 0x80 | ( *s & 0x3f ); /* 10xxxxxx: low six bits */
+            }
+          else
+            *p++ = *s;
+        }
+      *p = 0;
+
+      return buffer;
+    }
+  else
+    {
+      /* Everything else (including pure ASCII) is assumed to be UTF-8.
+         We do this even though the first non-ascii character may not
+         be validly encoded: as only that character was tested, valid
+         encodings might follow.  The copy is returned unmodified.  */
+      return xstrdup (string);
+    }
+}
 
 /* Same as asprintf but return an allocated buffer suitable to be
    freed using xfree.  This function simply dies on memory failure,