diff options
Diffstat (limited to 'common/utf8.c')
-rw-r--r-- | common/utf8.c | 328 |
1 files changed, 328 insertions, 0 deletions
diff --git a/common/utf8.c b/common/utf8.c new file mode 100644 index 0000000..5ce6889 --- /dev/null +++ b/common/utf8.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2013, Red Hat Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the + * above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * * The names of contributors to this software may not be + * used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Author: Stef Walter <stefw@redhat.com> + */ + +#include "config.h" + +#include "buffer.h" +#include "debug.h" +#include "utf8.h" + +#include <assert.h> +#include <stddef.h> +#include <string.h> + +/* + * Some parts come from FreeBSD utf8.c + * + * Copyright (c) 2002-2004 Tim J. Robbins + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +static ssize_t +utf8_to_wchar (const char *str, + size_t len, + wchar_t *wc) +{ + int ch, i, mask, want; + wchar_t lbound, wch; + + assert (str != NULL); + assert (len > 0); + assert (wc != NULL); + + if (((ch = (unsigned char)*str) & ~0x7f) == 0) { + /* Fast path for plain ASCII characters. */ + *wc = ch; + return 1; + } + + /* + * Determine the number of octets that make up this character + * from the first octet, and a mask that extracts the + * interesting bits of the first octet. We already know + * the character is at least two bytes long. + * + * We also specify a lower bound for the character code to + * detect redundant, non-"shortest form" encodings. For + * example, the sequence C0 80 is _not_ a legal representation + * of the null character. This enforces a 1-to-1 mapping + * between character codes and their multibyte representations. + */ + ch = (unsigned char)*str; + if ((ch & 0xe0) == 0xc0) { + mask = 0x1f; + want = 2; + lbound = 0x80; + } else if ((ch & 0xf0) == 0xe0) { + mask = 0x0f; + want = 3; + lbound = 0x800; + } else if ((ch & 0xf8) == 0xf0) { + mask = 0x07; + want = 4; + lbound = 0x10000; + } else if ((ch & 0xfc) == 0xf8) { + mask = 0x03; + want = 5; + lbound = 0x200000; + } else if ((ch & 0xfe) == 0xfc) { + mask = 0x01; + want = 6; + lbound = 0x4000000; + } else { + /* + * Malformed input; input is not UTF-8. + */ + return -1; + } + + if (want > len) { + /* Incomplete multibyte sequence. */ + return -1; + } + + /* + * Decode the octet sequence representing the character in chunks + * of 6 bits, most significant first. + */ + wch = (unsigned char)*str++ & mask; + for (i = 1; i < want; i++) { + if ((*str & 0xc0) != 0x80) { + /* + * Malformed input; bad characters in the middle + * of a character. + */ + return -1; + } + wch <<= 6; + wch |= *str++ & 0x3f; + } + if (wch < lbound) { + /* + * Malformed input; redundant encoding. + */ + return -1; + } + + *wc = wch; + return want; +} + +static size_t +utf8_for_wchar (wchar_t wc, + char *str, + size_t len) +{ + unsigned char lead; + int i, want; + + assert (str != NULL); + assert (len >= 6); + + if ((wc & ~0x7f) == 0) { + /* Fast path for plain ASCII characters. */ + *str = (char)wc; + return 1; + } + + /* + * Determine the number of octets needed to represent this character. + * We always output the shortest sequence possible. Also specify the + * first few bits of the first octet, which contains the information + * about the sequence length. + */ + if ((wc & ~0x7ff) == 0) { + lead = 0xc0; + want = 2; + } else if ((wc & ~0xffff) == 0) { + lead = 0xe0; + want = 3; + } else if ((wc & ~0x1fffff) == 0) { + lead = 0xf0; + want = 4; + } else if ((wc & ~0x3ffffff) == 0) { + lead = 0xf8; + want = 5; + } else if ((wc & ~0x7fffffff) == 0) { + lead = 0xfc; + want = 6; + } else { + return -1; + } + + assert (want <= len); + + /* + * Output the octets representing the character in chunks + * of 6 bits, least significant last. The first octet is + * a special case because it contains the sequence length + * information. + */ + for (i = want - 1; i > 0; i--) { + str[i] = (wc & 0x3f) | 0x80; + wc >>= 6; + } + *str = (wc & 0xff) | lead; + return want; +} + +static ssize_t +ucs2be_to_wchar (const unsigned char *str, + size_t len, + wchar_t *wc) +{ + assert (str != NULL); + assert (len != 0); + assert (wc != NULL); + + if (len < 2) + return -1; + + *wc = (str[0] << 8 | str[1]); + return 2; +} + +static ssize_t +ucs4be_to_wchar (const unsigned char *str, + size_t len, + wchar_t *wc) +{ + assert (str != NULL); + assert (len != 0); + assert (wc != NULL); + + if (len < 4) + return -1; + + *wc = (str[0] << 24 | str[1] << 16 | str[2] << 8 | str[3]); + return 4; +} + +bool +p11_utf8_validate (const char *str, + ssize_t len) +{ + wchar_t dummy; + ssize_t ret; + + if (len < 0) + len = strlen (str); + + while (len > 0) { + ret = utf8_to_wchar (str, len, &dummy); + if (ret < 0) + return false; + str += ret; + len -= ret; + } + + return true; +} + +static char * +utf8_for_convert (ssize_t (* convert) (const unsigned char *, size_t, wchar_t *), + const unsigned char *str, + size_t num_bytes, + size_t *ret_len) +{ + p11_buffer buf; + char block[6]; + wchar_t wc; + ssize_t ret; + + assert (convert); + + if (!p11_buffer_init_null (&buf, num_bytes)) + return_val_if_reached (NULL); + + while (num_bytes != 0) { + ret = (convert) (str, num_bytes, &wc); + if (ret < 0) { + p11_buffer_uninit (&buf); + return NULL; + } + + str += ret; + num_bytes -= ret; + + ret = utf8_for_wchar (wc, block, 6); + if (ret < 0) { + p11_buffer_uninit (&buf); + return NULL; + } + p11_buffer_add (&buf, block, ret); + } + + return_val_if_fail (p11_buffer_ok (&buf), NULL); + return p11_buffer_steal (&buf, ret_len); +} + +char * +p11_utf8_for_ucs2be (const unsigned char *str, + size_t num_bytes, + size_t *ret_len) +{ + assert (str != NULL); + return utf8_for_convert (ucs2be_to_wchar, str, num_bytes, ret_len); +} + +char * +p11_utf8_for_ucs4be (const unsigned char *str, + size_t num_bytes, + size_t *ret_len) +{ + assert (str != NULL); + return utf8_for_convert (ucs4be_to_wchar, str, num_bytes, ret_len); +} |