diff options
author | Dan Albert <danalbert@google.com> | 2014-06-04 16:10:50 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2014-06-04 16:10:50 +0000 |
commit | 452e09130010b641904e7411e0c539590a654022 (patch) | |
tree | bf46b10decf99bf66af7a3789080f443a011bafb | |
parent | 38ed337c1343af1aa55487777ed80de6b0d33638 (diff) | |
parent | 7a7f9952c12b216fbf91fc4cdbb97045e8861115 (diff) | |
download | bionic-452e09130010b641904e7411e0c539590a654022.zip bionic-452e09130010b641904e7411e0c539590a654022.tar.gz bionic-452e09130010b641904e7411e0c539590a654022.tar.bz2 |
Merge "Adds functionality specified by uchar.h"
-rw-r--r-- | libc/Android.mk | 5 | ||||
-rw-r--r-- | libc/bionic/c16rtomb.cpp | 67 | ||||
-rw-r--r-- | libc/bionic/c32rtomb.cpp | 97 | ||||
-rw-r--r-- | libc/bionic/mbrtoc16.cpp | 89 | ||||
-rw-r--r-- | libc/bionic/mbrtoc32.cpp | 138 | ||||
-rw-r--r-- | libc/bionic/mbstate.cpp | 57 | ||||
-rw-r--r-- | libc/bionic/wchar.cpp | 208 | ||||
-rw-r--r-- | libc/include/uchar.h | 53 | ||||
-rw-r--r-- | libc/private/bionic_mbstate.h | 54 | ||||
-rw-r--r-- | tests/Android.mk | 1 | ||||
-rw-r--r-- | tests/uchar_test.cpp | 412 |
11 files changed, 988 insertions, 193 deletions
diff --git a/libc/Android.mk b/libc/Android.mk index 526551e..cf407e1 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -103,6 +103,8 @@ libc_bionic_src_files := \ bionic/__bionic_name_mem.cpp \ bionic/bionic_time_conversions.cpp \ bionic/brk.cpp \ + bionic/c16rtomb.cpp \ + bionic/c32rtomb.cpp \ bionic/chmod.cpp \ bionic/chown.cpp \ bionic/clearenv.cpp \ @@ -140,6 +142,9 @@ libc_bionic_src_files := \ bionic/link.cpp \ bionic/locale.cpp \ bionic/lstat.cpp \ + bionic/mbrtoc16.cpp \ + bionic/mbrtoc32.cpp \ + bionic/mbstate.cpp \ bionic/mkdir.cpp \ bionic/mkfifo.cpp \ bionic/mknod.cpp \ diff --git a/libc/bionic/c16rtomb.cpp b/libc/bionic/c16rtomb.cpp new file mode 100644 index 0000000..77512be --- /dev/null +++ b/libc/bionic/c16rtomb.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <errno.h> +#include <uchar.h> +#include <wchar.h> + +#include "private/bionic_mbstate.h" + +static inline constexpr bool is_high_surrogate(char16_t c16) { + return c16 >= 0xd800 && c16 < 0xdc00; +} + +static inline constexpr bool is_low_surrogate(char16_t c16) { + return c16 >= 0xdc00 && c16 < 0xe000; +} + +size_t c16rtomb(char* s, char16_t c16, mbstate_t* ps) { + static mbstate_t __private_state; + mbstate_t* state = (ps == NULL) ? &__private_state : ps; + if (mbsinit(state)) { + if (is_high_surrogate(c16)) { + char32_t c32 = (c16 & ~0xd800) << 10; + mbstate_set_byte(state, 3, (c32 & 0xff0000) >> 16); + mbstate_set_byte(state, 2, (c32 & 0x00ff00) >> 8); + return 0; + } else if (is_low_surrogate(c16)) { + return reset_and_return_illegal(EINVAL, state); + } else { + return c32rtomb(s, static_cast<char32_t>(c16), state); + } + } else { + if (!is_low_surrogate(c16)) { + return reset_and_return_illegal(EINVAL, state); + } + + char32_t c32 = ((mbstate_get_byte(state, 3) << 16) | + (mbstate_get_byte(state, 2) << 8) | + (c16 & ~0xdc00)) + 0x10000; + return reset_and_return(c32rtomb(s, c32, NULL), state); + } +} diff --git a/libc/bionic/c32rtomb.cpp b/libc/bionic/c32rtomb.cpp new file mode 100644 index 0000000..d3231c0 --- /dev/null +++ b/libc/bionic/c32rtomb.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <errno.h> +#include <uchar.h> +#include <wchar.h> + +#include "private/bionic_mbstate.h" + +size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps) { + static mbstate_t __private_state; + mbstate_t* state = (ps == NULL) ? &__private_state : ps; + + if (s == NULL) { + // Equivalent to c32rtomb(buf, U'\0', ps). + return reset_and_return(1, state); + } + + // POSIX states that if char32_t is a null wide character, a null byte shall + // be stored, preceded by any shift sequence needed to restore the initial + // shift state. Since shift states are not supported, only the null byte is + // stored. + if (c32 == U'\0') { + *s = '\0'; + reset_and_return(1, state); + } + + if (!mbsinit(state)) { + return reset_and_return_illegal(EILSEQ, state); + } + + if ((c32 & ~0x7f) == 0) { + // Fast path for plain ASCII characters. + *s = c32; + return 1; + } + + // Determine the number of octets needed to represent this character. + // We always output the shortest sequence possible. Also specify the + // first few bits of the first octet, which contains the information + // about the sequence length. + uint8_t lead; + size_t length; + if ((c32 & ~0x7f) == 0) { + lead = 0; + length = 1; + } else if ((c32 & ~0x7ff) == 0) { + lead = 0xc0; + length = 2; + } else if ((c32 & ~0xffff) == 0) { + lead = 0xe0; + length = 3; + } else if ((c32 & ~0x1fffff) == 0) { + lead = 0xf0; + length = 4; + } else { + errno = EILSEQ; + return __MB_ERR_ILLEGAL_SEQUENCE; + } + + // Output the octets representing the character in chunks + // of 6 bits, least significant last. The first octet is + // a special case because it contains the sequence length + // information. + for (size_t i = length - 1; i > 0; i--) { + s[i] = (c32 & 0x3f) | 0x80; + c32 >>= 6; + } + *s = (c32 & 0xff) | lead; + + return length; +} diff --git a/libc/bionic/mbrtoc16.cpp b/libc/bionic/mbrtoc16.cpp new file mode 100644 index 0000000..6878a11 --- /dev/null +++ b/libc/bionic/mbrtoc16.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <assert.h> +#include <errno.h> +#include <uchar.h> +#include <wchar.h> + +#include "private/bionic_mbstate.h" + +static inline bool mbspartialc16(const mbstate_t* state) { + return mbstate_get_byte(state, 3) != 0; +} + +static size_t begin_surrogate(char32_t c32, char16_t* pc16, + size_t nconv, mbstate_t* state) { + c32 -= 0x10000; + char16_t trail = (c32 & 0x3ff) | 0xdc00; + + mbstate_set_byte(state, 0, trail & 0x00ff); + mbstate_set_byte(state, 1, (trail & 0xff00) >> 8); + mbstate_set_byte(state, 3, nconv & 0xff); + + *pc16 = ((c32 & 0xffc00) >> 10) | 0xd800; + // Defined by POSIX as return value for first surrogate character. + return static_cast<size_t>(-3); +} + +static size_t finish_surrogate(char16_t* pc16, mbstate_t* state) { + char16_t trail = mbstate_get_byte(state, 1) << 8 | + mbstate_get_byte(state, 0); + *pc16 = trail; + return reset_and_return(mbstate_get_byte(state, 3), state); +} + +size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps) { + static mbstate_t __private_state; + mbstate_t* state = (ps == NULL) ? &__private_state : ps; + + char16_t __private_pc16; + if (pc16 == NULL) { + pc16 = &__private_pc16; + } + + if (mbspartialc16(state)) { + return finish_surrogate(pc16, state); + } + + char32_t c32; + size_t nconv = mbrtoc32(&c32, s, n, state); + if (__MB_IS_ERR(nconv)) { + return nconv; + } else if (nconv == 0) { + return reset_and_return(nconv, state); + } else if (c32 > 0x10ffff) { + // Input cannot be encoded as UTF-16. + return reset_and_return_illegal(EILSEQ, state); + } else if (c32 < 0x10000) { + *pc16 = static_cast<char16_t>(c32); + return reset_and_return(nconv, state); + } else { + return begin_surrogate(c32, pc16, nconv, state); + } +} diff --git a/libc/bionic/mbrtoc32.cpp b/libc/bionic/mbrtoc32.cpp new file mode 100644 index 0000000..bd40ecf --- /dev/null +++ b/libc/bionic/mbrtoc32.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <errno.h> +#include <sys/param.h> +#include <uchar.h> +#include <wchar.h> + +#include "private/bionic_mbstate.h" + +size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) { + static mbstate_t __private_state; + mbstate_t* state = (ps == NULL) ? &__private_state : ps; + + // We should never get to a state which has all 4 bytes of the sequence set. + // Full state verification is done when decoding the sequence (after we have + // all the bytes). + if (mbstate_get_byte(state, 3) != 0) { + return reset_and_return_illegal(EINVAL, state); + } + + if (s == NULL) { + s = ""; + n = 1; + pc32 = NULL; + } + + if (n == 0) { + return 0; + } + + uint8_t ch; + if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { + // Fast path for plain ASCII characters. + if (pc32 != NULL) { + *pc32 = ch; + } + return (ch != '\0' ? 1 : 0); + } + + // Determine the number of octets that make up this character + // from the first octet, and a mask that extracts the + // interesting bits of the first octet. We already know + // the character is at least two bytes long. + size_t length; + int mask; + + // We also specify a lower bound for the character code to + // detect redundant, non-"shortest form" encodings. For + // example, the sequence C0 80 is _not_ a legal representation + // of the null character. This enforces a 1-to-1 mapping + // between character codes and their multibyte representations. + char32_t lower_bound; + + // The first byte in the state (if any) tells the length. + size_t bytes_so_far = mbstate_bytes_so_far(state); + ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); + if ((ch & 0x80) == 0) { + mask = 0x7f; + length = 1; + lower_bound = 0; + } else if ((ch & 0xe0) == 0xc0) { + mask = 0x1f; + length = 2; + lower_bound = 0x80; + } else if ((ch & 0xf0) == 0xe0) { + mask = 0x0f; + length = 3; + lower_bound = 0x800; + } else if ((ch & 0xf8) == 0xf0) { + mask = 0x07; + length = 4; + lower_bound = 0x10000; + } else { + // Malformed input; input is not UTF-8. See RFC 3629. + return reset_and_return_illegal(EILSEQ, state); + } + + // Fill in the state. + size_t bytes_wanted = length - bytes_so_far; + size_t i; + for (i = 0; i < MIN(bytes_wanted, n); i++) { + if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) { + // Malformed input; bad characters in the middle of a character. + return reset_and_return_illegal(EILSEQ, state); + } + mbstate_set_byte(state, bytes_so_far + i, *s++); + } + if (i < bytes_wanted) { + return __MB_ERR_INCOMPLETE_SEQUENCE; + } + + // Decode the octet sequence representing the character in chunks + // of 6 bits, most significant first. + char32_t c32 = mbstate_get_byte(state, 0) & mask; + for (i = 1; i < length; i++) { + c32 <<= 6; + c32 |= mbstate_get_byte(state, i) & 0x3f; + } + + if (c32 < lower_bound) { + // Malformed input; redundant encoding. + return reset_and_return_illegal(EILSEQ, state); + } + if ((c32 >= 0xd800 && c32 <= 0xdfff) || c32 == 0xfffe || c32 == 0xffff) { + // Malformed input; invalid code points. + return reset_and_return_illegal(EILSEQ, state); + } + if (pc32 != NULL) { + *pc32 = c32; + } + return reset_and_return(c32 == U'\0' ? 0 : bytes_wanted, state); +} diff --git a/libc/bionic/mbstate.cpp b/libc/bionic/mbstate.cpp new file mode 100644 index 0000000..cb327d8 --- /dev/null +++ b/libc/bionic/mbstate.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "private/bionic_mbstate.h" + +#include <errno.h> + +__LIBC_HIDDEN__ size_t mbstate_bytes_so_far(const mbstate_t* ps) { + return + (ps->__seq[2] != 0) ? 3 : + (ps->__seq[1] != 0) ? 2 : + (ps->__seq[0] != 0) ? 1 : 0; +} + +__LIBC_HIDDEN__ void mbstate_set_byte(mbstate_t* ps, int i, char byte) { + ps->__seq[i] = static_cast<uint8_t>(byte); +} + +__LIBC_HIDDEN__ uint8_t mbstate_get_byte(const mbstate_t* ps, int n) { + return ps->__seq[n]; +} + +__LIBC_HIDDEN__ size_t reset_and_return_illegal(int _errno, mbstate_t* ps) { + errno = _errno; + *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; + return __MB_ERR_ILLEGAL_SEQUENCE; +} + +__LIBC_HIDDEN__ size_t reset_and_return(int _return, mbstate_t* ps) { + *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; + return _return; +} diff --git a/libc/bionic/wchar.cpp b/libc/bionic/wchar.cpp index 5da882f..acb2761 100644 --- a/libc/bionic/wchar.cpp +++ b/libc/bionic/wchar.cpp @@ -27,9 +27,12 @@ */ #include <errno.h> -#include <string.h> #include <sys/param.h> +#include <string.h> #include <wchar.h> +#include <uchar.h> + +#include "private/bionic_mbstate.h" // // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a @@ -50,36 +53,6 @@ // function pointers. // -#define ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1) -#define ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2) - -static size_t mbstate_bytes_so_far(const mbstate_t* ps) { - return - (ps->__seq[2] != 0) ? 3 : - (ps->__seq[1] != 0) ? 2 : - (ps->__seq[0] != 0) ? 1 : 0; -} - -static void mbstate_set_byte(mbstate_t* ps, int i, char byte) { - ps->__seq[i] = static_cast<uint8_t>(byte); -} - -static uint8_t mbstate_get_byte(const mbstate_t* ps, int n) { - return ps->__seq[n]; -} - -static size_t reset_and_return_illegal(int _errno, mbstate_t* ps) { - errno = _errno; - *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; - return ERR_ILLEGAL_SEQUENCE; -} - -static size_t reset_and_return(int _return, mbstate_t* ps) { - *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; - return _return; -} - - int mbsinit(const mbstate_t* ps) { return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0)); } @@ -88,104 +61,8 @@ size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) { static mbstate_t __private_state; mbstate_t* state = (ps == NULL) ? &__private_state : ps; - // We should never get to a state which has all 4 bytes of the sequence set. - // Full state verification is done when decoding the sequence (after we have - // all the bytes). - if (mbstate_get_byte(state, 3) != 0) { - return reset_and_return_illegal(EINVAL, state); - } - - if (s == NULL) { - s = ""; - n = 1; - pwc = NULL; - } - - if (n == 0) { - return 0; - } - - uint8_t ch; - if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { - // Fast path for plain ASCII characters. - if (pwc != NULL) { - *pwc = ch; - } - return (ch != '\0' ? 1 : 0); - } - - // Determine the number of octets that make up this character - // from the first octet, and a mask that extracts the - // interesting bits of the first octet. We already know - // the character is at least two bytes long. - size_t length; - int mask; - - // We also specify a lower bound for the character code to - // detect redundant, non-"shortest form" encodings. For - // example, the sequence C0 80 is _not_ a legal representation - // of the null character. This enforces a 1-to-1 mapping - // between character codes and their multibyte representations. - wchar_t lower_bound; - - // The first byte in the state (if any) tells the length. - size_t bytes_so_far = mbstate_bytes_so_far(state); - ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); - if ((ch & 0x80) == 0) { - mask = 0x7f; - length = 1; - lower_bound = 0; - } else if ((ch & 0xe0) == 0xc0) { - mask = 0x1f; - length = 2; - lower_bound = 0x80; - } else if ((ch & 0xf0) == 0xe0) { - mask = 0x0f; - length = 3; - lower_bound = 0x800; - } else if ((ch & 0xf8) == 0xf0) { - mask = 0x07; - length = 4; - lower_bound = 0x10000; - } else { - // Malformed input; input is not UTF-8. See RFC 3629. - return reset_and_return_illegal(EILSEQ, state); - } - - // Fill in the state. - size_t bytes_wanted = length - bytes_so_far; - size_t i; - for (i = 0; i < MIN(bytes_wanted, n); i++) { - if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) { - // Malformed input; bad characters in the middle of a character. - return reset_and_return_illegal(EILSEQ, state); - } - mbstate_set_byte(state, bytes_so_far + i, *s++); - } - if (i < bytes_wanted) { - return ERR_INCOMPLETE_SEQUENCE; - } - - // Decode the octet sequence representing the character in chunks - // of 6 bits, most significant first. - wchar_t wch = mbstate_get_byte(state, 0) & mask; - for (i = 1; i < length; i++) { - wch <<= 6; - wch |= mbstate_get_byte(state, i) & 0x3f; - } - - if (wch < lower_bound) { - // Malformed input; redundant encoding. - return reset_and_return_illegal(EILSEQ, state); - } - if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) { - // Malformed input; invalid code points. - return reset_and_return_illegal(EILSEQ, state); - } - if (pwc != NULL) { - *pwc = wch; - } - return reset_and_return(wch == L'\0' ? 0 : bytes_wanted, state); + // Our wchar_t is UTF-32 + return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state); } size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { @@ -212,10 +89,10 @@ size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstat r = 1; } else { r = mbrtowc(NULL, *src + i, nmc - i, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { return reset_and_return_illegal(EILSEQ, state); } - if (r == ERR_INCOMPLETE_SEQUENCE) { + if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { return reset_and_return_illegal(EILSEQ, state); } if (r == 0) { @@ -246,11 +123,11 @@ size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstat r = 1; } else { r = mbrtowc(dst + o, *src + i, nmc - i, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { *src += i; return reset_and_return_illegal(EILSEQ, state); } - if (r == ERR_INCOMPLETE_SEQUENCE) { + if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { *src += nmc; return reset_and_return(EILSEQ, state); } @@ -272,63 +149,8 @@ size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) { static mbstate_t __private_state; mbstate_t* state = (ps == NULL) ? &__private_state : ps; - if (s == NULL) { - // Equivalent to wcrtomb(buf, L'\0', ps). - return reset_and_return(1, state); - } - - // POSIX states that if wc is a null wide character, a null byte shall be - // stored, preceded by any shift sequence needed to restore the initial shift - // state. Since shift states are not supported, only the null byte is stored. - if (wc == L'\0') { - *s = '\0'; - reset_and_return(1, state); - } - - if (!mbsinit(state)) { - return reset_and_return_illegal(EILSEQ, state); - } - - if ((wc & ~0x7f) == 0) { - // Fast path for plain ASCII characters. - *s = wc; - return 1; - } - - // Determine the number of octets needed to represent this character. - // We always output the shortest sequence possible. Also specify the - // first few bits of the first octet, which contains the information - // about the sequence length. - uint8_t lead; - size_t length; - if ((wc & ~0x7f) == 0) { - lead = 0; - length = 1; - } else if ((wc & ~0x7ff) == 0) { - lead = 0xc0; - length = 2; - } else if ((wc & ~0xffff) == 0) { - lead = 0xe0; - length = 3; - } else if ((wc & ~0x1fffff) == 0) { - lead = 0xf0; - length = 4; - } else { - errno = EILSEQ; - return ERR_ILLEGAL_SEQUENCE; - } - - // Output the octets representing the character in chunks - // of 6 bits, least significant last. The first octet is - // a special case because it contains the sequence length - // information. - for (size_t i = length - 1; i > 0; i--) { - s[i] = (wc & 0x3f) | 0x80; - wc >>= 6; - } - *s = (wc & 0xff) | lead; - - return length; + // Our wchar_t is UTF-32 + return c32rtomb(s, static_cast<char32_t>(wc), state); } size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { @@ -352,7 +174,7 @@ size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstat r = 1; } else { r = wcrtomb(buf, wc, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { return r; } } @@ -373,14 +195,14 @@ size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstat } else if (len - o >= sizeof(buf)) { // Enough space to translate in-place. r = wcrtomb(dst + o, wc, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { *src += i; return r; } } else { // May not be enough space; use temp buffer. r = wcrtomb(buf, wc, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { *src += i; return r; } diff --git a/libc/include/uchar.h b/libc/include/uchar.h new file mode 100644 index 0000000..e1fcb5c --- /dev/null +++ b/libc/include/uchar.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _UCHAR_H_ +#define _UCHAR_H_ + +#include <sys/cdefs.h> +#include <wchar.h> + +__BEGIN_DECLS + +#define __STD_UTF_16__ 1 +#define __STD_UTF_32__ 1 + +size_t c16rtomb(char* __restrict, char16_t, mbstate_t* __restrict); +size_t c32rtomb(char* __restrict, char32_t, mbstate_t* __restrict); +size_t mbrtoc16(char16_t* __restrict, + const char* __restrict, + size_t, + mbstate_t* __restrict); +size_t mbrtoc32(char32_t* __restrict, + const char* __restrict, + size_t, + mbstate_t* __restrict); + +__END_DECLS + +#endif /* _UCHAR_H_ */ diff --git a/libc/private/bionic_mbstate.h b/libc/private/bionic_mbstate.h new file mode 100644 index 0000000..018b47c --- /dev/null +++ b/libc/private/bionic_mbstate.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _BIONIC_MBSTATE_H +#define _BIONIC_MBSTATE_H + +#include <wchar.h> + +__BEGIN_DECLS + +/* + * These return values are specified by POSIX for multibyte conversion + * functions. + */ +#define __MB_ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1) +#define __MB_ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2) + +#define __MB_IS_ERR(rv) (rv == __MB_ERR_ILLEGAL_SEQUENCE || \ + rv == __MB_ERR_INCOMPLETE_SEQUENCE) + +size_t mbstate_bytes_so_far(const mbstate_t* ps); +void mbstate_set_byte(mbstate_t* ps, int i, char byte); +uint8_t mbstate_get_byte(const mbstate_t* ps, int n); +size_t reset_and_return_illegal(int _errno, mbstate_t* ps); +size_t reset_and_return(int _return, mbstate_t* ps); + +__END_DECLS + +#endif // _BIONIC_MBSTATE_H diff --git a/tests/Android.mk b/tests/Android.mk index 811c12a..25f8b2b 100644 --- a/tests/Android.mk +++ b/tests/Android.mk @@ -108,6 +108,7 @@ libBionicStandardTests_src_files := \ sys_vfs_test.cpp \ system_properties_test.cpp \ time_test.cpp \ + uchar_test.cpp \ unistd_test.cpp \ wchar_test.cpp \ diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp new file mode 100644 index 0000000..5f230f0 --- /dev/null +++ b/tests/uchar_test.cpp @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include <sys/cdefs.h> +#if defined(__BIONIC__) +#define HAVE_UCHAR 1 +#elif defined(__GLIBC__) +#include <features.h> +#define HAVE_UCHAR __GLIBC_PREREQ(2, 16) +#endif + +#include <gtest/gtest.h> + +#include <errno.h> +#include <limits.h> +#include <locale.h> +#include <stdint.h> + +#if HAVE_UCHAR +#include <uchar.h> +#endif + +TEST(uchar, sizeof_uchar_t) { +#if HAVE_UCHAR + EXPECT_EQ(2U, sizeof(char16_t)); + EXPECT_EQ(4U, sizeof(char32_t)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, start_state) { +#if HAVE_UCHAR + char out[MB_LEN_MAX]; + mbstate_t ps; + + // Any non-initial state is invalid when calling c32rtomb. + memset(&ps, 0, sizeof(ps)); + EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(NULL, "\xc2", 1, &ps)); + EXPECT_EQ(static_cast<size_t>(-1), c32rtomb(out, 0x00a2, &ps)); + EXPECT_EQ(EILSEQ, errno); + + // If the first argument to c32rtomb is NULL or the second is L'\0' the shift + // state should be reset. + memset(&ps, 0, sizeof(ps)); + EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(NULL, "\xc2", 1, &ps)); + EXPECT_EQ(1U, c32rtomb(NULL, 0x00a2, &ps)); + EXPECT_TRUE(mbsinit(&ps)); + + memset(&ps, 0, sizeof(ps)); + EXPECT_EQ(static_cast<size_t>(-2), mbrtoc32(NULL, "\xf0\xa4", 1, &ps)); + EXPECT_EQ(1U, c32rtomb(out, L'\0', &ps)); + EXPECT_TRUE(mbsinit(&ps)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, c16rtomb_null_out) { +#if HAVE_UCHAR + EXPECT_EQ(1U, c16rtomb(NULL, L'\0', NULL)); + EXPECT_EQ(1U, c16rtomb(NULL, L'h', NULL)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, c16rtomb_null_char) { +#if HAVE_UCHAR + char bytes[MB_LEN_MAX]; + EXPECT_EQ(1U, c16rtomb(bytes, L'\0', NULL)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, c16rtomb) { +#if HAVE_UCHAR + char bytes[MB_LEN_MAX]; + + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(1U, c16rtomb(bytes, L'h', NULL)); + EXPECT_EQ('h', bytes[0]); + + ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8")); + uselocale(LC_GLOBAL_LOCALE); + + // 1-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(1U, c16rtomb(bytes, L'h', NULL)); + EXPECT_EQ('h', bytes[0]); + // 2-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(2U, c16rtomb(bytes, 0x00a2, NULL)); + EXPECT_EQ('\xc2', bytes[0]); + EXPECT_EQ('\xa2', bytes[1]); + // 3-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(3U, c16rtomb(bytes, 0x20ac, NULL)); + EXPECT_EQ('\xe2', bytes[0]); + EXPECT_EQ('\x82', bytes[1]); + EXPECT_EQ('\xac', bytes[2]); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, c16rtomb_surrogate) { +#if HAVE_UCHAR + char bytes[MB_LEN_MAX]; + + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(0U, c16rtomb(bytes, 0xdbea, NULL)); + EXPECT_EQ(4U, c16rtomb(bytes, 0xdfcd, NULL)); + EXPECT_EQ('\xf4', bytes[0]); + EXPECT_EQ('\x8a', bytes[1]); + EXPECT_EQ('\xaf', bytes[2]); + EXPECT_EQ('\x8d', bytes[3]); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, c16rtomb_invalid) { +#if HAVE_UCHAR + char bytes[MB_LEN_MAX]; + + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(static_cast<size_t>(-1), c16rtomb(bytes, 0xdfcd, NULL)); + + EXPECT_EQ(0U, c16rtomb(bytes, 0xdbea, NULL)); + EXPECT_EQ(static_cast<size_t>(-1), c16rtomb(bytes, 0xdbea, NULL)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc16_null) { +#if HAVE_UCHAR + ASSERT_EQ(0U, mbrtoc16(NULL, NULL, 0, NULL)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc16_zero_len) { +#if HAVE_UCHAR + char16_t out; + + out = 'x'; + ASSERT_EQ(0U, mbrtoc16(&out, "hello", 0, NULL)); + ASSERT_EQ('x', out); + + ASSERT_EQ(0U, mbrtoc16(&out, "hello", 0, NULL)); + ASSERT_EQ(0U, mbrtoc16(&out, "", 0, NULL)); + ASSERT_EQ(1U, mbrtoc16(&out, "hello", 1, NULL)); + ASSERT_EQ(L'h', out); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc16) { +#if HAVE_UCHAR + char16_t out; + + ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8")); + uselocale(LC_GLOBAL_LOCALE); + + // 1-byte UTF-8. + ASSERT_EQ(1U, mbrtoc16(&out, "abcdef", 6, NULL)); + ASSERT_EQ(L'a', out); + // 2-byte UTF-8. + ASSERT_EQ(2U, mbrtoc16(&out, "\xc2\xa2" "cdef", 6, NULL)); + ASSERT_EQ(static_cast<char16_t>(0x00a2), out); + // 3-byte UTF-8. + ASSERT_EQ(3U, mbrtoc16(&out, "\xe2\x82\xac" "def", 6, NULL)); + ASSERT_EQ(static_cast<char16_t>(0x20ac), out); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc16_surrogate) { +#if HAVE_UCHAR + char16_t out; + + ASSERT_EQ(static_cast<size_t>(-3), + mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, NULL)); + ASSERT_EQ(static_cast<char16_t>(0xdbea), out); + ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d" "ef", 6, NULL)); + ASSERT_EQ(static_cast<char16_t>(0xdfcd), out); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc16_reserved_range) { +#if HAVE_UCHAR + char16_t out; + ASSERT_EQ(static_cast<size_t>(-1), + mbrtoc16(&out, "\xf0\x80\xbf\xbf", 6, NULL)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc16_beyond_range) { +#if HAVE_UCHAR + char16_t out; + ASSERT_EQ(static_cast<size_t>(-1), + mbrtoc16(&out, "\xf5\x80\x80\x80", 6, NULL)); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +#if HAVE_UCHAR +void test_mbrtoc16_incomplete(mbstate_t* ps) { + ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8")); + uselocale(LC_GLOBAL_LOCALE); + + char16_t out; + // 2-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xc2", 1, ps)); + ASSERT_EQ(1U, mbrtoc16(&out, "\xa2" "cdef", 5, ps)); + ASSERT_EQ(static_cast<char16_t>(0x00a2), out); + ASSERT_TRUE(mbsinit(ps)); + // 3-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xe2", 1, ps)); + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\x82", 1, ps)); + ASSERT_EQ(1U, mbrtoc16(&out, "\xac" "def", 4, ps)); + ASSERT_EQ(static_cast<char16_t>(0x20ac), out); + ASSERT_TRUE(mbsinit(ps)); + // 4-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xf4", 1, ps)); + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\x8a\xaf", 2, ps)); + ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out, "\x8d" "ef", 3, ps)); + ASSERT_EQ(static_cast<char16_t>(0xdbea), out); + ASSERT_EQ(1U, mbrtoc16(&out, "\x80" "ef", 3, ps)); + ASSERT_EQ(static_cast<char16_t>(0xdfcd), out); + ASSERT_TRUE(mbsinit(ps)); + + // Invalid 2-byte + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xc2", 1, ps)); + ASSERT_EQ(static_cast<size_t>(-1), mbrtoc16(&out, "\x20" "cdef", 5, ps)); + ASSERT_EQ(EILSEQ, errno); +} +#endif + +TEST(uchar, mbrtoc16_incomplete) { +#if HAVE_UCHAR + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + + test_mbrtoc16_incomplete(&ps); + test_mbrtoc16_incomplete(NULL); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, c32rtomb) { +#if HAVE_UCHAR + EXPECT_EQ(1U, c32rtomb(NULL, L'\0', NULL)); + EXPECT_EQ(1U, c32rtomb(NULL, L'h', NULL)); + + char bytes[MB_LEN_MAX]; + + EXPECT_EQ(1U, c32rtomb(bytes, L'\0', NULL)); + + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(1U, c32rtomb(bytes, L'h', NULL)); + EXPECT_EQ('h', bytes[0]); + + ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8")); + uselocale(LC_GLOBAL_LOCALE); + + // 1-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(1U, c32rtomb(bytes, L'h', NULL)); + EXPECT_EQ('h', bytes[0]); + // 2-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(2U, c32rtomb(bytes, 0x00a2, NULL)); + EXPECT_EQ('\xc2', bytes[0]); + EXPECT_EQ('\xa2', bytes[1]); + // 3-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(3U, c32rtomb(bytes, 0x20ac, NULL)); + EXPECT_EQ('\xe2', bytes[0]); + EXPECT_EQ('\x82', bytes[1]); + EXPECT_EQ('\xac', bytes[2]); + // 4-byte UTF-8. + memset(bytes, 0, sizeof(bytes)); + EXPECT_EQ(4U, c32rtomb(bytes, 0x24b62, NULL)); + EXPECT_EQ('\xf0', bytes[0]); + EXPECT_EQ('\xa4', bytes[1]); + EXPECT_EQ('\xad', bytes[2]); + EXPECT_EQ('\xa2', bytes[3]); + // Invalid code point. + EXPECT_EQ(static_cast<size_t>(-1), c32rtomb(bytes, 0xffffffff, NULL)); + EXPECT_EQ(EILSEQ, errno); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +TEST(uchar, mbrtoc32) { +#if HAVE_UCHAR + char32_t out[8]; + + out[0] = 'x'; + ASSERT_EQ(0U, mbrtoc32(out, "hello", 0, NULL)); + ASSERT_EQ('x', out[0]); + + ASSERT_EQ(0U, mbrtoc32(out, "hello", 0, NULL)); + ASSERT_EQ(0U, mbrtoc32(out, "", 0, NULL)); + ASSERT_EQ(1U, mbrtoc32(out, "hello", 1, NULL)); + ASSERT_EQ(L'h', out[0]); + + ASSERT_EQ(0U, mbrtoc32(NULL, "hello", 0, NULL)); + ASSERT_EQ(0U, mbrtoc32(NULL, "", 0, NULL)); + ASSERT_EQ(1U, mbrtoc32(NULL, "hello", 1, NULL)); + + ASSERT_EQ(0U, mbrtoc32(NULL, NULL, 0, NULL)); + + ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8")); + uselocale(LC_GLOBAL_LOCALE); + + // 1-byte UTF-8. + ASSERT_EQ(1U, mbrtoc32(out, "abcdef", 6, NULL)); + ASSERT_EQ(L'a', out[0]); + // 2-byte UTF-8. + ASSERT_EQ(2U, mbrtoc32(out, "\xc2\xa2" "cdef", 6, NULL)); + ASSERT_EQ(static_cast<char32_t>(0x00a2), out[0]); + // 3-byte UTF-8. + ASSERT_EQ(3U, mbrtoc32(out, "\xe2\x82\xac" "def", 6, NULL)); + ASSERT_EQ(static_cast<char32_t>(0x20ac), out[0]); + // 4-byte UTF-8. + ASSERT_EQ(4U, mbrtoc32(out, "\xf0\xa4\xad\xa2" "ef", 6, NULL)); + ASSERT_EQ(static_cast<char32_t>(0x24b62), out[0]); +#if defined(__BIONIC__) // glibc allows this. + // Illegal 5-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-1), mbrtoc32(out, "\xf8\xa1\xa2\xa3\xa4" "f", 6, NULL)); + ASSERT_EQ(EILSEQ, errno); +#endif + // Illegal over-long sequence. + ASSERT_EQ(static_cast<size_t>(-1), mbrtoc32(out, "\xf0\x82\x82\xac" "ef", 6, NULL)); + ASSERT_EQ(EILSEQ, errno); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + +#if HAVE_UCHAR +void test_mbrtoc32_incomplete(mbstate_t* ps) { + ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8")); + uselocale(LC_GLOBAL_LOCALE); + + char32_t out; + // 2-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc32(&out, "\xc2", 1, ps)); + ASSERT_EQ(1U, mbrtoc32(&out, "\xa2" "cdef", 5, ps)); + ASSERT_EQ(static_cast<char32_t>(0x00a2), out); + ASSERT_TRUE(mbsinit(ps)); + // 3-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc32(&out, "\xe2", 1, ps)); + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc32(&out, "\x82", 1, ps)); + ASSERT_EQ(1U, mbrtoc32(&out, "\xac" "def", 4, ps)); + ASSERT_EQ(static_cast<char32_t>(0x20ac), out); + ASSERT_TRUE(mbsinit(ps)); + // 4-byte UTF-8. + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc32(&out, "\xf0", 1, ps)); + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc32(&out, "\xa4\xad", 2, ps)); + ASSERT_EQ(1U, mbrtoc32(&out, "\xa2" "ef", 3, ps)); + ASSERT_EQ(static_cast<char32_t>(0x24b62), out); + ASSERT_TRUE(mbsinit(ps)); + + // Invalid 2-byte + ASSERT_EQ(static_cast<size_t>(-2), mbrtoc32(&out, "\xc2", 1, ps)); + ASSERT_EQ(static_cast<size_t>(-1), mbrtoc32(&out, "\x20" "cdef", 5, ps)); + ASSERT_EQ(EILSEQ, errno); +} +#endif + +TEST(uchar, mbrtoc32_incomplete) { +#if HAVE_UCHAR + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + + test_mbrtoc32_incomplete(&ps); + test_mbrtoc32_incomplete(NULL); +#else + GTEST_LOG_(INFO) << "uchar.h is unavailable.\n"; +#endif +} + |