diff options
author | Narayan Kamath <narayan@google.com> | 2015-01-29 20:06:46 +0000 |
---|---|---|
committer | Narayan Kamath <narayan@google.com> | 2015-02-12 11:54:37 +0000 |
commit | a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch) | |
tree | 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime | |
parent | 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff) | |
download | art-a5afcfc73141e5e378d79a326d02c5c2039fb025.zip art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.gz art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.bz2 |
Be more lenient with 4 byte UTF-8 sequences.
Accept 4-byte UTF-8 sequences and convert them into surrogate
pairs, rather than accepting only the form in which each half of
a surrogate pair is encoded as its own 3-byte sequence. The
3-byte form continues to be accepted.
Note that in addition to supporting 4-byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency; we make no claim of official
support for them there.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime')
-rw-r--r-- | runtime/check_jni.cc | 30 | ||||
-rw-r--r-- | runtime/jni_internal_test.cc | 31 | ||||
-rw-r--r-- | runtime/mirror/object_test.cc | 8 | ||||
-rw-r--r-- | runtime/mirror/string-inl.h | 18 | ||||
-rw-r--r-- | runtime/mirror/string.cc | 46 | ||||
-rw-r--r-- | runtime/mirror/string.h | 12 | ||||
-rw-r--r-- | runtime/utf-inl.h | 100 | ||||
-rw-r--r-- | runtime/utf.cc | 44 | ||||
-rw-r--r-- | runtime/utf.h | 22 | ||||
-rw-r--r-- | runtime/utf_test.cc | 113 | ||||
-rw-r--r-- | runtime/utils.cc | 101 |
11 files changed, 393 insertions, 132 deletions
diff --git a/runtime/check_jni.cc b/runtime/check_jni.cc index e45d3a3..6ec0949 100644 --- a/runtime/check_jni.cc +++ b/runtime/check_jni.cc @@ -1095,6 +1095,8 @@ class ScopedCheck { return true; } + // Checks whether |bytes| is valid modified UTF-8. We also accept 4 byte UTF + // sequences in place of encoded surrogate pairs. static uint8_t CheckUtfBytes(const char* bytes, const char** errorKind) { while (*bytes != '\0') { uint8_t utf8 = *(bytes++); @@ -1114,14 +1116,26 @@ class ScopedCheck { case 0x09: case 0x0a: case 0x0b: - case 0x0f: - /* - * Bit pattern 10xx or 1111, which are illegal start bytes. - * Note: 1111 is valid for normal UTF-8, but not the - * Modified UTF-8 used here. - */ + // Bit patterns 10xx, which are illegal start bytes. *errorKind = "start"; return utf8; + case 0x0f: + // Bit pattern 1111, which might be the start of a 4 byte sequence. + if ((utf8 & 0x08) == 0) { + // Bit pattern 1111 0xxx, which is the start of a 4 byte sequence. + // We consume one continuation byte here, and fall through to consume two more. + utf8 = *(bytes++); + if ((utf8 & 0xc0) != 0x80) { + *errorKind = "continuation"; + return utf8; + } + } else { + *errorKind = "start"; + return utf8; + } + + // Fall through to the cases below to consume two more continuation bytes. + FALLTHROUGH_INTENDED; case 0x0e: // Bit pattern 1110, so there are two additional bytes. utf8 = *(bytes++); @@ -1129,7 +1143,9 @@ class ScopedCheck { *errorKind = "continuation"; return utf8; } - FALLTHROUGH_INTENDED; // Fall-through to take care of the final byte. + + // Fall through to consume one more continuation byte. + FALLTHROUGH_INTENDED; case 0x0c: case 0x0d: // Bit pattern 110x, so there is one additional byte. 
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc index 906aa4c..1048214 100644 --- a/runtime/jni_internal_test.cc +++ b/runtime/jni_internal_test.cc @@ -1351,7 +1351,36 @@ TEST_F(JniInternalTest, NewStringUTF) { EXPECT_EQ(5, env_->GetStringLength(s)); EXPECT_EQ(5, env_->GetStringUTFLength(s)); - // TODO: check some non-ASCII strings. + // Encoded surrogate pair. + s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80"); + EXPECT_NE(s, nullptr); + EXPECT_EQ(2, env_->GetStringLength(s)); + // Note that this uses 2 x 3 byte UTF sequences, one + // for each half of the surrogate pair. + EXPECT_EQ(6, env_->GetStringUTFLength(s)); + const char* chars = env_->GetStringUTFChars(s, nullptr); + EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars); + env_->ReleaseStringUTFChars(s, chars); + + // 4 byte UTF sequence appended to an encoded surrogate pair. + s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80 \xf0\x9f\x8f\xa0"); + EXPECT_NE(s, nullptr); + EXPECT_EQ(5, env_->GetStringLength(s)); + EXPECT_EQ(13, env_->GetStringUTFLength(s)); + chars = env_->GetStringUTFChars(s, nullptr); + // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate + // pair {0xd83c, 0xdfe0} which is then converted into a two three byte + // sequences {0xed 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of + // the surrogate pair. 
+ EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars); + env_->ReleaseStringUTFChars(s, chars); + + // A string with 1, 2, 3 and 4 byte UTF sequences with spaces + // between them + s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0"); + EXPECT_NE(s, nullptr); + EXPECT_EQ(8, env_->GetStringLength(s)); + EXPECT_EQ(15, env_->GetStringUTFLength(s)); } TEST_F(JniInternalTest, NewString) { diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc index fb42d28..9b345a6 100644 --- a/runtime/mirror/object_test.cc +++ b/runtime/mirror/object_test.cc @@ -67,7 +67,7 @@ class ObjectTest : public CommonRuntimeTest { ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0)); ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) || (expected_utf16_length == 1 && strlen(utf8_in) == 0)); for (int32_t i = 0; i < expected_utf16_length; i++) { - EXPECT_EQ(utf16_expected[i], string->CharAt(i)); + EXPECT_EQ(utf16_expected[i], string->UncheckedCharAt(i)); } EXPECT_EQ(expected_hash, string->GetHashCode()); } @@ -424,6 +424,12 @@ TEST_F(ObjectTest, String) { AssertString(1, "\xe1\x88\xb4", "\x12\x34", 0x1234); AssertString(1, "\xef\xbf\xbf", "\xff\xff", 0xffff); AssertString(3, "h\xe1\x88\xb4i", "\x00\x68\x12\x34\x00\x69", (31 * ((31 * 0x68) + 0x1234)) + 0x69); + + // Test four-byte characters. 
+ AssertString(2, "\xf0\x9f\x8f\xa0", "\xd8\x3c\xdf\xe0", (31 * 0xd83c) + 0xdfe0); + AssertString(2, "\xf0\x9f\x9a\x80", "\xd8\x3d\xde\x80", (31 * 0xd83d) + 0xde80); + AssertString(4, "h\xf0\x9f\x9a\x80i", "\x00\x68\xd8\x3d\xde\x80\x00\x69", + (31 * (31 * (31 * 0x68 + 0xd83d) + 0xde80) + 0x69)); } TEST_F(ObjectTest, StringEqualsUtf8) { diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h index 14d7de2..4a95519 100644 --- a/runtime/mirror/string-inl.h +++ b/runtime/mirror/string-inl.h @@ -33,6 +33,10 @@ inline uint32_t String::ClassSize() { return Class::ComputeClassSize(true, vtable_entries, 0, 1, 0, 1, 2); } +inline uint16_t String::UncheckedCharAt(int32_t index) { + return GetCharArray()->Get(index + GetOffset()); +} + inline CharArray* String::GetCharArray() { return GetFieldObject<CharArray>(ValueOffset()); } @@ -54,20 +58,6 @@ inline String* String::Intern() { return Runtime::Current()->GetInternTable()->InternWeak(this); } -inline uint16_t String::CharAt(int32_t index) { - // TODO: do we need this? Equals is the only caller, and could - // bounds check itself. - DCHECK_GE(count_, 0); // ensures the unsigned comparison is safe. 
- if (UNLIKELY(static_cast<uint32_t>(index) >= static_cast<uint32_t>(count_))) { - Thread* self = Thread::Current(); - ThrowLocation throw_location = self->GetCurrentLocationForThrow(); - self->ThrowNewExceptionF(throw_location, "Ljava/lang/StringIndexOutOfBoundsException;", - "length=%i; index=%i", count_, index); - return 0; - } - return GetCharArray()->Get(index + GetOffset()); -} - inline int32_t String::GetHashCode() { int32_t result = GetField32(OFFSET_OF_OBJECT_MEMBER(String, hash_code_)); if (UNLIKELY(result == 0)) { diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc index e199d0e..e7c88c5 100644 --- a/runtime/mirror/string.cc +++ b/runtime/mirror/string.cc @@ -147,7 +147,7 @@ bool String::Equals(String* that) { // Note: don't short circuit on hash code as we're presumably here as the // hash code was already equal for (int32_t i = 0; i < that->GetLength(); ++i) { - if (this->CharAt(i) != that->CharAt(i)) { + if (this->UncheckedCharAt(i) != that->UncheckedCharAt(i)) { return false; } } @@ -160,7 +160,7 @@ bool String::Equals(const uint16_t* that_chars, int32_t that_offset, int32_t tha return false; } else { for (int32_t i = 0; i < that_length; ++i) { - if (this->CharAt(i) != that_chars[that_offset + i]) { + if (this->UncheckedCharAt(i) != that_chars[that_offset + i]) { return false; } } @@ -169,22 +169,52 @@ bool String::Equals(const uint16_t* that_chars, int32_t that_offset, int32_t tha } bool String::Equals(const char* modified_utf8) { - for (int32_t i = 0; i < GetLength(); ++i) { - uint16_t ch = GetUtf16FromUtf8(&modified_utf8); - if (ch == '\0' || ch != CharAt(i)) { + const int32_t length = GetLength(); + int32_t i = 0; + while (i < length) { + const uint32_t ch = GetUtf16FromUtf8(&modified_utf8); + if (ch == '\0') { return false; } + + if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i++)) { + return false; + } + + const uint16_t trailing = GetTrailingUtf16Char(ch); + if (trailing != 0) { + if (i == length) { + return false; + } + + if 
(UncheckedCharAt(i++) != trailing) { + return false; + } + } } return *modified_utf8 == '\0'; } bool String::Equals(const StringPiece& modified_utf8) { + const int32_t length = GetLength(); const char* p = modified_utf8.data(); - for (int32_t i = 0; i < GetLength(); ++i) { - uint16_t ch = GetUtf16FromUtf8(&p); - if (ch != CharAt(i)) { + for (int32_t i = 0; i < length; ++i) { + uint32_t ch = GetUtf16FromUtf8(&p); + + if (GetLeadingUtf16Char(ch) != UncheckedCharAt(i)) { return false; } + + const uint16_t trailing = GetTrailingUtf16Char(ch); + if (trailing != 0) { + if (i == (length - 1)) { + return false; + } + + if (UncheckedCharAt(++i) != trailing) { + return false; + } + } } return true; } diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h index 30b8aa3..6c22b9b 100644 --- a/runtime/mirror/string.h +++ b/runtime/mirror/string.h @@ -69,8 +69,6 @@ class MANAGED String FINAL : public Object { int32_t GetUtfLength() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - uint16_t CharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - String* Intern() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); static String* AllocFromUtf16(Thread* self, @@ -86,9 +84,14 @@ class MANAGED String FINAL : public Object { const char* utf8_data_in) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + // TODO: This is only used in the interpreter to compare against + // entries from a dex files constant pool (ArtField names). Should + // we unify this with Equals(const StringPiece&); ? bool Equals(const char* modified_utf8) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - // TODO: do we need this overload? give it a more intention-revealing name. + // TODO: This is only used to compare DexCache.location with + // a dex_file's location (which is an std::string). Do we really + // need this in mirror::String just for that one usage ? 
bool Equals(const StringPiece& modified_utf8) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -127,6 +130,9 @@ class MANAGED String FINAL : public Object { static void VisitRoots(RootCallback* callback, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + // TODO: Make this private. It's only used on ObjectTest at the moment. + uint16_t UncheckedCharAt(int32_t index) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + private: void SetHashCode(int32_t new_hash_code) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { // Hash code is invariant so use non-transactional mode. Also disable check as we may run inside diff --git a/runtime/utf-inl.h b/runtime/utf-inl.h index 1373d17..b2d6765 100644 --- a/runtime/utf-inl.h +++ b/runtime/utf-inl.h @@ -21,26 +21,57 @@ namespace art { -inline uint16_t GetUtf16FromUtf8(const char** utf8_data_in) { - uint8_t one = *(*utf8_data_in)++; +inline uint16_t GetTrailingUtf16Char(uint32_t maybe_pair) { + return static_cast<uint16_t>(maybe_pair >> 16); +} + +inline uint16_t GetLeadingUtf16Char(uint32_t maybe_pair) { + return static_cast<uint16_t>(maybe_pair & 0x0000FFFF); +} + +inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) { + const uint8_t one = *(*utf8_data_in)++; if ((one & 0x80) == 0) { // one-byte encoding return one; } - // two- or three-byte encoding - uint8_t two = *(*utf8_data_in)++; + + const uint8_t two = *(*utf8_data_in)++; if ((one & 0x20) == 0) { // two-byte encoding return ((one & 0x1f) << 6) | (two & 0x3f); } - // three-byte encoding - uint8_t three = *(*utf8_data_in)++; - return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); + + const uint8_t three = *(*utf8_data_in)++; + if ((one & 0x10) == 0) { + return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); + } + + // Four byte encodings need special handling. We'll have + // to convert them into a surrogate pair. 
+ const uint8_t four = *(*utf8_data_in)++; + + // Since this is a 4 byte UTF-8 sequence, it will lie between + // U+10000 and U+1FFFFF. + // + // TODO: What do we do about values in (U+10FFFF, U+1FFFFF) ? The + // spec says they're invalid but nobody appears to check for them. + const uint32_t code_point = ((one & 0x0f) << 18) | ((two & 0x3f) << 12) + | ((three & 0x3f) << 6) | (four & 0x3f); + + uint32_t surrogate_pair = 0; + // Step two: Write out the high (leading) surrogate to the bottom 16 bits + // of the of the 32 bit type. + surrogate_pair |= ((code_point >> 10) + 0xd7c0) & 0xffff; + // Step three : Write out the low (trailing) surrogate to the top 16 bits. + surrogate_pair |= ((code_point & 0x03ff) + 0xdc00) << 16; + + return surrogate_pair; } inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1, const char* utf8_2) { - uint16_t c1, c2; + uint32_t c1, c2; do { c1 = *utf8_1; c2 = *utf8_2; @@ -50,50 +81,17 @@ inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* u } else if (c2 == 0) { return 1; } - // Assume 1-byte value and handle all cases first. - utf8_1++; - utf8_2++; - if ((c1 & 0x80) == 0) { - if (c1 == c2) { - // Matching 1-byte values. - continue; - } else { - // Non-matching values. - if ((c2 & 0x80) == 0) { - // 1-byte value, do nothing. - } else if ((c2 & 0x20) == 0) { - // 2-byte value. - c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f); - } else { - // 3-byte value. - c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f); - } - return static_cast<int>(c1) - static_cast<int>(c2); - } - } - // Non-matching or multi-byte values. - if ((c1 & 0x20) == 0) { - // 2-byte value. - c1 = ((c1 & 0x1f) << 6) | (*utf8_1 & 0x3f); - utf8_1++; - } else { - // 3-byte value. - c1 = ((c1 & 0x0f) << 12) | ((utf8_1[0] & 0x3f) << 6) | (utf8_1[1] & 0x3f); - utf8_1 += 2; - } - if ((c2 & 0x80) == 0) { - // 1-byte value, do nothing. - } else if ((c2 & 0x20) == 0) { - // 2-byte value. 
- c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f); - utf8_2++; - } else { - // 3-byte value. - c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f); - utf8_2 += 2; - } + + c1 = GetUtf16FromUtf8(&utf8_1); + c2 = GetUtf16FromUtf8(&utf8_2); } while (c1 == c2); - return static_cast<int>(c1) - static_cast<int>(c2); + + const uint32_t leading_surrogate_diff = GetLeadingUtf16Char(c1) - GetLeadingUtf16Char(c2); + if (leading_surrogate_diff != 0) { + return static_cast<int>(leading_surrogate_diff); + } + + return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2); } } // namespace art diff --git a/runtime/utf.cc b/runtime/utf.cc index 7ff296b..39c8d15 100644 --- a/runtime/utf.cc +++ b/runtime/utf.cc @@ -38,15 +38,30 @@ size_t CountModifiedUtf8Chars(const char* utf8) { // two-byte encoding continue; } - // three-byte encoding utf8++; + if ((ic & 0x10) == 0) { + // three-byte encoding + continue; + } + + // four-byte encoding: needs to be converted into a surrogate + // pair. + utf8++; + len++; } return len; } void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) { while (*utf8_data_in != '\0') { - *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in); + const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in); + const uint16_t leading = GetLeadingUtf16Char(ch); + const uint16_t trailing = GetTrailingUtf16Char(ch); + + *utf16_data_out++ = leading; + if (trailing != 0) { + *utf16_data_out++ = trailing; + } } } @@ -102,12 +117,29 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t return 1; } - int c1 = GetUtf16FromUtf8(&utf8); - int c2 = *utf16++; + const uint32_t pair = GetUtf16FromUtf8(&utf8); + + // First compare the leading utf16 char. + const uint16_t lhs = GetLeadingUtf16Char(pair); + const uint16_t rhs = *utf16++; --utf16_length; + if (lhs != rhs) { + return lhs > rhs ? 1 : -1; + } - if (c1 != c2) { - return c1 > c2 ? 1 : -1; + // Then compare the trailing utf16 char. 
First check if there + // are any characters left to consume. + const uint16_t lhs2 = GetTrailingUtf16Char(pair); + if (lhs2 != 0) { + if (utf16_length == 0) { + return 1; + } + + const uint16_t rhs2 = *utf16++; + --utf16_length; + if (lhs2 != rhs2) { + return lhs2 > rhs2 ? 1 : -1; + } } } } diff --git a/runtime/utf.h b/runtime/utf.h index 3ee07fe..dd38afa 100644 --- a/runtime/utf.h +++ b/runtime/utf.h @@ -85,12 +85,16 @@ int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count); size_t ComputeModifiedUtf8Hash(const char* chars); /* - * Retrieve the next UTF-16 character from a UTF-8 string. + * Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string. + * single byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16 + * character whereas 4-byte UTF-8 sequences result in a surrogate pair. Use + * GetLeadingUtf16Char and GetTrailingUtf16Char to process the return value + * of this function. * * Advances "*utf8_data_in" to the start of the next character. * * WARNING: If a string is corrupted by dropping a '\0' in the middle - * of a 3-byte sequence, you can end up overrunning the buffer with + * of a multi byte sequence, you can end up overrunning the buffer with * reads (and possibly with the writes if the length was computed and * cached before the damage). For performance reasons, this function * assumes that the string being parsed is known to be valid (e.g., by @@ -98,7 +102,19 @@ size_t ComputeModifiedUtf8Hash(const char* chars); * out of dex files or other internal translations, so the only real * risk comes from the JNI NewStringUTF call. */ -uint16_t GetUtf16FromUtf8(const char** utf8_data_in); +uint32_t GetUtf16FromUtf8(const char** utf8_data_in); + +/** + * Gets the leading UTF-16 character from a surrogate pair, or the sole + * UTF-16 character from the return value of GetUtf16FromUtf8. 
+ */ +ALWAYS_INLINE uint16_t GetLeadingUtf16Char(uint32_t maybe_pair); + +/** + * Gets the trailing UTF-16 character from a surrogate pair, or 0 otherwise + * from the return value of GetUtf16FromUtf8. + */ +ALWAYS_INLINE uint16_t GetTrailingUtf16Char(uint32_t maybe_pair); } // namespace art diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc new file mode 100644 index 0000000..8048bbd --- /dev/null +++ b/runtime/utf_test.cc @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utf.h" + +#include "common_runtime_test.h" +#include "utf-inl.h" + +namespace art { + +class UtfTest : public CommonRuntimeTest {}; + +TEST_F(UtfTest, GetLeadingUtf16Char) { + EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff)); +} + +TEST_F(UtfTest, GetTrailingUtf16Char) { + EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee)); + EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa)); +} + +#define EXPECT_ARRAY_POSITION(expected, end, start) \ + EXPECT_EQ(static_cast<uintptr_t>(expected), \ + reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start)); + +// A test string containing one, two, three and four byte UTF-8 sequences. 
+static const uint8_t kAllSequences[] = { + 0x24, + 0xc2, 0xa2, + 0xe2, 0x82, 0xac, + 0xf0, 0x9f, 0x8f, 0xa0, + 0x00 +}; + +// A test string that contains a UTF-8 encoding of a surrogate pair +// (code point = U+10400) +static const uint8_t kSurrogateEncoding[] = { + 0xed, 0xa0, 0x81, + 0xed, 0xb0, 0x80, + 0x00 +}; + +TEST_F(UtfTest, GetUtf16FromUtf8) { + const char* const start = reinterpret_cast<const char*>(kAllSequences); + const char* ptr = start; + uint32_t pair = 0; + + // Single byte sequence. + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0x24, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(1, ptr, start); + + // Two byte sequence + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(3, ptr, start); + + // Three byte sequence + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(6, ptr, start); + + // Four byte sequence + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(10, ptr, start); + + // Null terminator + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(11, ptr, start); +} + +TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) { + const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding); + const char* ptr = start; + uint32_t pair = 0; + + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(3, ptr, start); + + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(6, ptr, start); +} + +TEST_F(UtfTest, 
CountModifiedUtf8Chars) { + EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences))); + EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding))); +} + +} // namespace art diff --git a/runtime/utils.cc b/runtime/utils.cc index af16d7e..3ec9561 100644 --- a/runtime/utils.cc +++ b/runtime/utils.cc @@ -625,7 +625,7 @@ std::string PrintableString(const char* utf) { const char* p = utf; size_t char_count = CountModifiedUtf8Chars(p); for (size_t i = 0; i < char_count; ++i) { - uint16_t ch = GetUtf16FromUtf8(&p); + uint32_t ch = GetUtf16FromUtf8(&p); if (ch == '\\') { result += "\\\\"; } else if (ch == '\n') { @@ -634,10 +634,20 @@ std::string PrintableString(const char* utf) { result += "\\r"; } else if (ch == '\t') { result += "\\t"; - } else if (NeedsEscaping(ch)) { - StringAppendF(&result, "\\u%04x", ch); } else { - result += ch; + const uint16_t leading = GetLeadingUtf16Char(ch); + + if (NeedsEscaping(leading)) { + StringAppendF(&result, "\\u%04x", leading); + } else { + result += leading; + } + + const uint32_t trailing = GetTrailingUtf16Char(ch); + if (trailing != 0) { + // All high surrogates will need escaping. + StringAppendF(&result, "\\u%04x", trailing); + } } } result += '"'; @@ -650,7 +660,7 @@ std::string MangleForJni(const std::string& s) { size_t char_count = CountModifiedUtf8Chars(s.c_str()); const char* cp = &s[0]; for (size_t i = 0; i < char_count; ++i) { - uint16_t ch = GetUtf16FromUtf8(&cp); + uint32_t ch = GetUtf16FromUtf8(&cp); if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) { result.push_back(ch); } else if (ch == '.' 
|| ch == '/') { @@ -662,7 +672,13 @@ std::string MangleForJni(const std::string& s) { } else if (ch == '[') { result += "_3"; } else { - StringAppendF(&result, "_0%04x", ch); + const uint16_t leading = GetLeadingUtf16Char(ch); + const uint32_t trailing = GetTrailingUtf16Char(ch); + + StringAppendF(&result, "_0%04x", leading); + if (trailing != 0) { + StringAppendF(&result, "_0%04x", trailing); + } } } return result; @@ -757,41 +773,50 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { * document. */ - uint16_t utf16 = GetUtf16FromUtf8(pUtf8Ptr); - - // Perform follow-up tests based on the high 8 bits. - switch (utf16 >> 8) { - case 0x00: - // It's only valid if it's above the ISO-8859-1 high space (0xa0). - return (utf16 > 0x00a0); - case 0xd8: - case 0xd9: - case 0xda: - case 0xdb: - // It's a leading surrogate. Check to see that a trailing - // surrogate follows. - utf16 = GetUtf16FromUtf8(pUtf8Ptr); - return (utf16 >= 0xdc00) && (utf16 <= 0xdfff); - case 0xdc: - case 0xdd: - case 0xde: - case 0xdf: - // It's a trailing surrogate, which is not valid at this point. - return false; - case 0x20: - case 0xff: - // It's in the range that has spaces, controls, and specials. - switch (utf16 & 0xfff8) { - case 0x2000: - case 0x2008: - case 0x2028: - case 0xfff0: - case 0xfff8: + const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr); + + const uint16_t leading = GetLeadingUtf16Char(pair); + const uint32_t trailing = GetTrailingUtf16Char(pair); + + if (trailing == 0) { + // Perform follow-up tests based on the high 8 bits of the + // lower surrogate. + switch (leading >> 8) { + case 0x00: + // It's only valid if it's above the ISO-8859-1 high space (0xa0). + return (leading > 0x00a0); + case 0xd8: + case 0xd9: + case 0xda: + case 0xdb: + // It looks like a leading surrogate but we didn't find a trailing + // surrogate if we're here. return false; + case 0xdc: + case 0xdd: + case 0xde: + case 0xdf: + // It's a trailing surrogate, which is not valid at this point. 
+ return false; + case 0x20: + case 0xff: + // It's in the range that has spaces, controls, and specials. + switch (leading & 0xfff8) { + case 0x2000: + case 0x2008: + case 0x2028: + case 0xfff0: + case 0xfff8: + return false; + } + break; } - break; + + return true; } - return true; + + // We have a surrogate pair. Check that trailing surrogate is well formed. + return (trailing >= 0xdc00 && trailing <= 0xdfff); } /* Return whether the pointed-at modified-UTF-8 encoded character is |