diff options
author | Narayan Kamath <narayan@google.com> | 2015-01-29 20:06:46 +0000 |
---|---|---|
committer | Narayan Kamath <narayan@google.com> | 2015-02-12 11:54:37 +0000 |
commit | a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch) | |
tree | 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf_test.cc | |
parent | 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff) | |
download | art-a5afcfc73141e5e378d79a326d02c5c2039fb025.zip art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.gz art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.bz2 |
Be more lenient with 4 byte UTF-8 sequences.
Accept 4 byte sequences and convert them into surrogate
pairs instead of expecting 2 separate 3 byte sequences
each encoding one half of a surrogate pair.
Note that in addition to supporting 4 byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utf_test.cc')
-rw-r--r-- | runtime/utf_test.cc | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc new file mode 100644 index 0000000..8048bbd --- /dev/null +++ b/runtime/utf_test.cc @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utf.h" + +#include "common_runtime_test.h" +#include "utf-inl.h" + +namespace art { + +class UtfTest : public CommonRuntimeTest {}; + +TEST_F(UtfTest, GetLeadingUtf16Char) { + EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff)); +} + +TEST_F(UtfTest, GetTrailingUtf16Char) { + EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee)); + EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa)); +} + +#define EXPECT_ARRAY_POSITION(expected, end, start) \ + EXPECT_EQ(static_cast<uintptr_t>(expected), \ + reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start)); + +// A test string containing one, two, three and four byte UTF-8 sequences. +static const uint8_t kAllSequences[] = { + 0x24, + 0xc2, 0xa2, + 0xe2, 0x82, 0xac, + 0xf0, 0x9f, 0x8f, 0xa0, + 0x00 +}; + +// A test string that contains a UTF-8 encoding of a surrogate pair +// (code point = U+10400) +static const uint8_t kSurrogateEncoding[] = { + 0xed, 0xa0, 0x81, + 0xed, 0xb0, 0x80, + 0x00 +}; + +TEST_F(UtfTest, GetUtf16FromUtf8) { + const char* const start = reinterpret_cast<const char*>(kAllSequences); + const char* ptr = start; + uint32_t pair = 0; + + // Single byte sequence. + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0x24, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(1, ptr, start); + + // Two byte sequence + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(3, ptr, start); + + // Three byte sequence + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(6, ptr, start); + + // Four byte sequence + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(10, ptr, start); + + // Null terminator + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(11, ptr, start); +} + +TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) { + const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding); + const char* ptr = start; + uint32_t pair = 0; + + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(3, ptr, start); + + pair = GetUtf16FromUtf8(&ptr); + EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair)); + EXPECT_EQ(0, GetTrailingUtf16Char(pair)); + EXPECT_ARRAY_POSITION(6, ptr, start); +} + +TEST_F(UtfTest, CountModifiedUtf8Chars) { + EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences))); + EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding))); +} + +} // namespace art |