summary refs log tree commit diff stats
path: root/runtime/utf_test.cc
diff options
context:
space:
mode:
author Narayan Kamath <narayan@google.com> 2015-01-29 20:06:46 +0000
committer Narayan Kamath <narayan@google.com> 2015-02-12 11:54:37 +0000
commit a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf_test.cc
parent 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)
downloadart-a5afcfc73141e5e378d79a326d02c5c2039fb025.zip
art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.gz
art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.bz2
Be more lenient with 4 byte UTF-8 sequences.
Accept 4 byte sequences and convert them into surrogate pairs instead of expecting 2 separate 3 byte sequences each encoding one half of a surrogate pair.

Note that in addition to supporting 4 byte sequences in strings from JNI, we also tolerate them in dex files. This is mainly for consistency, and there's no need to claim any sort of official support.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341

Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utf_test.cc')
-rw-r--r--runtime/utf_test.cc113
1 files changed, 113 insertions, 0 deletions
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
new file mode 100644
index 0000000..8048bbd
--- /dev/null
+++ b/runtime/utf_test.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utf.h"
+
+#include "common_runtime_test.h"
+#include "utf-inl.h"
+
+namespace art {
+
+// Fixture used by every TEST_F in this file. It adds no members of its own on
+// top of CommonRuntimeTest.
+class UtfTest : public CommonRuntimeTest {
+};
+
+// The leading UTF-16 unit of the packed value 0xeeeeffff is expected to be its
+// low half-word, 0xffff.
+TEST_F(UtfTest, GetLeadingUtf16Char) {
+  const uint32_t packed = 0xeeeeffff;
+  EXPECT_EQ(0xffff, GetLeadingUtf16Char(packed));
+}
+
+// The trailing UTF-16 unit is expected to be the high half-word of the packed
+// value: 0xffff for 0xffffeeee, and 0 when the high half-word is clear.
+TEST_F(UtfTest, GetTrailingUtf16Char) {
+  const uint32_t with_trailing = 0xffffeeee;
+  const uint32_t without_trailing = 0x0000aaaa;
+
+  EXPECT_EQ(0xffff, GetTrailingUtf16Char(with_trailing));
+  EXPECT_EQ(0, GetTrailingUtf16Char(without_trailing));
+}
+
+// Expects that |end| lies exactly |expected| bytes past |start|. The distance
+// is computed on the two addresses after converting them to uintptr_t.
+//
+// The expansion intentionally carries no trailing semicolon; the caller
+// supplies it. Each use is therefore a single statement (safe inside an
+// unbraced if/else) and a failure message can still be streamed onto it.
+#define EXPECT_ARRAY_POSITION(expected, end, start) \
+  EXPECT_EQ(static_cast<uintptr_t>(expected), \
+            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
+
+// UTF-8 test input holding one sequence of each length from one to four bytes,
+// followed by a terminating zero byte.
+static const uint8_t kAllSequences[] = {
+  0x24,                    // 1 byte:  U+0024
+  0xc2, 0xa2,              // 2 bytes: U+00A2
+  0xe2, 0x82, 0xac,        // 3 bytes: U+20AC
+  0xf0, 0x9f, 0x8f, 0xa0,  // 4 bytes: U+1F3E0
+  0x00                     // terminator
+};
+
+// U+10400 written as two three-byte sequences, one per UTF-16 surrogate,
+// rather than as a single four-byte sequence. Ends with a zero byte.
+static const uint8_t kSurrogateEncoding[] = {
+  0xed, 0xa0, 0x81,  // high surrogate 0xd801
+  0xed, 0xb0, 0x80,  // low surrogate 0xdc00
+  0x00               // terminator
+};
+
+// Decodes kAllSequences one sequence at a time. After every call the test
+// checks the returned leading/trailing UTF-16 units and how many input bytes
+// the cursor has advanced past in total.
+TEST_F(UtfTest, GetUtf16FromUtf8) {
+  const char* const begin = reinterpret_cast<const char*>(kAllSequences);
+  const char* cursor = begin;
+
+  // One byte: a single unit, no trailing unit.
+  const uint32_t one_byte = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0x24, GetLeadingUtf16Char(one_byte));
+  EXPECT_EQ(0, GetTrailingUtf16Char(one_byte));
+  EXPECT_ARRAY_POSITION(1, cursor, begin);
+
+  // Two bytes: still a single unit.
+  const uint32_t two_bytes = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0xa2, GetLeadingUtf16Char(two_bytes));
+  EXPECT_EQ(0, GetTrailingUtf16Char(two_bytes));
+  EXPECT_ARRAY_POSITION(3, cursor, begin);
+
+  // Three bytes: still a single unit.
+  const uint32_t three_bytes = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0x20ac, GetLeadingUtf16Char(three_bytes));
+  EXPECT_EQ(0, GetTrailingUtf16Char(three_bytes));
+  EXPECT_ARRAY_POSITION(6, cursor, begin);
+
+  // Four bytes: the only case that yields both halves of a surrogate pair.
+  const uint32_t four_bytes = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0xd83c, GetLeadingUtf16Char(four_bytes));
+  EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(four_bytes));
+  EXPECT_ARRAY_POSITION(10, cursor, begin);
+
+  // The terminating zero byte decodes to 0 and is consumed like any other byte.
+  const uint32_t terminator = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0, GetLeadingUtf16Char(terminator));
+  EXPECT_EQ(0, GetTrailingUtf16Char(terminator));
+  EXPECT_ARRAY_POSITION(11, cursor, begin);
+}
+
+// Each three-byte encoding of a surrogate half must come back unchanged as a
+// lone leading unit, with no trailing unit, consuming three bytes per call.
+TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
+  const char* const begin = reinterpret_cast<const char*>(kSurrogateEncoding);
+  const char* cursor = begin;
+
+  // First sequence: the high surrogate.
+  const uint32_t high = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0xd801, GetLeadingUtf16Char(high));
+  EXPECT_EQ(0, GetTrailingUtf16Char(high));
+  EXPECT_ARRAY_POSITION(3, cursor, begin);
+
+  // Second sequence: the low surrogate.
+  const uint32_t low = GetUtf16FromUtf8(&cursor);
+  EXPECT_EQ(0xdc00, GetLeadingUtf16Char(low));
+  EXPECT_EQ(0, GetTrailingUtf16Char(low));
+  EXPECT_ARRAY_POSITION(6, cursor, begin);
+}
+
+// Expected counts for the two fixtures: 5 for kAllSequences (its four-byte
+// sequence is counted as two) and 2 for kSurrogateEncoding (one per
+// three-byte sequence).
+TEST_F(UtfTest, CountModifiedUtf8Chars) {
+  const char* const all_sequences = reinterpret_cast<const char*>(kAllSequences);
+  const char* const surrogates = reinterpret_cast<const char*>(kSurrogateEncoding);
+
+  EXPECT_EQ(5u, CountModifiedUtf8Chars(all_sequences));
+  EXPECT_EQ(2u, CountModifiedUtf8Chars(surrogates));
+}
+
+} // namespace art