summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- core/java/com/android/internal/util/HanziToPinyin.java 114
-rw-r--r-- tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java 70
2 files changed, 147 insertions, 37 deletions
diff --git a/core/java/com/android/internal/util/HanziToPinyin.java b/core/java/com/android/internal/util/HanziToPinyin.java
index 4368e98..6a4adaa 100644
--- a/core/java/com/android/internal/util/HanziToPinyin.java
+++ b/core/java/com/android/internal/util/HanziToPinyin.java
@@ -16,8 +16,6 @@
package com.android.internal.util;
-import com.google.android.util.AbstractMessageParser.Token;
-
import android.text.TextUtils;
import android.util.Log;
@@ -298,8 +296,10 @@ public class HanziToPinyin {
};
/** First and last Chinese character with known Pinyin according to zh collation */
- private static final String FIRST_UNIHAN = "\u5416";
- private static final String LAST_UNIHAN = "\u5497";
+ private static final String FIRST_PINYIN_UNIHAN = "\u5416";
+ private static final String LAST_PINYIN_UNIHAN = "\u5497";
+ /** First code point of the Unicode CJK Unified Ideographs Extension A block; anything below it is not treated as Hanzi */
+ private static final char FIRST_UNIHAN = '\u3400';
private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA);
private static HanziToPinyin sInstance;
@@ -311,10 +311,18 @@ public class HanziToPinyin {
*/
public static final String SEPARATOR = " ";
- public static final int ASCII = 1;
+ public static final int LATIN = 1;
public static final int PINYIN = 2;
public static final int UNKNOWN = 3;
+ public Token() {
+ }
+
+ public Token(int type, String source, String target) {
+ this.type = type;
+ this.source = source;
+ this.target = target;
+ }
/**
* Type of this token: LATIN, PINYIN or UNKNOWN.
*/
@@ -347,6 +355,7 @@ public class HanziToPinyin {
return sInstance;
}
}
+ Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled");
sInstance = new HanziToPinyin(false);
return sInstance;
}
@@ -359,11 +368,15 @@ public class HanziToPinyin {
int offset = -1;
int cmp;
if (character < 256) {
- token.type = Token.ASCII;
+ token.type = Token.LATIN;
+ token.target = letter;
+ return token;
+ } else if (character < FIRST_UNIHAN) {
+ token.type = Token.UNKNOWN;
token.target = letter;
return token;
} else {
- cmp = COLLATOR.compare(letter, FIRST_UNIHAN);
+ cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN);
if (cmp < 0) {
token.type = Token.UNKNOWN;
token.target = letter;
@@ -372,7 +385,7 @@ public class HanziToPinyin {
token.type = Token.PINYIN;
offset = 0;
} else {
- cmp = COLLATOR.compare(letter, LAST_UNIHAN);
+ cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN);
if (cmp > 0) {
token.type = Token.UNKNOWN;
token.target = letter;
@@ -412,44 +425,71 @@ public class HanziToPinyin {
return token;
}
+ /**
+ * Convert the input to an array of tokens. Each run of consecutive Latin
+ * characters, or of consecutive unknown characters, that contains no space is
+ * put into one Token; each Hanzi character that has Pinyin becomes its own Token.
+ * If there is no Chinese collator, or the input is empty, an empty token array is returned.
+ */
public ArrayList<Token> get(final String input) {
+ ArrayList<Token> tokens = new ArrayList<Token>();
if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
- return null;
+ // return empty tokens.
+ return tokens;
}
-
- ArrayList<Token> tokens = new ArrayList<Token>();
- Token currentToken;
-
final int inputLength = input.length();
-
- currentToken = getToken(input.charAt(0));
-
- for (int i = 1; i < inputLength; i++) {
+ final StringBuilder sb = new StringBuilder();
+ int tokenType = Token.LATIN;
+ // Go through the input, creating a new token when
+ // a. the token type changes,
+ // b. the current character has a Pinyin, or
+ // c. the current character is a space.
+ for (int i = 0; i < inputLength; i++) {
final char character = input.charAt(i);
- Token token = getToken(character);
-
- if (token.type != currentToken.type) {
- currentToken.target = currentToken.target.trim();
- tokens.add(currentToken);
- currentToken = token;
+ if (character == ' ') {
+ if (sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ } else if (character < 256) {
+ if (tokenType != Token.LATIN && sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokenType = Token.LATIN;
+ sb.append(character);
+ } else if (character < FIRST_UNIHAN) {
+ if (tokenType != Token.UNKNOWN && sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokenType = Token.UNKNOWN;
+ sb.append(character);
} else {
- switch (token.type) {
- case Token.ASCII:
- case Token.UNKNOWN:
- currentToken.source += token.source;
- currentToken.target += token.target;
- break;
- case Token.PINYIN:
- currentToken.source += token.source;
- currentToken.target += " " + token.target;
- break;
+ Token t = getToken(character);
+ if (t.type == Token.PINYIN) {
+ if (sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokens.add(t);
+ tokenType = Token.PINYIN;
+ } else {
+ if (tokenType != t.type && sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
+ tokenType = t.type;
+ sb.append(character);
}
}
}
-
- currentToken.target = currentToken.target.trim();
- tokens.add(currentToken);
-
+ if (sb.length() > 0) {
+ addToken(sb, tokens, tokenType);
+ }
return tokens;
}
+
+ private void addToken(final StringBuilder sb, final ArrayList<Token> tokens,
+ final int tokenType) {
+ String str = sb.toString();
+ tokens.add(new Token(tokenType, str, str));
+ sb.setLength(0);
+ }
+
}
diff --git a/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java b/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java
new file mode 100644
index 0000000..8e1ff0b
--- /dev/null
+++ b/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.unit_tests.internal.util;
+
+import java.text.Collator;
+import java.util.ArrayList;
+import java.util.Locale;
+
+import android.test.suitebuilder.annotation.SmallTest;
+import android.util.Log;
+
+import com.android.internal.util.HanziToPinyin;
+import com.android.internal.util.HanziToPinyin.Token;
+
+import junit.framework.TestCase;
+
+public class HanziToPinyinTest extends TestCase {
+ private final static String ONE_HANZI = "\u675C";
+ private final static String TWO_HANZI = "\u675C\u9D51";
+ private final static String ASSIC = "test";
+ private final static String ONE_UNKNOWN = "\uFF71";
+ private final static String MISC = "test\u675C Test with space\uFF71\uFF71\u675C";
+
+ @SmallTest
+ public void testGetToken() throws Exception {
+ ArrayList<Token> tokens = HanziToPinyin.getInstance().get(ONE_HANZI);
+ assertEquals(tokens.size(), 1);
+ assertEquals(tokens.get(0).type, Token.PINYIN);
+ assertTrue(tokens.get(0).target.equalsIgnoreCase("DU"));
+
+ tokens = HanziToPinyin.getInstance().get(TWO_HANZI);
+ assertEquals(tokens.size(), 2);
+ assertEquals(tokens.get(0).type, Token.PINYIN);
+ assertEquals(tokens.get(1).type, Token.PINYIN);
+ assertTrue(tokens.get(0).target.equalsIgnoreCase("DU"));
+ assertTrue(tokens.get(1).target.equalsIgnoreCase("JUAN"));
+
+ tokens = HanziToPinyin.getInstance().get(ASSIC);
+ assertEquals(tokens.size(), 1);
+ assertEquals(tokens.get(0).type, Token.LATIN);
+
+ tokens = HanziToPinyin.getInstance().get(ONE_UNKNOWN);
+ assertEquals(tokens.size(), 1);
+ assertEquals(tokens.get(0).type, Token.UNKNOWN);
+
+ tokens = HanziToPinyin.getInstance().get(MISC);
+ assertEquals(tokens.size(), 7);
+ assertEquals(tokens.get(0).type, Token.LATIN);
+ assertEquals(tokens.get(1).type, Token.PINYIN);
+ assertEquals(tokens.get(2).type, Token.LATIN);
+ assertEquals(tokens.get(3).type, Token.LATIN);
+ assertEquals(tokens.get(4).type, Token.LATIN);
+ assertEquals(tokens.get(5).type, Token.UNKNOWN);
+ assertEquals(tokens.get(6).type, Token.PINYIN);
+ }
+}