diff options
-rw-r--r-- | core/java/com/android/internal/util/HanziToPinyin.java | 114 | ||||
-rw-r--r-- | tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java | 70 |
2 files changed, 147 insertions, 37 deletions
diff --git a/core/java/com/android/internal/util/HanziToPinyin.java b/core/java/com/android/internal/util/HanziToPinyin.java index 4368e98..6a4adaa 100644 --- a/core/java/com/android/internal/util/HanziToPinyin.java +++ b/core/java/com/android/internal/util/HanziToPinyin.java @@ -16,8 +16,6 @@ package com.android.internal.util; -import com.google.android.util.AbstractMessageParser.Token; - import android.text.TextUtils; import android.util.Log; @@ -298,8 +296,10 @@ public class HanziToPinyin { }; /** First and last Chinese character with known Pinyin according to zh collation */ - private static final String FIRST_UNIHAN = "\u5416"; - private static final String LAST_UNIHAN = "\u5497"; + private static final String FIRST_PINYIN_UNIHAN = "\u5416"; + private static final String LAST_PINYIN_UNIHAN = "\u5497"; + /** The first Chinese character in Unicode block */ + private static final char FIRST_UNIHAN = '\u3400'; private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA); private static HanziToPinyin sInstance; @@ -311,10 +311,18 @@ public class HanziToPinyin { */ public static final String SEPARATOR = " "; - public static final int ASCII = 1; + public static final int LATIN = 1; public static final int PINYIN = 2; public static final int UNKNOWN = 3; + public Token() { + } + + public Token(int type, String source, String target) { + this.type = type; + this.source = source; + this.target = target; + } /** * Type of this token, ASCII, PINYIN or UNKNOWN. 
*/ @@ -347,6 +355,7 @@ public class HanziToPinyin { return sInstance; } } + Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled"); sInstance = new HanziToPinyin(false); return sInstance; } @@ -359,11 +368,15 @@ public class HanziToPinyin { int offset = -1; int cmp; if (character < 256) { - token.type = Token.ASCII; + token.type = Token.LATIN; + token.target = letter; + return token; + } else if (character < FIRST_UNIHAN) { + token.type = Token.UNKNOWN; token.target = letter; return token; } else { - cmp = COLLATOR.compare(letter, FIRST_UNIHAN); + cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN); if (cmp < 0) { token.type = Token.UNKNOWN; token.target = letter; @@ -372,7 +385,7 @@ public class HanziToPinyin { token.type = Token.PINYIN; offset = 0; } else { - cmp = COLLATOR.compare(letter, LAST_UNIHAN); + cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN); if (cmp > 0) { token.type = Token.UNKNOWN; token.target = letter; @@ -412,44 +425,71 @@ public class HanziToPinyin { return token; } + /** + * Convert the input to an array of tokens. The sequence of Latin or unknown + * characters without space will be put into a Token; one Hanzi character + * which has pinyin will be treated as a Token. + * If there is no China collator, the empty token array is returned. + */ public ArrayList<Token> get(final String input) { + ArrayList<Token> tokens = new ArrayList<Token>(); if (!mHasChinaCollator || TextUtils.isEmpty(input)) { - return null; + // return empty tokens. + return tokens; } - - ArrayList<Token> tokens = new ArrayList<Token>(); - Token currentToken; - final int inputLength = input.length(); - - currentToken = getToken(input.charAt(0)); - - for (int i = 1; i < inputLength; i++) { + final StringBuilder sb = new StringBuilder(); + int tokenType = Token.LATIN; + // Go through the input, create a new token when + // a. Token type changed + // b. Get the Pinyin of current character. + // c. current character is space. 
+ for (int i = 0; i < inputLength; i++) { final char character = input.charAt(i); - Token token = getToken(character); - - if (token.type != currentToken.type) { - currentToken.target = currentToken.target.trim(); - tokens.add(currentToken); - currentToken = token; + if (character == ' ') { + if (sb.length() > 0) { + addToken(sb, tokens, tokenType); + } + } else if (character < 256) { + if (tokenType != Token.LATIN && sb.length() > 0) { + addToken(sb, tokens, tokenType); + } + tokenType = Token.LATIN; + sb.append(character); + } else if (character < FIRST_UNIHAN) { + if (tokenType != Token.UNKNOWN && sb.length() > 0) { + addToken(sb, tokens, tokenType); + } + tokenType = Token.UNKNOWN; + sb.append(character); } else { - switch (token.type) { - case Token.ASCII: - case Token.UNKNOWN: - currentToken.source += token.source; - currentToken.target += token.target; - break; - case Token.PINYIN: - currentToken.source += token.source; - currentToken.target += " " + token.target; - break; + Token t = getToken(character); + if (t.type == Token.PINYIN) { + if (sb.length() > 0) { + addToken(sb, tokens, tokenType); + } + tokens.add(t); + tokenType = Token.PINYIN; + } else { + if (tokenType != t.type && sb.length() > 0) { + addToken(sb, tokens, tokenType); + } + tokenType = t.type; + sb.append(character); } } } - - currentToken.target = currentToken.target.trim(); - tokens.add(currentToken); - + if (sb.length() > 0) { + addToken(sb, tokens, tokenType); + } return tokens; } + + private void addToken(final StringBuilder sb, final ArrayList<Token> tokens, + final int tokenType) { + String str = sb.toString(); + tokens.add(new Token(tokenType, str, str)); + sb.setLength(0); + } + } diff --git a/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java b/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java new file mode 100644 index 0000000..8e1ff0b --- /dev/null +++ 
b/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.unit_tests.internal.util; + +import java.text.Collator; +import java.util.ArrayList; +import java.util.Locale; + +import android.test.suitebuilder.annotation.SmallTest; +import android.util.Log; + +import com.android.internal.util.HanziToPinyin; +import com.android.internal.util.HanziToPinyin.Token; + +import junit.framework.TestCase; + +public class HanziToPinyinTest extends TestCase { + private final static String ONE_HANZI = "\u675C"; + private final static String TWO_HANZI = "\u675C\u9D51"; + private final static String ASSIC = "test"; + private final static String ONE_UNKNOWN = "\uFF71"; + private final static String MISC = "test\u675C Test with space\uFF71\uFF71\u675C"; + + @SmallTest + public void testGetToken() throws Exception { + ArrayList<Token> tokens = HanziToPinyin.getInstance().get(ONE_HANZI); + assertEquals(tokens.size(), 1); + assertEquals(tokens.get(0).type, Token.PINYIN); + assertTrue(tokens.get(0).target.equalsIgnoreCase("DU")); + + tokens = HanziToPinyin.getInstance().get(TWO_HANZI); + assertEquals(tokens.size(), 2); + assertEquals(tokens.get(0).type, Token.PINYIN); + assertEquals(tokens.get(1).type, Token.PINYIN); + assertTrue(tokens.get(0).target.equalsIgnoreCase("DU")); + 
assertTrue(tokens.get(1).target.equalsIgnoreCase("JUAN")); + + tokens = HanziToPinyin.getInstance().get(ASSIC); + assertEquals(tokens.size(), 1); + assertEquals(tokens.get(0).type, Token.LATIN); + + tokens = HanziToPinyin.getInstance().get(ONE_UNKNOWN); + assertEquals(tokens.size(), 1); + assertEquals(tokens.get(0).type, Token.UNKNOWN); + + tokens = HanziToPinyin.getInstance().get(MISC); + assertEquals(tokens.size(), 7); + assertEquals(tokens.get(0).type, Token.LATIN); + assertEquals(tokens.get(1).type, Token.PINYIN); + assertEquals(tokens.get(2).type, Token.LATIN); + assertEquals(tokens.get(3).type, Token.LATIN); + assertEquals(tokens.get(4).type, Token.LATIN); + assertEquals(tokens.get(5).type, Token.UNKNOWN); + assertEquals(tokens.get(6).type, Token.PINYIN); + } +} |