summaryrefslogtreecommitdiffstats
path: root/content/common/android
diff options
context:
space:
mode:
authorleandrogracia@chromium.org <leandrogracia@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-05-30 19:04:03 +0000
committerleandrogracia@chromium.org <leandrogracia@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-05-30 19:04:03 +0000
commitbe77471b2116f55aa30e457377ed3accdd51c348 (patch)
tree2784af86b3e8413fec84cb8181777118ef4680d9 /content/common/android
parentd32591c8f1654c0a7f39809cb6a969efb33ebf43 (diff)
downloadchromium_src-be77471b2116f55aa30e457377ed3accdd51c348.zip
chromium_src-be77471b2116f55aa30e457377ed3accdd51c348.tar.gz
chromium_src-be77471b2116f55aa30e457377ed3accdd51c348.tar.bz2
[Android] Split the address parser from AddressDetector for WebView use.
Split the address parser inside AddressDetector into a separate file outside the renderer folder. This is done in order to provide support for Java's WebView.findAddress using the new address parser while preventing layering violations in Chromium (WebView should not access content/renderer). BUG=125390 TEST=address_parser_unittest.cc Review URL: https://chromiumcodereview.appspot.com/10456007 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@139597 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/common/android')
-rw-r--r--content/common/android/address_parser.cc219
-rw-r--r--content/common/android/address_parser.h35
-rw-r--r--content/common/android/address_parser_internal.cc628
-rw-r--r--content/common/android/address_parser_internal.h83
-rw-r--r--content/common/android/address_parser_unittest.cc594
5 files changed, 1559 insertions, 0 deletions
diff --git a/content/common/android/address_parser.cc b/content/common/android/address_parser.cc
new file mode 100644
index 0000000..48c9400
--- /dev/null
+++ b/content/common/android/address_parser.cc
@@ -0,0 +1,219 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/common/android/address_parser.h"
+
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "content/common/android/address_parser_internal.h"
+
+namespace {
+
+// Minimum number of words in an address after the house number
+// before a state is expected to be found: FindAddress only looks for a state
+// in words past this index. A value too high can miss short addresses.
+const size_t kMinAddressWords = 3;
+
+// Maximum number of words allowed in an address between the house number
+// and the state, both not included.
+const size_t kMaxAddressWords = 12;
+
+// Maximum number of lines allowed in an address between the house number
+// and the state, both not included.
+const size_t kMaxAddressLines = 5;
+
+// Maximum length allowed for any address word between the house number
+// and the state, both not included.
+const size_t kMaxAddressNameWordLength = 25;
+
+// Maximum number of words after the house number in which the location name
+// should be found.
+const size_t kMaxLocationNameDistance = 4;
+
+// Characters counted as line breaks; also token delimiters, with whitespace.
+const char16 kNewlineDelimiters[] = {
+ '\n',
+ ',',
+ '*',
+ 0x2022, // Unicode bullet
+ 0,  // Terminator: the array is converted to a string16 in FindAddress.
+};
+
+} // anonymous namespace
+
+namespace content {
+
+namespace address_parser {
+
+using namespace internal;
+
+// Finds the first address in |text|. Returns true and copies the address
+// into |address| when one is found; returns false and leaves |address|
+// untouched otherwise.
+bool FindAddress(const string16& text, string16* address) {
+  size_t start, end;
+  if (!FindAddress(text.begin(), text.end(), &start, &end))
+    return false;
+
+  // |start| and |end| are both offsets from the beginning of |text|, whereas
+  // substr() takes a position and a length. Passing |end| as the length would
+  // return text past the address whenever it does not start at offset 0.
+  const size_t length = end >= start ? end - start : 0;
+  address->assign(text.substr(start, length));
+  return true;
+}
+
+// Scans [begin, end) for the first address. On success returns true, storing
+// in |start_pos| the offset from |begin| of the first house number character
+// and in |end_pos| the offset right after the last zip code character.
+// Returns false, without writing to the outputs, when no address is found.
+bool FindAddress(const string16::const_iterator& begin,
+                 const string16::const_iterator& end,
+                 size_t* start_pos,
+                 size_t* end_pos) {
+  HouseNumberParser house_number_parser;
+
+  // Keep going through the input string until a potential house number is
+  // detected. Start tokenizing the following words to find a valid
+  // street name within a word range. Then, find a state name followed
+  // by a valid zip code for that state. Also keep a look for any other
+  // possible house numbers to continue from in case of no match and for
+  // state names not followed by a zip code (e.g. New York, NY 10000).
+  const string16 newline_delimiters = kNewlineDelimiters;
+  const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
+  for (string16::const_iterator it = begin; it != end; ) {
+    Word house_number;
+    if (!house_number_parser.Parse(it, end, &house_number))
+      return false;
+
+    // Delimiters are returned as tokens so that line breaks can be counted.
+    String16Tokenizer tokenizer(house_number.end, end, delimiters);
+    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
+
+    WordList words;
+    words.push_back(house_number);
+
+    bool found_location_name = false;
+    bool continue_on_house_number = true;
+    bool consecutive_house_numbers = true;
+    size_t next_house_number_word = 0;
+    size_t num_lines = 1;
+
+    // Don't include the house number in the word count.
+    size_t next_word = 1;
+    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
+
+      // Extract a new word from the tokenizer.
+      if (next_word == words.size()) {
+        do {
+          // No more input: nothing after this point can complete an address.
+          if (!tokenizer.GetNext())
+            return false;
+
+          // Check the number of address lines.
+          if (tokenizer.token_is_delim() && newline_delimiters.find(
+              *tokenizer.token_begin()) != string16::npos) {
+            ++num_lines;
+          }
+        } while (tokenizer.token_is_delim());
+
+        if (num_lines > kMaxAddressLines)
+          break;
+
+        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
+      }
+
+      // Check the word length. If too long, don't try to continue from
+      // the next house number as no address can hold this word.
+      // The word is copied rather than referenced: |words| can grow (and
+      // reallocate) further down while looking for multi-word state names,
+      // which would leave a reference into the vector dangling.
+      const Word current_word = words[next_word];
+      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
+      size_t current_word_length = std::distance(
+          current_word.begin, current_word.end);
+      if (current_word_length > kMaxAddressNameWordLength) {
+        continue_on_house_number = false;
+        break;
+      }
+
+      // Check if the new word is a valid house number.
+      if (house_number_parser.Parse(current_word.begin, current_word.end,
+          NULL)) {
+        // Increase the number of consecutive house numbers since the beginning.
+        if (consecutive_house_numbers) {
+          // Check if there is a new line between consecutive house numbers.
+          // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
+          if (num_lines > 1) {
+            next_house_number_word = next_word;
+            break;
+          }
+        }
+
+        // Keep the next candidate to resume parsing from in case of failure.
+        if (next_house_number_word == 0) {
+          next_house_number_word = next_word;
+          continue;
+        }
+      } else {
+        consecutive_house_numbers = false;
+      }
+
+      // Look for location names in the words after the house number.
+      // A range limitation is introduced to avoid matching
+      // anything that starts with a number before a legitimate address.
+      if (next_word <= kMaxLocationNameDistance &&
+          IsValidLocationName(current_word)) {
+        found_location_name = true;
+        continue;
+      }
+
+      // Don't count the house number.
+      if (next_word > kMinAddressWords) {
+        // Looking for the state is likely to add new words to the list while
+        // checking for multi-word state names.
+        size_t state_first_word = next_word;
+        size_t state_last_word, state_index;
+        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
+                                    &tokenizer, &state_index)) {
+
+          // A location name should have been found at this point.
+          if (!found_location_name)
+            break;
+
+          // Explicitly exclude "et al", as "al" is a valid state code.
+          if (current_word_length == 2 && words.size() > 2) {
+            const Word& previous_word = words[state_first_word - 1];
+            if (previous_word.end - previous_word.begin == 2 &&
+                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
+                                     "et") &&
+                LowerCaseEqualsASCII(current_word.begin, current_word.end,
+                                     "al"))
+              break;
+          }
+
+          // Extract one more word from the tokenizer if not already available.
+          size_t zip_word = state_last_word + 1;
+          if (zip_word == words.size()) {
+            do {
+              if (!tokenizer.GetNext())
+                return false;
+            } while (tokenizer.token_is_delim());
+            words.push_back(Word(tokenizer.token_begin(),
+                                 tokenizer.token_end()));
+          }
+
+          // Check the parsing validity and state range of the zip code.
+          // On failure the scan resumes right after the state name.
+          next_word = state_last_word;
+          if (!IsZipValid(words[zip_word], state_index))
+            continue;
+
+          *start_pos = words[0].begin - begin;
+          *end_pos = words[zip_word].end - begin;
+          return true;
+        }
+      }
+    }
+
+    // Avoid skipping too many words because of a non-address number
+    // at the beginning of the contents to parse.
+    if (continue_on_house_number && next_house_number_word > 0) {
+      it = words[next_house_number_word].begin;
+    } else {
+      // |next_word| may be one past the last extracted word when the inner
+      // loop stopped before pushing it, hence the clamp.
+      DCHECK(!words.empty());
+      next_word = std::min(next_word, words.size() - 1);
+      it = words[next_word].end;
+    }
+  }
+
+  return false;
+}
+
+} // namespace address_parser
+
+} // namespace content
diff --git a/content/common/android/address_parser.h b/content/common/android/address_parser.h
new file mode 100644
index 0000000..aee9df7
--- /dev/null
+++ b/content/common/android/address_parser.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CONTENT_COMMON_ADDRESS_PARSER_H_
+#define CONTENT_COMMON_ADDRESS_PARSER_H_
+#pragma once
+
+#include "base/string16.h"
+
+namespace content {
+
+// Provides methods to find a geographical address (currently US only)
+// in a given text string.
+namespace address_parser {
+
+// Find the first address in some chunk of text. If an address is found in
+// |text| true is returned and the address is copied into |address|.
+// Otherwise, false is returned.
+bool FindAddress(const string16& text, string16* address);
+
+// Find the first address in some chunk of text. |begin| is the starting
+// position to search from, |end| is the position to search to. If found,
+// |start_pos| and |end_pos| are set to the offsets from |begin| at which the
+// address starts and ends (the end offset is one past its last character).
+bool FindAddress(const string16::const_iterator& begin,
+ const string16::const_iterator& end,
+ size_t* start_pos,
+ size_t* end_pos);
+
+} // namespace address_parser
+
+} // namespace content
+
+#endif // CONTENT_COMMON_ADDRESS_PARSER_H_
diff --git a/content/common/android/address_parser_internal.cc b/content/common/android/address_parser_internal.cc
new file mode 100644
index 0000000..46f9a0d
--- /dev/null
+++ b/content/common/android/address_parser_internal.cc
@@ -0,0 +1,628 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/common/android/address_parser_internal.h"
+
+#include <bitset>
+
+#include "base/logging.h"
+#include "base/string_util.h"
+
+namespace {
+
+// Number of digits for a valid zip code.
+const size_t kZipDigits = 5;
+
+// Number of digits for a valid zip code in the Zip Plus 4 format.
+const size_t kZipPlus4Digits = 9;  // The separating hyphen is not counted.
+
+// Maximum number of digits of a house number, including possible hyphens.
+const size_t kMaxHouseDigits = 5;
+
+// Returns the character located right before |it|, or a space when |it| is
+// already at |begin| and there is nothing to look back at.
+char16 SafePreviousChar(const string16::const_iterator& it,
+                        const string16::const_iterator& begin) {
+  const bool at_start = (it == begin);
+  if (!at_start)
+    return *(it - 1);
+  return ' ';
+}
+
+// Returns the character located right after |it|, or a space when there is
+// no such character, i.e. when |it| is at |end| or at the last character.
+char16 SafeNextChar(const string16::const_iterator& it,
+                    const string16::const_iterator& end) {
+  // Checking only |it == end| is not enough: for the last character of the
+  // range |it + 1| is |end| itself, which must not be dereferenced.
+  if (it == end || it + 1 == end)
+    return ' ';
+  return *(it + 1);
+}
+
+// Compares the word [word_begin, word_end), lowercased, against the start of
+// |ascii_to_match|. Returns true when every character of the word matches and
+// the pattern then either ends or continues with a space (i.e. the word is a
+// whole word of a possibly multi-word pattern).
+bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
+    string16::const_iterator word_end, const char* ascii_to_match) {
+  const char* expected = ascii_to_match;
+  string16::const_iterator cursor = word_begin;
+  while (cursor != word_end) {
+    // The pattern ran out before the word did.
+    if (*expected == 0)
+      return false;
+    if (base::ToLowerASCII(*cursor) != *expected)
+      return false;
+    ++cursor;
+    ++expected;
+  }
+  if (*expected == 0)
+    return true;
+  return *expected == ' ';
+}
+
+// Compares the word [word_begin, word_end), lowercased, against the whole of
+// |ascii_to_match|. When |allow_plural| is set, a word made of the pattern
+// followed by a single trailing 's' is accepted as well.
+bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
+    string16::const_iterator word_end, const char* ascii_to_match,
+    bool allow_plural) {
+  const char* expected = ascii_to_match;
+  string16::const_iterator cursor = word_begin;
+  while (cursor != word_end) {
+    if (*expected == 0) {
+      // Pattern exhausted with word characters left: the only acceptable
+      // leftover is one final 's' when plurals are allowed.
+      return allow_plural && *cursor == 's' && cursor + 1 == word_end;
+    }
+    if (base::ToLowerASCII(*cursor) != *expected)
+      return false;
+    ++cursor;
+    ++expected;
+  }
+  return *expected == 0;
+}
+
+} // anonymous namespace
+
+namespace content {
+
+namespace address_parser {
+
+namespace internal {
+
+// Builds a word covering [begin, end). |begin| must not be past |end|.
+Word::Word(const string16::const_iterator& begin,
+           const string16::const_iterator& end)
+    : begin(begin),
+      end(end) {
+  DCHECK(!(end < begin));
+}
+
+// Returns true for characters accepted as a delimiter in
+// RestartOnNextDelimiter(): a colon or any post delimiter.
+bool HouseNumberParser::IsPreDelimiter(char16 character) {
+  if (character == ':')
+    return true;
+  return IsPostDelimiter(character);
+}
+
+// Returns true for characters that may follow a house number: whitespace, a
+// comma, a double quote or a single quote.
+bool HouseNumberParser::IsPostDelimiter(char16 character) {
+  // The punctuation is compared explicitly instead of through strchr().
+  // strchr() converts its argument to char, so a 16-bit character was matched
+  // by its low byte only: e.g. U+2C27 was taken for a quote, and any character
+  // whose low byte is zero matched the terminator of the searched string.
+  return IsWhitespace(character) ||
+         character == ',' ||
+         character == '"' ||
+         character == '\'';
+}
+
+// Discards the current candidate and moves the current position forward to
+// the next pre delimiter, or to the end of the input if there is none.
+void HouseNumberParser::RestartOnNextDelimiter() {
+  ResetState();
+  while (it_ != end_ && !IsPreDelimiter(*it_))
+    ++it_;
+}
+
+// Advances the current position by up to |num_chars| characters (never past
+// the end of the input) and adds them to the current result candidate.
+void HouseNumberParser::AcceptChars(size_t num_chars) {
+  const size_t remaining = static_cast<size_t>(std::distance(it_, end_));
+  const size_t accepted = std::min(remaining, num_chars);
+  result_chars_ += accepted;
+  it_ += accepted;
+}
+
+// Advances the current position by up to |num_chars| characters (never past
+// the end of the input) without adding them to the result candidate.
+void HouseNumberParser::SkipChars(size_t num_chars) {
+  const size_t remaining = static_cast<size_t>(std::distance(it_, end_));
+  const size_t skipped = std::min(remaining, num_chars);
+  it_ += skipped;
+}
+
+// Forgets the current result candidate: no digits and no characters accepted.
+void HouseNumberParser::ResetState() {
+  result_chars_ = 0;
+  num_digits_ = 0;
+}
+
+// Returns true when the characters accepted so far form a complete house
+// number. In that case, and if |word| is not NULL, |word| is set to the
+// accepted range, which ends at the current position.
+bool HouseNumberParser::CheckFinished(Word* word) const {
+  // Nothing has been accepted yet.
+  if (result_chars_ == 0)
+    return false;
+
+  // There should always be a number after a hyphen.
+  if (SafePreviousChar(it_, begin_) == '-')
+    return false;
+
+  if (word != NULL) {
+    word->end = it_;
+    word->begin = it_ - result_chars_;
+  }
+  return true;
+}
+
+// Looks for the first house number in [begin, end). Returns true when one is
+// found, in which case |word| (if not NULL) is set to the range it covers.
+// Accepted forms are up to kMaxHouseDigits digits and inner hyphens,
+// optionally followed by a single letter (12A) or an ordinal suffix (1st,
+// 2nd, 3rd, 7th, 11th...), plus the literal word "one".
+bool HouseNumberParser::Parse(
+    const string16::const_iterator& begin,
+    const string16::const_iterator& end, Word* word) {
+  it_ = begin_ = begin;
+  end_ = end;
+  ResetState();
+
+  // Iterations only used as a fail-safe against any buggy infinite loops.
+  size_t iterations = 0;
+  size_t max_iterations = end - begin + 1;
+  for (; it_ != end_ && iterations < max_iterations; ++iterations) {
+
+    // Word finished case.
+    if (IsPostDelimiter(*it_)) {
+      if (CheckFinished(word))
+        return true;
+      else if (result_chars_)
+        ResetState();
+
+      SkipChars(1);
+      continue;
+    }
+
+    // More digits. There should be no more after a letter was found.
+    if (IsAsciiDigit(*it_)) {
+      if (num_digits_ >= kMaxHouseDigits) {
+        RestartOnNextDelimiter();
+      } else {
+        AcceptChars(1);
+        ++num_digits_;
+      }
+      continue;
+    }
+
+    if (IsAsciiAlpha(*it_)) {
+      // Handle special case 'one'.
+      if (result_chars_ == 0) {
+        // The remaining length is measured with std::distance() so that no
+        // iterator beyond |end_| is ever formed when fewer than 3 characters
+        // are left.
+        if (std::distance(it_, end_) >= 3 &&
+            LowerCaseEqualsASCII(it_, it_ + 3, "one"))
+          AcceptChars(3);
+        else
+          RestartOnNextDelimiter();
+        continue;
+      }
+
+      // There should be more than 1 character because of result_chars.
+      DCHECK_GT(result_chars_, 0U);
+      DCHECK(it_ != begin_);
+      char16 previous = SafePreviousChar(it_, begin_);
+      if (IsAsciiDigit(previous)) {
+        // Check cases like '12A'.
+        char16 next = SafeNextChar(it_, end_);
+        if (IsPostDelimiter(next)) {
+          AcceptChars(1);
+          continue;
+        }
+
+        // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
+        if (IsAsciiAlpha(next)) {
+          char16 last_digit = previous;
+          char16 first_letter = base::ToLowerASCII(*it_);
+          char16 second_letter = base::ToLowerASCII(next);
+          // 11, 12 and 13 take 'th' instead of 'st', 'nd' and 'rd'.
+          bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
+              num_digits_ == 2;
+
+          // Note: the 'continue' statements below apply to the outer loop.
+          switch (last_digit - '0') {
+            case 1:
+              if ((first_letter == 's' && second_letter == 't') ||
+                  (first_letter == 't' && second_letter == 'h' && is_teen)) {
+                AcceptChars(2);
+                continue;
+              }
+              break;
+
+            case 2:
+              if ((first_letter == 'n' && second_letter == 'd') ||
+                  (first_letter == 't' && second_letter == 'h' && is_teen)) {
+                AcceptChars(2);
+                continue;
+              }
+              break;
+
+            case 3:
+              if ((first_letter == 'r' && second_letter == 'd') ||
+                  (first_letter == 't' && second_letter == 'h' && is_teen)) {
+                AcceptChars(2);
+                continue;
+              }
+              break;
+
+            case 0:
+              // Explicitly exclude '0th'.
+              if (num_digits_ == 1)
+                break;
+              // Intentional fall through: 10th, 20th... are handled below.
+
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+            case 9:
+              if (first_letter == 't' && second_letter == 'h') {
+                AcceptChars(2);
+                continue;
+              }
+              break;
+
+            default:
+              NOTREACHED();
+          }
+        }
+      }
+
+      RestartOnNextDelimiter();
+      continue;
+    }
+
+    // Hyphens are only accepted after at least one digit and count towards
+    // the maximum number of digits.
+    if (*it_ == '-' && num_digits_ > 0) {
+      AcceptChars(1);
+      ++num_digits_;
+      continue;
+    }
+
+    RestartOnNextDelimiter();
+    SkipChars(1);
+  }
+
+  if (iterations >= max_iterations)
+    return false;
+
+  // The input ended without a post delimiter: accept what was gathered.
+  return CheckFinished(word);
+}
+
+// Checks whether a state name, either a two-letter code or a full (possibly
+// multi-word) name, starts at the word |state_first_word| of |words|.
+// Additional words are pulled from |tokenizer| and appended to |words| when a
+// multi-word name needs them. On success returns true, storing the index of
+// the last word of the name in |state_last_word| and the index of the state
+// (two-letter code alphabetical order) in |state_index|.
+bool FindStateStartingInWord(WordList* words,
+                             size_t state_first_word,
+                             size_t* state_last_word,
+                             String16Tokenizer* tokenizer,
+                             size_t* state_index) {
+
+  // Bitmasks containing the allowed suffixes for 2-letter state codes.
+  static const int state_two_letter_suffix[23] = {
+    0x02060c00,  // A followed by: [KLRSZ].
+    0x00000000,  // B.
+    0x00084001,  // C followed by: [AOT].
+    0x00000014,  // D followed by: [CE].
+    0x00000000,  // E.
+    0x00001800,  // F followed by: [LM].
+    0x00100001,  // G followed by: [AU].
+    0x00000100,  // H followed by: [I].
+    0x00002809,  // I followed by: [ADLN].
+    0x00000000,  // J.
+    0x01040000,  // K followed by: [SY].
+    0x00000001,  // L followed by: [A].
+    0x000ce199,  // M followed by: [ADEHINOPST].
+    0x0120129c,  // N followed by: [CDEHJMVY].
+    0x00020480,  // O followed by: [HKR].
+    0x00420001,  // P followed by: [ARW].
+    0x00000000,  // Q.
+    0x00000100,  // R followed by: [I].
+    0x0000000c,  // S followed by: [CD].
+    0x00802000,  // T followed by: [NX].
+    0x00080000,  // U followed by: [T].
+    0x00080101,  // V followed by: [AIT].
+    0x01200101   // W followed by: [AIVY].
+  };
+
+  // Accumulative number of states for the 2-letter code indexed by the first.
+  static const int state_two_letter_accumulative[24] = {
+     0,  5,  5,  8, 10, 10, 12, 14,
+    15, 19, 19, 21, 22, 32, 40, 43,
+    46, 46, 47, 49, 51, 52, 55, 59
+  };
+
+  // State names grouped by first letter, with their lengths.
+  // |first_word_length| is the length of the first word of the name and
+  // |length| the length of the whole name, inner spaces included.
+  // There can be more than one possible name for a same state if desired.
+  static const struct StateNameInfo {
+    const char* string;
+    char first_word_length;
+    char length;
+    char state_index;  // Relative to two-character code alphabetical order.
+  } state_names[59] = {
+    { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
+    { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
+    { "arkansas", 8, 8, 2 },
+    { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
+    { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
+    { "district of columbia", 8, 20, 8 },
+    { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
+    { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
+    { "hawaii", 6, 6, 14 },
+    { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
+    { "iowa", 4, 4, 15 },
+    { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
+    { "louisiana", 9, 9, 21 },
+    { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
+    { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
+    { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
+    { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
+    { "montana", 7, 7, 31 },
+    { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
+    { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
+    { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
+    { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
+    { "northern mariana islands", 8, 24, 29 },
+    { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
+    { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
+    { "puerto rico", 6, 11, 44 },
+    // The full length is 12: with the length of "rhode" alone the match
+    // stopped after the first word and "island" was then taken for the zip.
+    { "rhode island", 5, 12, 46 },
+    { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
+    { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
+    { "utah", 4, 4, 51 },
+    { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
+    { "virginia", 8, 8, 52 },
+    { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
+    { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
+  };
+
+  // Accumulative number of states for sorted names indexed by the first letter.
+  // Required a different one since there are codes that don't share their
+  // first letter with the name of their state (MP = Northern Mariana Islands).
+  static const int state_names_accumulative[24] = {
+     0,  5,  5,  8, 10, 10, 12, 14,
+    15, 19, 19, 21, 22, 31, 40, 43,
+    46, 46, 47, 49, 51, 52, 55, 59
+  };
+
+  DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
+            static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
+
+  const Word& first_word = words->at(state_first_word);
+  int length = first_word.end - first_word.begin;
+  if (length < 2 || !IsAsciiAlpha(*first_word.begin))
+    return false;
+
+  // No state names start with x, y, z.
+  char16 first_letter = base::ToLowerASCII(*first_word.begin);
+  if (first_letter > 'w')
+    return false;
+
+  DCHECK(first_letter >= 'a');
+  int first_index = first_letter - 'a';
+
+  // Look for two-letter state names.
+  if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
+    char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
+    DCHECK(second_letter >= 'a');
+
+    int second_index = second_letter - 'a';
+    if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
+      return false;
+
+    // The state index is the number of codes starting with a previous letter
+    // plus the number of valid suffixes that sort before the second letter.
+    std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
+        ((1 << second_index) - 1);
+    *state_last_word = state_first_word;
+    *state_index = state_two_letter_accumulative[first_index] +
+        previous_suffixes.count();
+    return true;
+  }
+
+  // Look for full state names by their first letter. Discard by length.
+  // Note that |first_word| must not be used from here on, as appending to
+  // |words| may invalidate it.
+  for (int state = state_names_accumulative[first_index];
+       state < state_names_accumulative[first_index + 1]; ++state) {
+    if (state_names[state].first_word_length != length)
+      continue;
+
+    bool state_match = false;
+    size_t state_word = state_first_word;
+    for (int pos = 0; true; ) {
+      if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
+          words->at(state_word).end, &state_names[state].string[pos]))
+        break;
+
+      // Skip the matched word and the space that follows it in the name.
+      pos += words->at(state_word).end - words->at(state_word).begin + 1;
+      if (pos >= state_names[state].length) {
+        state_match = true;
+        break;
+      }
+
+      // Ran out of words, extract more from the tokenizer.
+      if (++state_word == words->size()) {
+        do {
+          // Out of input: a multi-word name whose first word already matched
+          // cannot be completed, and a stale token must not be appended.
+          if (!tokenizer->GetNext())
+            return false;
+        } while (tokenizer->token_is_delim());
+        words->push_back(
+            Word(tokenizer->token_begin(), tokenizer->token_end()));
+      }
+    }
+
+    if (state_match) {
+      *state_last_word = state_word;
+      *state_index = state_names[state].state_index;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns true if |word| is a well-formed zip code (5 digits, or 5 digits, a
+// hyphen and 4 more digits) that is also valid for the state |state_index|.
+bool IsZipValid(const Word& word, size_t state_index) {
+  const size_t num_chars = word.end - word.begin;
+  const bool is_plain_zip = (num_chars == kZipDigits);
+  const bool is_zip_plus4 = (num_chars == kZipPlus4Digits + 1);
+  if (!is_plain_zip && !is_zip_plus4)
+    return false;
+
+  for (size_t pos = 0; pos < num_chars; ++pos) {
+    const char16 character = *(word.begin + pos);
+    // A hyphen is only allowed right after the first five digits.
+    const bool is_separator = (character == '-' && pos == kZipDigits);
+    if (!IsAsciiDigit(character) && !is_separator)
+      return false;
+  }
+  return IsZipValidForState(word, state_index);
+}
+
+// Returns true if the two leading digits of the zip code in |word| are valid
+// for the state |state_index| (two-letter code alphabetical order). |word|
+// must start with at least two digits, as guaranteed by IsZipValid().
+bool IsZipValidForState(const Word& word, size_t state_index) {
+  // List of valid zip code ranges: the two-digit prefix must be within
+  // [low, high] or equal to one of the exceptions (-1 meaning unused).
+  static const struct {
+    char low;
+    char high;
+    char exception1;
+    char exception2;
+  } zip_range[] = {
+    { 99, 99, -1, -1 },  // AK Alaska.
+    { 35, 36, -1, -1 },  // AL Alabama.
+    { 71, 72, -1, -1 },  // AR Arkansas.
+    { 96, 96, -1, -1 },  // AS American Samoa.
+    { 85, 86, -1, -1 },  // AZ Arizona.
+    { 90, 96, -1, -1 },  // CA California.
+    { 80, 81, -1, -1 },  // CO Colorado.
+    {  6,  6, -1, -1 },  // CT Connecticut.
+    { 20, 20, -1, -1 },  // DC District of Columbia.
+    { 19, 19, -1, -1 },  // DE Delaware.
+    { 32, 34, -1, -1 },  // FL Florida.
+    { 96, 96, -1, -1 },  // FM Federated States of Micronesia.
+    { 30, 31, -1, -1 },  // GA Georgia.
+    { 96, 96, -1, -1 },  // GU Guam.
+    { 96, 96, -1, -1 },  // HI Hawaii.
+    { 50, 52, -1, -1 },  // IA Iowa.
+    { 83, 83, -1, -1 },  // ID Idaho.
+    { 60, 62, -1, -1 },  // IL Illinois.
+    { 46, 47, -1, -1 },  // IN Indiana.
+    { 66, 67, 73, -1 },  // KS Kansas.
+    { 40, 42, -1, -1 },  // KY Kentucky.
+    { 70, 71, -1, -1 },  // LA Louisiana.
+    {  1,  2, -1, -1 },  // MA Massachusetts.
+    { 20, 21, -1, -1 },  // MD Maryland.
+    {  3,  4, -1, -1 },  // ME Maine.
+    { 96, 96, -1, -1 },  // MH Marshall Islands.
+    { 48, 49, -1, -1 },  // MI Michigan.
+    { 55, 56, -1, -1 },  // MN Minnesota.
+    { 63, 65, -1, -1 },  // MO Missouri.
+    { 96, 96, -1, -1 },  // MP Northern Mariana Islands.
+    { 38, 39, -1, -1 },  // MS Mississippi.
+    { 59, 59, -1, -1 },  // MT Montana (was a copy of the Minnesota range).
+    { 27, 28, -1, -1 },  // NC North Carolina.
+    { 58, 58, -1, -1 },  // ND North Dakota.
+    { 68, 69, -1, -1 },  // NE Nebraska.
+    {  3,  4, -1, -1 },  // NH New Hampshire.
+    {  7,  8, -1, -1 },  // NJ New Jersey.
+    { 87, 88, 86, -1 },  // NM New Mexico.
+    { 88, 89, 96, -1 },  // NV Nevada.
+    { 10, 14,  0,  6 },  // NY New York.
+    { 43, 45, -1, -1 },  // OH Ohio.
+    { 73, 74, -1, -1 },  // OK Oklahoma.
+    { 97, 97, -1, -1 },  // OR Oregon.
+    { 15, 19, -1, -1 },  // PA Pennsylvania.
+    {  6,  6,  0,  9 },  // PR Puerto Rico.
+    { 96, 96, -1, -1 },  // PW Palau.
+    {  2,  2, -1, -1 },  // RI Rhode Island.
+    { 29, 29, -1, -1 },  // SC South Carolina.
+    { 57, 57, -1, -1 },  // SD South Dakota.
+    { 37, 38, -1, -1 },  // TN Tennessee.
+    { 75, 79, 87, 88 },  // TX Texas.
+    { 84, 84, -1, -1 },  // UT Utah.
+    { 22, 24, 20, -1 },  // VA Virginia.
+    // NOTE(review): Virgin Islands zip codes appear to start with 008, i.e.
+    // prefix 00, which this range does not accept. Confirm against USPS data.
+    {  6,  9, -1, -1 },  // VI Virgin Islands.
+    {  5,  5, -1, -1 },  // VT Vermont.
+    { 98, 99, -1, -1 },  // WA Washington.
+    { 53, 54, -1, -1 },  // WI Wisconsin.
+    { 24, 26, -1, -1 },  // WV West Virginia.
+    { 82, 83, -1, -1 }   // WY Wyoming.
+  };
+
+  // Debug-only guards: the table is indexed with |state_index| and two
+  // characters of |word| are read below.
+  DCHECK_LT(state_index, static_cast<size_t>(ARRAYSIZE_UNSAFE(zip_range)));
+  DCHECK(word.end - word.begin >= 2);
+
+  // Zip numeric value for the first two characters.
+  DCHECK(IsAsciiDigit(*word.begin));
+  DCHECK(IsAsciiDigit(*(word.begin + 1)));
+  int zip_prefix = (*word.begin - '0') * 10 + (*(word.begin + 1) - '0');
+
+  if ((zip_prefix >= zip_range[state_index].low &&
+       zip_prefix <= zip_range[state_index].high) ||
+      zip_prefix == zip_range[state_index].exception1 ||
+      zip_prefix == zip_range[state_index].exception2) {
+    return true;
+  }
+  return false;
+}
+
+// Returns true if |word| is one of the supported location names (street,
+// avenue, road...), compared case-insensitively. Names flagged with
+// |allow_plural| are also accepted with a trailing 's'.
+bool IsValidLocationName(const Word& word) {
+  // Supported location names grouped by first letter.
+  static const struct LocationNameInfo {
+    const char* string;
+    char length;
+    bool allow_plural;
+  } location_names[157] = {
+    { "alley", 5, false }, { "annex", 5, false }, { "arcade", 6, false },
+    { "ave", 3, false }, { "ave.", 4, false }, { "avenue", 6, false },
+    { "alameda", 7, false },
+    { "bayou", 5, false }, { "beach", 5, false }, { "bend", 4, false },
+    { "bluff", 5, true }, { "bottom", 6, false }, { "boulevard", 9, false },
+    { "branch", 6, false }, { "bridge", 6, false }, { "brook", 5, true },
+    { "burg", 4, true }, { "bypass", 6, false }, { "broadway", 8, false },
+    { "camino", 6, false }, { "camp", 4, false }, { "canyon", 6, false },
+    { "cape", 4, false }, { "causeway", 8, false }, { "center", 6, true },
+    { "circle", 6, true }, { "cliff", 5, true }, { "club", 4, false },
+    { "common", 6, false }, { "corner", 6, true }, { "course", 6, false },
+    { "court", 5, true }, { "cove", 4, true }, { "creek", 5, false },
+    { "crescent", 8, false }, { "crest", 5, false }, { "crossing", 8, false },
+    { "crossroad", 9, false }, { "curve", 5, false }, { "circulo", 7, false },
+    { "dale", 4, false }, { "dam", 3, false }, { "divide", 6, false },
+    { "drive", 5, true },
+    { "estate", 6, true }, { "expressway", 10, false },
+    { "extension", 9, true },
+    { "fall", 4, true }, { "ferry", 5, false }, { "field", 5, true },
+    { "flat", 4, true }, { "ford", 4, true }, { "forest", 6, false },
+    { "forge", 5, true }, { "fork", 4, true }, { "fort", 4, false },
+    { "freeway", 7, false },
+    { "garden", 6, true }, { "gateway", 7, false }, { "glen", 4, true },
+    { "green", 5, true }, { "grove", 5, true },
+    { "harbor", 6, true }, { "haven", 5, false }, { "heights", 7, false },
+    { "highway", 7, false }, { "hill", 4, true }, { "hollow", 6, false },
+    { "inlet", 5, false }, { "island", 6, true }, { "isle", 4, false },
+    { "junction", 8, true },
+    { "key", 3, true }, { "knoll", 5, true },
+    { "lake", 4, true }, { "land", 4, false }, { "landing", 7, false },
+    { "lane", 4, false }, { "light", 5, true }, { "loaf", 4, false },
+    { "lock", 4, true }, { "lodge", 5, false }, { "loop", 4, false },
+    { "mall", 4, false }, { "manor", 5, true }, { "meadow", 6, true },
+    { "mews", 4, false }, { "mill", 4, true }, { "mission", 7, false },
+    { "motorway", 8, false }, { "mount", 5, false }, { "mountain", 8, true },
+    { "neck", 4, false },
+    { "orchard", 7, false }, { "oval", 4, false }, { "overpass", 8, false },
+    { "park", 4, true }, { "parkway", 7, true }, { "pass", 4, false },
+    { "passage", 7, false }, { "path", 4, false }, { "pike", 4, false },
+    { "pine", 4, true }, { "plain", 5, true }, { "plaza", 5, false },
+    { "point", 5, true }, { "port", 4, true }, { "prairie", 7, false },
+    { "privada", 7, false },
+    { "radial", 6, false }, { "ramp", 4, false }, { "ranch", 5, false },
+    { "rapid", 5, true }, { "rest", 4, false }, { "ridge", 5, true },
+    { "river", 5, false }, { "road", 4, true }, { "route", 5, false },
+    { "row", 3, false }, { "rue", 3, false }, { "run", 3, false },
+    { "shoal", 5, true }, { "shore", 5, true }, { "skyway", 6, false },
+    { "spring", 6, true }, { "spur", 4, true }, { "square", 6, true },
+    { "station", 7, false }, { "stravenue", 9, false }, { "stream", 6, false },
+    { "st", 2, false }, { "st.", 3, false }, { "street", 6, true },
+    { "summit", 6, false }, { "speedway", 8, false },
+    { "terrace", 7, false }, { "throughway", 10, false }, { "trace", 5, false },
+    { "track", 5, false }, { "trafficway", 10, false }, { "trail", 5, false },
+    { "tunnel", 6, false }, { "turnpike", 8, false },
+    { "underpass", 9, false }, { "union", 5, true },
+    { "valley", 6, true }, { "viaduct", 7, false }, { "view", 4, true },
+    { "village", 7, true }, { "ville", 5, false }, { "vista", 5, false },
+    { "walk", 4, true }, { "wall", 4, false }, { "way", 3, true },
+    { "well", 4, true },
+    { "xing", 4, false }, { "xrd", 3, false }
+  };
+
+  // Accumulative number of location names for each starting letter.
+  static const int location_names_accumulative[25] = {
+      0,   7,  19,  40,  44,
+     47,  57,  62,  68,  71,
+     72,  74,  83,  92,  93,
+     96, 109, 109, 121, 135,
+    143, 145, 151, 155, 157
+  };
+
+  DCHECK_EQ(
+      location_names_accumulative[arraysize(location_names_accumulative) - 1],
+      static_cast<int>(ARRAYSIZE_UNSAFE(location_names)));
+
+  // An empty word cannot be a location name and must not be dereferenced.
+  if (word.begin == word.end)
+    return false;
+
+  if (!IsAsciiAlpha(*word.begin))
+    return false;
+
+  // No location names start with y, z.
+  char16 first_letter = base::ToLowerASCII(*word.begin);
+  if (first_letter > 'x')
+    return false;
+
+  DCHECK(first_letter >= 'a');
+  int index = first_letter - 'a';
+  int length = std::distance(word.begin, word.end);
+  for (int i = location_names_accumulative[index];
+       i < location_names_accumulative[index + 1]; ++i) {
+    // Cheap filter by length before comparing characters: the word must be
+    // as long as the name, or one longer when a plural is allowed. The former
+    // condition never skipped names without plural; the comparison below
+    // rejected them anyway, so results are unchanged.
+    const bool exact_length = location_names[i].length == length;
+    const bool plural_length = location_names[i].allow_plural &&
+        location_names[i].length + 1 == length;
+    if (!exact_length && !plural_length)
+      continue;
+
+    if (LowerCaseEqualsASCIIWithPlural(word.begin, word.end,
+                                       location_names[i].string,
+                                       location_names[i].allow_plural)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+} // namespace internal
+
+} // namespace address_parser
+
+} // namespace content
diff --git a/content/common/android/address_parser_internal.h b/content/common/android/address_parser_internal.h
new file mode 100644
index 0000000..75ebb7b
--- /dev/null
+++ b/content/common/android/address_parser_internal.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CONTENT_COMMON_ADDRESS_PARSER_INTERNAL_H_
+#define CONTENT_COMMON_ADDRESS_PARSER_INTERNAL_H_
+#pragma once
+
+#include <vector>
+
+#include "base/string_tokenizer.h"
+
+namespace content {
+
+namespace address_parser {
+
+// Internal classes and functions for address parsing.
+namespace internal {
+
+// A token inside a string16, expressed as the half-open iterator range
+// [begin, end). A Word stores only iterators, so the string it points into
+// must outlive it.
+struct Word {
+  // Leaves both iterators default-constructed.
+  Word() {}
+  Word(const string16::const_iterator& begin,
+       const string16::const_iterator& end);
+
+  string16::const_iterator begin;
+  string16::const_iterator end;
+};
+
+// Locates a house number inside a range of UTF-16 characters.
+class HouseNumberParser {
+ public:
+  // Zero the counters so that the parser never holds indeterminate values,
+  // even before the first call to Parse().
+  HouseNumberParser() : num_digits_(0), result_chars_(0) {}
+
+  // Scans [begin, end) for a house number. Returns true and stores the
+  // matching range in |word| when one is found, false otherwise.
+  // NOTE(review): the contents of |word| on failure cannot be determined from
+  // this header; confirm against the definition before relying on them.
+  bool Parse(const string16::const_iterator& begin,
+             const string16::const_iterator& end,
+             Word* word);
+
+ private:
+  // Character classification helpers. Presumably they decide which characters
+  // may come right before / right after a house number; confirm against the
+  // definitions.
+  static inline bool IsPreDelimiter(char16 character);
+  static inline bool IsPostDelimiter(char16 character);
+
+  // Parsing state helpers used by Parse(); see the definitions for details.
+  inline void RestartOnNextDelimiter();
+  inline bool CheckFinished(Word* word) const;
+  inline void AcceptChars(size_t num_chars);
+  inline void SkipChars(size_t num_chars);
+  inline void ResetState();
+
+  // Iterators to the beginning, current position and ending of the string
+  // being currently parsed.
+  string16::const_iterator begin_;
+  string16::const_iterator it_;
+  string16::const_iterator end_;
+
+  // Number of digits found in the current result candidate.
+  size_t num_digits_;
+
+  // Number of characters previous to the current iterator that belong
+  // to the current result candidate.
+  size_t result_chars_;
+
+  DISALLOW_COPY_AND_ASSIGN(HouseNumberParser);
+};
+
+// Ordered collection of the Words extracted from a piece of text.
+typedef std::vector<Word> WordList;
+// Tokenizer over string16 contents whose tokens are exposed as
+// string16::const_iterator ranges, matching the iterators stored in Word.
+typedef StringTokenizerT<string16, string16::const_iterator>
+ String16Tokenizer;
+
+// Checks whether a state name or code starts at |words[state_first_word]|.
+// Returns true when one is recognized; |state_index| then receives the index
+// of the state that IsZipValid() expects, and |state_last_word| is an output
+// position within |words|.
+// NOTE(review): presumably |state_last_word| is the index of the last word of
+// the state name, and further tokens are pulled from |tokenizer| and appended
+// to |words| to match multi-word names - confirm against the definition.
+bool FindStateStartingInWord(WordList* words,
+ size_t state_first_word,
+ size_t* state_last_word,
+ String16Tokenizer* tokenizer,
+ size_t* state_index);
+
+// Returns true if |word| matches, ignoring ASCII case, one of the known
+// location names (e.g. "street"); entries that allow it also match in plural.
+bool IsValidLocationName(const Word& word);
+// Returns true if |word| is an acceptable zip code for the state identified by
+// |state_index|, an index as produced by FindStateStartingInWord().
+bool IsZipValid(const Word& word, size_t state_index);
+// NOTE(review): presumably the state-specific part of the IsZipValid() check;
+// the definition is not visible here - confirm before relying on this.
+bool IsZipValidForState(const Word& word, size_t state_index);
+
+} // namespace internal
+
+} // namespace address_parser
+
+} // namespace content
+
+#endif // CONTENT_COMMON_ADDRESS_PARSER_INTERNAL_H_
diff --git a/content/common/android/address_parser_unittest.cc b/content/common/android/address_parser_unittest.cc
new file mode 100644
index 0000000..99dfcc8
--- /dev/null
+++ b/content/common/android/address_parser_unittest.cc
@@ -0,0 +1,594 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/memory/scoped_ptr.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "content/common/android/address_parser.h"
+#include "content/common/android/address_parser_internal.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using namespace content::address_parser;
+using namespace content::address_parser::internal;
+
+// Test fixture wrapping the address parser entry points so that test cases
+// can be written with plain UTF-8 std::strings.
+class AddressParserTest : public testing::Test {
+ public:
+  AddressParserTest() {}
+
+  // Splits |content| on whitespace and appends one Word per token to |words|.
+  // The Words hold iterators into |content|, which must outlive them.
+  void TokenizeWords(const string16& content, WordList* words) const {
+    String16Tokenizer tokenizer(content.begin(), content.end(),
+                                kWhitespaceUTF16);
+    while (tokenizer.GetNext()) {
+      words->push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
+    }
+  }
+
+  // Returns the house number found in |content|, or an empty string when the
+  // parser reports none.
+  std::string GetHouseNumber(const std::string& content) const {
+    string16 content_16 = UTF8ToUTF16(content);
+    string16 result;
+
+    HouseNumberParser parser;
+    Word word;
+    if (parser.Parse(content_16.begin(), content_16.end(), &word))
+      result = string16(word.begin, word.end);
+    return UTF16ToUTF8(result);
+  }
+
+  bool ContainsHouseNumber(const std::string& content) const {
+    return !GetHouseNumber(content).empty();
+  }
+
+  // Looks for a state name or code starting at the first word of |state|.
+  // On success returns true and stores the state's index in |state_index|.
+  bool GetState(const std::string& state, size_t* state_index) const {
+    string16 state_16 = UTF8ToUTF16(state);
+    String16Tokenizer tokenizer(state_16.begin(), state_16.end(),
+                                kWhitespaceUTF16);
+    if (!tokenizer.GetNext())
+      return false;
+
+    size_t state_last_word = 0;
+    WordList words;
+    words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
+    return FindStateStartingInWord(&words, 0, &state_last_word, &tokenizer,
+                                   state_index);
+  }
+
+  bool IsState(const std::string& state) const {
+    size_t state_index = 0;
+    return GetState(state, &state_index);
+  }
+
+  // Returns whether |zip| is a valid zip code for |state|. Both |state| being
+  // a recognized state and |zip| being a single word are preconditions: a
+  // test failure is recorded and false is returned when either is violated.
+  bool IsZipValid(const std::string& zip, const std::string& state) const {
+    size_t state_index = 0;
+    const bool state_found = GetState(state, &state_index);
+    EXPECT_TRUE(state_found);
+    // EXPECT_* does not abort: bail out rather than validating against an
+    // index that was never set.
+    if (!state_found)
+      return false;
+
+    string16 zip_16 = UTF8ToUTF16(zip);
+    WordList words;
+    TokenizeWords(zip_16, &words);
+    EXPECT_EQ(1U, words.size());
+    // Calling front() on an empty list would be undefined behavior.
+    if (words.size() != 1)
+      return false;
+    return ::IsZipValid(words.front(), state_index);
+  }
+
+  // Returns whether |street| is a valid location name. |street| must be a
+  // single word: otherwise a test failure is recorded and false is returned.
+  bool IsLocationName(const std::string& street) const {
+    string16 street_16 = UTF8ToUTF16(street);
+    WordList words;
+    TokenizeWords(street_16, &words);
+    EXPECT_EQ(1U, words.size());
+    // Calling front() on an empty list would be undefined behavior.
+    if (words.size() != 1)
+      return false;
+    return IsValidLocationName(words.front());
+  }
+
+  // Returns the first address found in |content|, or an empty string when
+  // there is none.
+  std::string FindAddress(const std::string& content) const {
+    string16 content_16 = UTF8ToUTF16(content);
+    string16 result_16;
+    size_t start = 0;
+    size_t end = 0;
+    if (::FindAddress(content_16.begin(), content_16.end(), &start, &end))
+      result_16 = content_16.substr(start, end - start);
+    return UTF16ToUTF8(result_16);
+  }
+
+  bool ContainsAddress(const std::string& content) const {
+    return !FindAddress(content).empty();
+  }
+
+  // True when the address found in |content| spans the whole string.
+  bool IsAddress(const std::string& content) const {
+    return FindAddress(content) == content;
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(AddressParserTest);
+};
+
+// Checks which strings HouseNumberParser accepts and what it extracts.
+// EXPECT_EQ takes the expected value first so that failure messages label the
+// two values correctly.
+TEST_F(AddressParserTest, HouseNumber) {
+  // Test cases with valid house numbers.
+  EXPECT_EQ("4", GetHouseNumber("4 my house"));
+  EXPECT_EQ("4", GetHouseNumber("Something 4 my house"));
+  EXPECT_EQ("4", GetHouseNumber("4"));
+  EXPECT_EQ("4", GetHouseNumber(" 4,5"));
+  EXPECT_EQ("one", GetHouseNumber("one"));
+  EXPECT_EQ("One", GetHouseNumber("Number One somewhere"));
+  EXPECT_EQ("4", GetHouseNumber("Testing \n4\n"));
+  EXPECT_EQ("1ST", GetHouseNumber("Foo 1ST"));
+  EXPECT_EQ("2nd", GetHouseNumber("Bar 2nd"));
+  EXPECT_EQ("3rd", GetHouseNumber("Blah 3rd"));
+  EXPECT_EQ("4th", GetHouseNumber("4th"));
+  EXPECT_EQ("11th", GetHouseNumber("Blah 11th"));
+  EXPECT_EQ("12th", GetHouseNumber("Blah 12th meh"));
+  EXPECT_EQ("13th", GetHouseNumber("Blah 13th moo"));
+  EXPECT_EQ("211st", GetHouseNumber("211st"));
+  EXPECT_EQ("1A", GetHouseNumber("1A"));
+  EXPECT_EQ("35", GetHouseNumber("number:35"));
+  EXPECT_EQ("12345", GetHouseNumber("five digits at most: 12345"));
+  EXPECT_EQ("123", GetHouseNumber("'123'"));
+  EXPECT_EQ("123", GetHouseNumber("\"123\""));
+  EXPECT_EQ("123", GetHouseNumber("\"123, something\""));
+  EXPECT_EQ("12-34", GetHouseNumber("Testing 12-34"));
+  EXPECT_EQ("12-34c", GetHouseNumber("Testing 12-34c,d"));
+  EXPECT_EQ("76", GetHouseNumber("住所は:76 Buckingham Palace Roadです"));
+
+  // Test cases without valid house numbers.
+  EXPECT_FALSE(ContainsHouseNumber("0th"));
+  EXPECT_FALSE(ContainsHouseNumber("25st"));
+  EXPECT_FALSE(ContainsHouseNumber("111th"));
+  EXPECT_FALSE(ContainsHouseNumber("011th"));
+  EXPECT_FALSE(ContainsHouseNumber("27AZ"));
+  EXPECT_FALSE(ContainsHouseNumber("22ºC"));
+  EXPECT_FALSE(ContainsHouseNumber("3.141592"));
+  EXPECT_FALSE(ContainsHouseNumber("more than five digits: 123456"));
+  EXPECT_FALSE(ContainsHouseNumber("kjhdfkajsdhf98uf93h"));
+  EXPECT_FALSE(ContainsHouseNumber("これはテストです。"));
+  EXPECT_FALSE(ContainsHouseNumber("Number On"));
+  EXPECT_FALSE(ContainsHouseNumber("2: foo"));
+  EXPECT_FALSE(ContainsHouseNumber("12-"));
+  EXPECT_FALSE(ContainsHouseNumber("\n\"' \t, "));
+  EXPECT_FALSE(ContainsHouseNumber(""));
+}
+
+// Spot-checks state detection. The complete set of state codes and names is
+// covered, together with the state indices they map to, by the zip code test.
+TEST_F(AddressParserTest, FindState) {
+  static const char* const kStates[] = { "CALIFORNIA", "ca" };
+  for (size_t i = 0; i < arraysize(kStates); ++i)
+    EXPECT_TRUE(IsState(kStates[i])) << kStates[i];
+
+  static const char* const kNotStates[] = {
+    "californi", "northern mariana", "northern mariana island", "zz"
+  };
+  for (size_t i = 0; i < arraysize(kNotStates); ++i)
+    EXPECT_FALSE(IsState(kNotStates[i])) << kNotStates[i];
+}
+
+// Checks zip code validation. Every table below holds { zip, state } pairs.
+TEST_F(AddressParserTest, ZipCode) {
+  static const char* const kValidZips[][2] = {
+    { "90000", "CA" }, { "01234", "MA" }, { "99999-9999", "Alaska" }
+  };
+  for (size_t i = 0; i < arraysize(kValidZips); ++i) {
+    EXPECT_TRUE(IsZipValid(kValidZips[i][0], kValidZips[i][1]))
+        << kValidZips[i][0] << ", " << kValidZips[i][1];
+  }
+
+  static const char* const kInvalidZips[][2] = {
+    { "999999999", "Alaska" }, { "9999-99999", "Alaska" },
+    { "999999999-", "Alaska" }, { "99999-999a", "Alaska" },
+    { "99999--9999", "Alaska" }, { "90000o", "CA" },
+    { "01234", "CA" }, { "01234-", "MA" }
+  };
+  for (size_t i = 0; i < arraysize(kInvalidZips); ++i) {
+    EXPECT_FALSE(IsZipValid(kInvalidZips[i][0], kInvalidZips[i][1]))
+        << kInvalidZips[i][0] << ", " << kInvalidZips[i][1];
+  }
+
+  // Test the state index against the zip range table: one row per state,
+  // first by code and then by full name.
+  static const char* const kStateZips[][2] = {
+    { "99000", "AK" }, { "99000", "Alaska" },
+    { "35000", "AL" }, { "36000", "Alabama" },
+    { "71000", "AR" }, { "72000", "Arkansas" },
+    { "96000", "AS" }, { "96000", "American Samoa" },
+    { "85000", "AZ" }, { "86000", "Arizona" },
+    { "90000", "CA" }, { "96000", "California" },
+    { "80000", "CO" }, { "81000", "Colorado" },
+    { "06000", "CT" }, { "06000", "Connecticut" },
+    { "20000", "DC" }, { "20000", "District of Columbia" },
+    { "19000", "DE" }, { "19000", "Delaware" },
+    { "32000", "FL" }, { "34000", "Florida" },
+    { "96000", "FM" }, { "96000", "Federated States of Micronesia" },
+    { "30000", "GA" }, { "31000", "Georgia" },
+    { "96000", "GU" }, { "96000", "Guam" },
+    { "96000", "HI" }, { "96000", "Hawaii" },
+    { "50000", "IA" }, { "52000", "Iowa" },
+    { "83000", "ID" }, { "83000", "Idaho" },
+    { "60000", "IL" }, { "62000", "Illinois" },
+    { "46000", "IN" }, { "47000", "Indiana" },
+    { "66000", "KS" }, { "67000", "Kansas" },
+    { "40000", "KY" }, { "42000", "Kentucky" },
+    { "70000", "LA" }, { "71000", "Louisiana" },
+    { "01000", "MA" }, { "02000", "Massachusetts" },
+    { "20000", "MD" }, { "21000", "Maryland" },
+    { "03000", "ME" }, { "04000", "Maine" },
+    { "96000", "MH" }, { "96000", "Marshall Islands" },
+    { "48000", "MI" }, { "49000", "Michigan" },
+    { "55000", "MN" }, { "56000", "Minnesota" },
+    { "63000", "MO" }, { "65000", "Missouri" },
+    { "96000", "MP" }, { "96000", "Northern Mariana Islands" },
+    { "38000", "MS" }, { "39000", "Mississippi" },
+    { "55000", "MT" }, { "56000", "Montana" },
+    { "27000", "NC" }, { "28000", "North Carolina" },
+    { "58000", "ND" }, { "58000", "North Dakota" },
+    { "68000", "NE" }, { "69000", "Nebraska" },
+    { "03000", "NH" }, { "04000", "New Hampshire" },
+    { "07000", "NJ" }, { "08000", "New Jersey" },
+    { "87000", "NM" }, { "88000", "New Mexico" },
+    { "88000", "NV" }, { "89000", "Nevada" },
+    { "10000", "NY" }, { "14000", "New York" },
+    { "43000", "OH" }, { "45000", "Ohio" },
+    { "73000", "OK" }, { "74000", "Oklahoma" },
+    { "97000", "OR" }, { "97000", "Oregon" },
+    { "15000", "PA" }, { "19000", "Pennsylvania" },
+    { "06000", "PR" }, { "06000", "Puerto Rico" },
+    { "96000", "PW" }, { "96000", "Palau" },
+    { "02000", "RI" }, { "02000", "Rhode Island" },
+    { "29000", "SC" }, { "29000", "South Carolina" },
+    { "57000", "SD" }, { "57000", "South Dakota" },
+    { "37000", "TN" }, { "38000", "Tennessee" },
+    { "75000", "TX" }, { "79000", "Texas" },
+    { "84000", "UT" }, { "84000", "Utah" },
+    { "22000", "VA" }, { "24000", "Virginia" },
+    { "06000", "VI" }, { "09000", "Virgin Islands" },
+    { "05000", "VT" }, { "05000", "Vermont" },
+    { "98000", "WA" }, { "99000", "Washington" },
+    { "53000", "WI" }, { "54000", "Wisconsin" },
+    { "24000", "WV" }, { "26000", "West Virginia" },
+    { "82000", "WY" }, { "83000", "Wyoming" }
+  };
+  for (size_t i = 0; i < arraysize(kStateZips); ++i) {
+    EXPECT_TRUE(IsZipValid(kStateZips[i][0], kStateZips[i][1]))
+        << kStateZips[i][0] << ", " << kStateZips[i][1];
+  }
+}
+
+// Checks location (street type) name recognition.
+TEST_F(AddressParserTest, LocationName) {
+  static const char* const kInvalidNames[] = { "str-eet", "somewhere" };
+  for (size_t i = 0; i < arraysize(kInvalidNames); ++i)
+    EXPECT_FALSE(IsLocationName(kInvalidNames[i])) << kInvalidNames[i];
+
+  // All supported street names and expected plural cases, grouped by their
+  // initial letter.
+  static const char* const kValidNames[] = {
+    "alley", "annex", "arcade", "ave.", "avenue", "alameda",
+    "bayou", "beach", "bend", "bluff", "bluffs", "bottom", "boulevard",
+    "branch", "bridge", "brook", "brooks", "burg", "burgs", "bypass",
+    "broadway",
+    "camino", "camp", "canyon", "cape", "causeway", "center", "centers",
+    "circle", "circles", "cliff", "cliffs", "club", "common", "corner",
+    "corners", "course", "court", "courts", "cove", "coves", "creek",
+    "crescent", "crest", "crossing", "crossroad", "curve", "circulo",
+    "dale", "dam", "divide", "drive", "drives",
+    "estate", "estates", "expressway", "extension", "extensions",
+    "fall", "falls", "ferry", "field", "fields", "flat", "flats", "ford",
+    "fords", "forest", "forge", "forges", "fork", "forks", "fort", "freeway",
+    "garden", "gardens", "gateway", "glen", "glens", "green", "greens",
+    "grove", "groves",
+    "harbor", "harbors", "haven", "heights", "highway", "hill", "hills",
+    "hollow",
+    "inlet", "island", "islands", "isle",
+    "junction", "junctions",
+    "key", "keys", "knoll", "knolls",
+    "lake", "lakes", "land", "landing", "lane", "light", "lights", "loaf",
+    "lock", "locks", "lodge", "loop",
+    "mall", "manor", "manors", "meadow", "meadows", "mews", "mill", "mills",
+    "mission", "motorway", "mount", "mountain", "mountains",
+    "neck",
+    "orchard", "oval", "overpass",
+    "park", "parks", "parkway", "parkways", "pass", "passage", "path",
+    "pike", "pine", "pines", "plain", "plains", "plaza", "point", "points",
+    "port", "ports", "prairie", "privada",
+    "radial", "ramp", "ranch", "rapid", "rapids", "rest", "ridge", "ridges",
+    "river", "road", "roads", "route", "row", "rue", "run",
+    "shoal", "shoals", "shore", "shores", "skyway", "spring", "springs",
+    "spur", "spurs", "square", "squares", "station", "stravenue", "stream",
+    "st.", "street", "streets", "summit", "speedway",
+    "terrace", "throughway", "trace", "track", "trafficway", "trail",
+    "tunnel", "turnpike",
+    "underpass", "union", "unions",
+    "valley", "valleys", "viaduct", "view", "views", "village", "villages",
+    "ville", "vista",
+    "walk", "walks", "wall", "way", "ways", "well", "wells",
+    "xing", "xrd"
+  };
+  for (size_t i = 0; i < arraysize(kValidNames); ++i)
+    EXPECT_TRUE(IsLocationName(kValidNames[i])) << kValidNames[i];
+}
+
+// Checks that numbers preceding the real house number are left out of the
+// detected address. EXPECT_EQ takes the expected value first so that failure
+// messages label the two values correctly.
+TEST_F(AddressParserTest, NumberPrefixCases) {
+  EXPECT_EQ("750 Fifth Ave. San Diego, California 92101",
+            FindAddress(
+                "Cafe 21\n750 Fifth Ave. San Diego, California 92101"));
+  EXPECT_EQ(
+      "10250 Santa Monica Boulevard Los Angeles, CA 90067",
+      FindAddress(
+          "Century City 15\n 10250 Santa Monica Boulevard Los Angeles, "
+          "CA 90067"));
+  EXPECT_EQ("67 My Street, Somewhere, NY 10000",
+            FindAddress("123 45\n67 My Street, Somewhere, NY 10000"));
+  EXPECT_TRUE(IsAddress("123 4th Avenue, Somewhere in NY 10000"));
+}
+
+// End-to-end checks of address detection. Long inputs are built from adjacent
+// string literals, so their contents never depend on source indentation.
+TEST_F(AddressParserTest, FullAddress) {
+  // Test US Google corporate addresses. Expects a full string match.
+  EXPECT_TRUE(IsAddress("1600 Amphitheatre Parkway Mountain View, CA 94043"));
+  EXPECT_TRUE(IsAddress("201 S. Division St. Suite 500 Ann Arbor, MI 48104"));
+  EXPECT_TRUE(ContainsAddress(
+      "Millennium at Midtown 10 10th Street NE Suite 600 Atlanta, GA 30309"));
+  EXPECT_TRUE(IsAddress(
+      "9606 North MoPac Expressway Suite 400 Austin, TX 78759"));
+  EXPECT_TRUE(IsAddress("2590 Pearl Street Suite 100 Boulder, CO 80302"));
+  EXPECT_TRUE(IsAddress("5 Cambridge Center, Floors 3-6 Cambridge, MA 02142"));
+  EXPECT_TRUE(IsAddress("410 Market St Suite 415 Chapel Hill, NC 27516"));
+  EXPECT_TRUE(IsAddress("20 West Kinzie St. Chicago, IL 60654"));
+  EXPECT_TRUE(IsAddress("114 Willits Street Birmingham, MI 48009"));
+  EXPECT_TRUE(IsAddress("19540 Jamboree Road 2nd Floor Irvine, CA 92612"));
+  EXPECT_TRUE(IsAddress("747 6th Street South, Kirkland, WA 98033"));
+  EXPECT_TRUE(IsAddress("301 S. Blount St. Suite 301 Madison, WI 53703"));
+  EXPECT_TRUE(IsAddress("76 Ninth Avenue 4th Floor New York, NY 10011"));
+  EXPECT_TRUE(ContainsAddress(
+      "Chelsea Markset Space, 75 Ninth Avenue 2nd and 4th Floors New York, "
+      "NY 10011"));
+  EXPECT_TRUE(IsAddress("6425 Penn Ave. Suite 700 Pittsburgh, PA 15206"));
+  EXPECT_TRUE(IsAddress("1818 Library Street Suite 400 Reston, VA 20190"));
+  EXPECT_TRUE(IsAddress("345 Spear Street Floors 2-4 San Francisco, CA 94105"));
+  EXPECT_TRUE(IsAddress("604 Arizona Avenue Santa Monica, CA 90401"));
+  EXPECT_TRUE(IsAddress("651 N. 34th St. Seattle, WA 98103"));
+  EXPECT_TRUE(IsAddress(
+      "1101 New York Avenue, N.W. Second Floor Washington, DC 20005"));
+
+  // Other tests.
+  EXPECT_TRUE(IsAddress("57th Street and Lake Shore Drive\nChicago, IL 60637"));
+  EXPECT_TRUE(IsAddress("308 Congress Street Boston, MA 02210"));
+  EXPECT_TRUE(ContainsAddress(
+      "Central Park West at 79th Street, New York, NY, 10024-5192"));
+  EXPECT_TRUE(ContainsAddress(
+      "Lincoln Park | 100 34th Avenue • San Francisco, CA 94121 | 41575036"));
+
+  // An address surrounded by unrelated text is extracted on its own.
+  EXPECT_EQ(
+      "1600 Amphitheatre Parkway Mountain View, CA 94043",
+      FindAddress(
+          "Lorem ipsum dolor sit amet, consectetur adipisicing "
+          "elit, sed do 1600 Amphitheatre Parkway Mountain View, CA 94043 "
+          "eiusmod tempor incididunt ut labore et dolore magna aliqua."));
+
+  // Only the first of two consecutive addresses is returned.
+  EXPECT_EQ(
+      "2590 Pearl Street Suite 100 Boulder, CO 80302",
+      FindAddress(
+          "2590 Pearl Street Suite 100 Boulder, CO 80302 6425 "
+          "Penn Ave. Suite 700 Pittsburgh, PA 15206"));
+
+  EXPECT_TRUE(ContainsAddress(
+      "住所は 1600 Amphitheatre Parkway Mountain View, CA 94043 です。"));
+
+  EXPECT_FALSE(ContainsAddress("1 st. too-short, CA 90000"));
+  EXPECT_TRUE(ContainsAddress("1 st. long enough, CA 90000"));
+
+  EXPECT_TRUE(ContainsAddress("1 st. some city in al 35000"));
+  EXPECT_FALSE(ContainsAddress("1 book st Aquinas et al 35000"));
+
+  EXPECT_FALSE(ContainsAddress("1 this comes too late: street, CA 90000"));
+  EXPECT_TRUE(ContainsAddress("1 this is ok: street, CA 90000"));
+
+  EXPECT_FALSE(ContainsAddress(
+      "1 street I love verbosity, so I'm writing an address with too many "
+      "words CA 90000"));
+  EXPECT_TRUE(ContainsAddress("1 street 2 3 4 5 6 7 8 9 10 11 12, CA 90000"));
+
+  EXPECT_TRUE(IsAddress("79th Street 1st Floor New York City, NY 10024-5192"));
+
+  EXPECT_FALSE(ContainsAddress("123 Fake Street, Springfield, Springfield"));
+  EXPECT_FALSE(ContainsAddress("999 Street Avenue, City, ZZ 98765"));
+  EXPECT_FALSE(ContainsAddress("76 Here be dragons, CA 94043"));
+  EXPECT_FALSE(ContainsAddress("1 This, has, too* many, lines, to, be* valid"));
+  EXPECT_FALSE(ContainsAddress(
+      "1 Supercalifragilisticexpialidocious is too long, CA 90000"));
+  EXPECT_FALSE(ContainsAddress(""));
+}