// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "content/common/android/address_parser.h" #include "base/logging.h" #include "base/strings/string_util.h" #include "content/common/android/address_parser_internal.h" namespace { // Minimum number of words in an address after the house number // before a state is expected to be found. // A value too high can miss short addresses. const size_t kMinAddressWords = 3; // Maximum number of words allowed in an address between the house number // and the state, both not included. const size_t kMaxAddressWords = 12; // Maximum number of lines allowed in an address between the house number // and the state, both not included. const size_t kMaxAddressLines = 5; // Maximum length allowed for any address word between the house number // and the state, both not included. const size_t kMaxAddressNameWordLength = 25; // Maximum number of words after the house number in which the location name // should be found. const size_t kMaxLocationNameDistance = 4; // Additional characters used as new line delimiters. const base::char16 kNewlineDelimiters[] = { '\n', ',', '*', 0x2022, // Unicode bullet 0, }; } // anonymous namespace namespace content { namespace address_parser { using namespace internal; bool FindAddress(const base::string16& text, base::string16* address) { size_t start, end; if (FindAddress(text.begin(), text.end(), &start, &end)) { size_t len = end >= start ? end - start : 0; address->assign(text.substr(start, len)); return true; } return false; } bool FindAddress(const base::string16::const_iterator& begin, const base::string16::const_iterator& end, size_t* start_pos, size_t* end_pos) { HouseNumberParser house_number_parser; // Keep going through the input string until a potential house number is // detected. Start tokenizing the following words to find a valid // street name within a word range. Then, find a state name followed // by a valid zip code for that state. Also keep a look for any other // possible house numbers to continue from in case of no match and for // state names not followed by a zip code (e.g. New York, NY 10000). const base::string16 newline_delimiters = kNewlineDelimiters; const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; for (base::string16::const_iterator it = begin; it != end; ) { Word house_number; if (!house_number_parser.Parse(it, end, &house_number)) return false; String16Tokenizer tokenizer(house_number.end, end, delimiters); tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); WordList words; words.push_back(house_number); bool found_location_name = false; bool continue_on_house_number = true; bool consecutive_house_numbers = true; size_t next_house_number_word = 0; size_t num_lines = 1; // Don't include the house number in the word count. size_t next_word = 1; for (; next_word <= kMaxAddressWords + 1; ++next_word) { // Extract a new word from the tokenizer. if (next_word == words.size()) { do { if (!tokenizer.GetNext()) return false; // Check the number of address lines. if (tokenizer.token_is_delim() && newline_delimiters.find( *tokenizer.token_begin()) != base::string16::npos) { ++num_lines; } } while (tokenizer.token_is_delim()); if (num_lines > kMaxAddressLines) break; words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); } // Check the word length. If too long, don't try to continue from // the next house number as no address can hold this word. const Word& current_word = words[next_word]; DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); size_t current_word_length = std::distance( current_word.begin, current_word.end); if (current_word_length > kMaxAddressNameWordLength) { continue_on_house_number = false; break; } // Check if the new word is a valid house number. if (house_number_parser.Parse(current_word.begin, current_word.end, NULL)) { // Increase the number of consecutive house numbers since the beginning. if (consecutive_house_numbers) { // Check if there is a new line between consecutive house numbers. // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." if (num_lines > 1) { next_house_number_word = next_word; break; } } // Keep the next candidate to resume parsing from in case of failure. if (next_house_number_word == 0) { next_house_number_word = next_word; continue; } } else { consecutive_house_numbers = false; } // Look for location names in the words after the house number. // A range limitation is introduced to avoid matching // anything that starts with a number before a legitimate address. if (next_word <= kMaxLocationNameDistance && IsValidLocationName(current_word)) { found_location_name = true; continue; } // Don't count the house number. if (next_word > kMinAddressWords) { // Looking for the state is likely to add new words to the list while // checking for multi-word state names. size_t state_first_word = next_word; size_t state_last_word, state_index; if (FindStateStartingInWord(&words, state_first_word, &state_last_word, &tokenizer, &state_index)) { // A location name should have been found at this point. if (!found_location_name) break; // Explicitly exclude "et al", as "al" is a valid state code. if (current_word_length == 2 && words.size() > 2) { const Word& previous_word = words[state_first_word - 1]; if (previous_word.end - previous_word.begin == 2 && LowerCaseEqualsASCII(previous_word.begin, previous_word.end, "et") && LowerCaseEqualsASCII(current_word.begin, current_word.end, "al")) break; } // Extract one more word from the tokenizer if not already available. size_t zip_word = state_last_word + 1; if (zip_word == words.size()) { do { if (!tokenizer.GetNext()) { // The address ends with a state name without a zip code. This // is legal according to WebView#findAddress public // documentation. *start_pos = words[0].begin - begin; *end_pos = words[state_last_word].end - begin; return true; } } while (tokenizer.token_is_delim()); words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); } // Check the parsing validity and state range of the zip code. next_word = state_last_word; if (!IsZipValid(words[zip_word], state_index)) continue; *start_pos = words[0].begin - begin; *end_pos = words[zip_word].end - begin; return true; } } } // Avoid skipping too many words because of a non-address number // at the beginning of the contents to parse. if (continue_on_house_number && next_house_number_word > 0) { it = words[next_house_number_word].begin; } else { DCHECK(!words.empty()); next_word = std::min(next_word, words.size() - 1); it = words[next_word].end; } } return false; } } // namespace address_parser } // namespace content