summaryrefslogtreecommitdiffstats
path: root/content/common/android/address_parser.cc
blob: 30fa304ba4cd7d1219b5c244d1343bd0f953ae3e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/common/android/address_parser.h"

#include "base/logging.h"
#include "base/strings/string_util.h"
#include "content/common/android/address_parser_internal.h"

namespace {

// Tuning limits for the address search. Word counts below never include the
// house number itself.

// The state lookup only starts once the word index after the house number
// exceeds this value. Raising it too far makes short addresses undetectable.
const size_t kMinAddressWords = 3;

// Upper bound on the number of words that may sit between the house number
// and the state (exclusive on both ends).
const size_t kMaxAddressWords = 12;

// Upper bound on the number of lines that may sit between the house number
// and the state (exclusive on both ends).
const size_t kMaxAddressLines = 5;

// Longest word, in characters, accepted between the house number and the
// state (exclusive on both ends).
const size_t kMaxAddressNameWordLength = 25;

// A location name is only accepted if it appears within this many words
// after the house number.
const size_t kMaxLocationNameDistance = 4;

// Characters counted as line breaks when tallying address lines.
// The array is zero-terminated so it can be used as a string16 literal.
const base::char16 kNewlineDelimiters[] = {
  '\n', ',', '*',
  0x2022,  // Unicode bullet
  0,       // Terminator
};

}  // namespace

namespace content {

namespace address_parser {

using namespace internal;

// Convenience overload working on a whole string. Runs the iterator-based
// FindAddress() over |text| and, when a match is reported, copies the matched
// substring into |address|. Returns true on a match; otherwise returns false
// and leaves |address| untouched.
bool FindAddress(const base::string16& text, base::string16* address) {
  size_t match_begin, match_end;
  if (!FindAddress(text.begin(), text.end(), &match_begin, &match_end))
    return false;

  // Guard against an inverted range so the length never underflows.
  size_t match_length = 0;
  if (match_end >= match_begin)
    match_length = match_end - match_begin;

  address->assign(text.substr(match_begin, match_length));
  return true;
}

// Scans the range [begin, end) for the first address of the form
// "<house number> <words, including a location name> <state> <zip code>".
//
// On success returns true and stores in |start_pos| the offset (relative to
// |begin|) of the first character of the house number, and in |end_pos| the
// offset (relative to |begin|) of the end of the zip code word.
// Returns false, leaving the output parameters untouched, when no address is
// found or when the input runs out while an address candidate is still being
// tokenized.
bool FindAddress(const base::string16::const_iterator& begin,
                 const base::string16::const_iterator& end,
                 size_t* start_pos,
                 size_t* end_pos) {
  HouseNumberParser house_number_parser;

  // Keep going through the input string until a potential house number is
  // detected. Start tokenizing the following words to find a valid
  // street name within a word range. Then, find a state name followed
  // by a valid zip code for that state. Also keep a look for any other
  // possible house numbers to continue from in case of no match and for
  // state names not followed by a zip code (e.g. New York, NY 10000).
  const base::string16 newline_delimiters = kNewlineDelimiters;
  const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
  for (base::string16::const_iterator it = begin; it != end; ) {
    Word house_number;
    if (!house_number_parser.Parse(it, end, &house_number))
      return false;

    // Delimiters are returned as tokens too so that line breaks can be
    // counted while skipping over them.
    String16Tokenizer tokenizer(house_number.end, end, delimiters);
    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

    // words[0] is always the house number; address words start at index 1.
    WordList words;
    words.push_back(house_number);

    bool found_location_name = false;
    bool continue_on_house_number = true;
    bool consecutive_house_numbers = true;
    size_t next_house_number_word = 0;
    size_t num_lines = 1;

    // Don't include the house number in the word count.
    size_t next_word = 1;
    for (; next_word <= kMaxAddressWords + 1; ++next_word) {

      // Extract a new word from the tokenizer.
      if (next_word == words.size()) {
        do {
          if (!tokenizer.GetNext())
            return false;

          // Check the number of address lines.
          if (tokenizer.token_is_delim() && newline_delimiters.find(
              *tokenizer.token_begin()) != base::string16::npos) {
            ++num_lines;
          }
        } while (tokenizer.token_is_delim());

        if (num_lines > kMaxAddressLines)
          break;

        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
      }

      // Check the word length. If too long, don't try to continue from
      // the next house number as no address can hold this word.
      //
      // The word is copied instead of bound by reference on purpose:
      // FindStateStartingInWord() below may append to |words|, and a reference
      // into the container would dangle if that append reallocates its
      // storage (as it does for std::vector), yet |current_word| is still read
      // afterwards by the "et al" check.
      const Word current_word = words[next_word];
      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
      size_t current_word_length = std::distance(
          current_word.begin, current_word.end);
      if (current_word_length > kMaxAddressNameWordLength) {
        continue_on_house_number = false;
        break;
      }

      // Check if the new word is a valid house number.
      if (house_number_parser.Parse(current_word.begin, current_word.end,
          NULL)) {
        // Increase the number of consecutive house numbers since the beginning.
        if (consecutive_house_numbers) {
          // Check if there is a new line between consecutive house numbers.
          // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
          if (num_lines > 1) {
            next_house_number_word = next_word;
            break;
          }
        }

        // Keep the next candidate to resume parsing from in case of failure.
        if (next_house_number_word == 0) {
          next_house_number_word = next_word;
          continue;
        }
      } else {
        consecutive_house_numbers = false;
      }

      // Look for location names in the words after the house number.
      // A range limitation is introduced to avoid matching
      // anything that starts with a number before a legitimate address.
      if (next_word <= kMaxLocationNameDistance &&
          IsValidLocationName(current_word)) {
        found_location_name = true;
        continue;
      }

      // Don't count the house number.
      if (next_word > kMinAddressWords) {
        // Looking for the state is likely to add new words to the list while
        // checking for multi-word state names.
        size_t state_first_word = next_word;
        size_t state_last_word, state_index;
        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
                                    &tokenizer, &state_index)) {

          // A location name should have been found at this point.
          if (!found_location_name)
            break;

          // Explicitly exclude "et al", as "al" is a valid state code.
          if (current_word_length == 2 && words.size() > 2) {
            const Word& previous_word = words[state_first_word - 1];
            if (previous_word.end - previous_word.begin == 2 &&
                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
                                     "et") &&
                LowerCaseEqualsASCII(current_word.begin, current_word.end,
                                     "al"))
              break;
          }

          // Extract one more word from the tokenizer if not already available.
          size_t zip_word = state_last_word + 1;
          if (zip_word == words.size()) {
            do {
              if (!tokenizer.GetNext())
                return false;
            } while (tokenizer.token_is_delim());
            words.push_back(Word(tokenizer.token_begin(),
                            tokenizer.token_end()));
          }

          // Check the parsing validity and state range of the zip code.
          // On failure, resume scanning from the word after the state name
          // (the loop increment moves past |state_last_word|).
          next_word = state_last_word;
          if (!IsZipValid(words[zip_word], state_index))
            continue;

          *start_pos = words[0].begin - begin;
          *end_pos = words[zip_word].end - begin;
          return true;
        }
      }
    }

    // Avoid skipping too many words because of a non-address number
    // at the beginning of the contents to parse.
    if (continue_on_house_number && next_house_number_word > 0) {
      it = words[next_house_number_word].begin;
    } else {
      // |next_word| may be one past the last extracted word when the inner
      // loop ended without pushing a word; clamp it to a valid index.
      DCHECK(!words.empty());
      next_word = std::min(next_word, words.size() - 1);
      it = words[next_word].end;
    }
  }

  return false;
}

}  // namespace address_parser

}  // namespace content