1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "content/common/android/address_parser.h"
6
7#include "base/logging.h"
8#include "base/strings/string_util.h"
9#include "content/common/android/address_parser_internal.h"
10
11namespace {
12
// --- Limits on the words between the house number and the state ---
// (neither the house number nor the state is counted).

// A state is only looked for once more than this many words have followed
// the house number. Raising it makes short addresses undetectable.
const size_t kMinAddressWords = 3;

// Upper bound on how many words may separate the house number from the state.
const size_t kMaxAddressWords = 12;

// The location name must show up within this many words after the
// house number.
const size_t kMaxLocationNameDistance = 4;

// Longest word, in characters, tolerated between the house number and
// the state.
const size_t kMaxAddressNameWordLength = 25;

// --- Limit on the lines between the house number and the state ---

// Upper bound on how many lines the span from the house number to the state
// may cover.
const size_t kMaxAddressLines = 5;
33
34// Additional characters used as new line delimiters.
35const base::char16 kNewlineDelimiters[] = {
36  '\n',
37  ',',
38  '*',
39  0x2022,  // Unicode bullet
40  0,
41};
42
43}  // anonymous namespace
44
45namespace content {
46
47namespace address_parser {
48
49using namespace internal;
50
51bool FindAddress(const base::string16& text, base::string16* address) {
52  size_t start, end;
53  if (FindAddress(text.begin(), text.end(), &start, &end)) {
54    size_t len = end >= start ? end - start : 0;
55    address->assign(text.substr(start, len));
56    return true;
57  }
58  return false;
59}
60
61bool FindAddress(const base::string16::const_iterator& begin,
62                 const base::string16::const_iterator& end,
63                 size_t* start_pos,
64                 size_t* end_pos) {
65  HouseNumberParser house_number_parser;
66
67  // Keep going through the input string until a potential house number is
68  // detected. Start tokenizing the following words to find a valid
69  // street name within a word range. Then, find a state name followed
70  // by a valid zip code for that state. Also keep a look for any other
71  // possible house numbers to continue from in case of no match and for
72  // state names not followed by a zip code (e.g. New York, NY 10000).
73  const base::string16 newline_delimiters = kNewlineDelimiters;
74  const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
75  for (base::string16::const_iterator it = begin; it != end; ) {
76    Word house_number;
77    if (!house_number_parser.Parse(it, end, &house_number))
78      return false;
79
80    String16Tokenizer tokenizer(house_number.end, end, delimiters);
81    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
82
83    WordList words;
84    words.push_back(house_number);
85
86    bool found_location_name = false;
87    bool continue_on_house_number = true;
88    bool consecutive_house_numbers = true;
89    size_t next_house_number_word = 0;
90    size_t num_lines = 1;
91
92    // Don't include the house number in the word count.
93    size_t next_word = 1;
94    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
95
96      // Extract a new word from the tokenizer.
97      if (next_word == words.size()) {
98        do {
99          if (!tokenizer.GetNext())
100            return false;
101
102          // Check the number of address lines.
103          if (tokenizer.token_is_delim() && newline_delimiters.find(
104              *tokenizer.token_begin()) != base::string16::npos) {
105            ++num_lines;
106          }
107        } while (tokenizer.token_is_delim());
108
109        if (num_lines > kMaxAddressLines)
110          break;
111
112        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
113      }
114
115      // Check the word length. If too long, don't try to continue from
116      // the next house number as no address can hold this word.
117      const Word& current_word = words[next_word];
118      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
119      size_t current_word_length = std::distance(
120          current_word.begin, current_word.end);
121      if (current_word_length > kMaxAddressNameWordLength) {
122        continue_on_house_number = false;
123        break;
124      }
125
126      // Check if the new word is a valid house number.
127      if (house_number_parser.Parse(current_word.begin, current_word.end,
128          NULL)) {
129        // Increase the number of consecutive house numbers since the beginning.
130        if (consecutive_house_numbers) {
131          // Check if there is a new line between consecutive house numbers.
132          // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
133          if (num_lines > 1) {
134            next_house_number_word = next_word;
135            break;
136          }
137        }
138
139        // Keep the next candidate to resume parsing from in case of failure.
140        if (next_house_number_word == 0) {
141          next_house_number_word = next_word;
142          continue;
143        }
144      } else {
145        consecutive_house_numbers = false;
146      }
147
148      // Look for location names in the words after the house number.
149      // A range limitation is introduced to avoid matching
150      // anything that starts with a number before a legitimate address.
151      if (next_word <= kMaxLocationNameDistance &&
152          IsValidLocationName(current_word)) {
153        found_location_name = true;
154        continue;
155      }
156
157      // Don't count the house number.
158      if (next_word > kMinAddressWords) {
159        // Looking for the state is likely to add new words to the list while
160        // checking for multi-word state names.
161        size_t state_first_word = next_word;
162        size_t state_last_word, state_index;
163        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
164                                    &tokenizer, &state_index)) {
165
166          // A location name should have been found at this point.
167          if (!found_location_name)
168            break;
169
170          // Explicitly exclude "et al", as "al" is a valid state code.
171          if (current_word_length == 2 && words.size() > 2) {
172            const Word& previous_word = words[state_first_word - 1];
173            if (previous_word.end - previous_word.begin == 2 &&
174                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
175                                     "et") &&
176                LowerCaseEqualsASCII(current_word.begin, current_word.end,
177                                     "al"))
178              break;
179          }
180
181          // Extract one more word from the tokenizer if not already available.
182          size_t zip_word = state_last_word + 1;
183          if (zip_word == words.size()) {
184            do {
185              if (!tokenizer.GetNext())
186                return false;
187            } while (tokenizer.token_is_delim());
188            words.push_back(Word(tokenizer.token_begin(),
189                            tokenizer.token_end()));
190          }
191
192          // Check the parsing validity and state range of the zip code.
193          next_word = state_last_word;
194          if (!IsZipValid(words[zip_word], state_index))
195            continue;
196
197          *start_pos = words[0].begin - begin;
198          *end_pos = words[zip_word].end - begin;
199          return true;
200        }
201      }
202    }
203
204    // Avoid skipping too many words because of a non-address number
205    // at the beginning of the contents to parse.
206    if (continue_on_house_number && next_house_number_word > 0) {
207      it = words[next_house_number_word].begin;
208    } else {
209      DCHECK(!words.empty());
210      next_word = std::min(next_word, words.size() - 1);
211      it = words[next_word].end;
212    }
213  }
214
215  return false;
216}
217
218}  // namespace address_parser
219
220}  // namespace content
221