address_detector.cpp revision 773979f92560dd1aead375c82fd75b584a141e5d
/*
 * Copyright (C) 2012 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *    * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
30773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
#include "config.h"

// Magic pretend-to-be-a-chromium-build flags
#undef WEBKIT_IMPLEMENTATION
#undef LOG

#include "content/address_detector.h"

#include <string.h>

#include <algorithm>
#include <bitset>
#include <iterator>
#include <string>
#include <vector>

#include "base/utf_string_conversions.h"
#include "net/base/escape.h"
#include "WebString.h"
44773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
45773979f92560dd1aead375c82fd75b584a141e5dJohn Recknamespace {
46773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
47773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Prefix used for geographical address intent URIs.
48773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstatic const char kAddressSchemaPrefix[] = "geo:0,0?q=";
49773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
50773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum text length to be searched for address detection.
51773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstatic const size_t kMaxAddressLength = 500;
52773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
53773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Minimum number of words in an address after the house number
54773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// before a state is expected to be found.
55773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// A value too high can miss short addresses.
56773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMinAddressWords = 3;
57773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
58773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of words allowed in an address between the house number
59773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// and the state, both not included.
60773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxAddressWords = 12;
61773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
62773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of lines allowed in an address between the house number
63773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// and the state, both not included.
64773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxAddressLines = 5;
65773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
66773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum length allowed for any address word between the house number
67773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// and the state, both not included.
68773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxAddressNameWordLength = 25;
69773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
70773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of words after the house number in which the location name
71773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// should be found.
72773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxLocationNameDistance = 4;
73773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
74773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Number of digits for a valid zip code.
75773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kZipDigits = 5;
76773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
77773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Number of digits for a valid zip code in the Zip Plus 4 format.
78773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kZipPlus4Digits = 9;
79773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
80773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of digits of a house number, including possible hyphens.
81773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxHouseDigits = 5;
82773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
83773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Additional characters used as new line delimiters.
84773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst char16 kNewlineDelimiters[] = {
85773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  ',',
86773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  '*',
87773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  0x2022,  // Unicode bullet
88773979f92560dd1aead375c82fd75b584a141e5dJohn Reck};
89773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
90773979f92560dd1aead375c82fd75b584a141e5dJohn Reckchar16 SafePreviousChar(const string16::const_iterator& it,
91773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& begin) {
92773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (it == begin)
93773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return ' ';
94773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *(it - 1);
95773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
96773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
97773979f92560dd1aead375c82fd75b584a141e5dJohn Reckchar16 SafeNextChar(const string16::const_iterator& it,
98773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& end) {
99773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (it == end)
100773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return ' ';
101773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *(it + 1);
102773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
103773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
104773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
105773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    string16::const_iterator word_end, const char* ascii_to_match) {
106773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = word_begin; it != word_end;
107773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      ++it, ++ascii_to_match) {
108773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
109773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
110773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
111773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *ascii_to_match == 0 || *ascii_to_match == ' ';
112773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
113773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
114773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
115773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    string16::const_iterator word_end, const char* ascii_to_match,
116773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool allow_plural) {
117773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = word_begin; it != word_end;
118773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      ++it, ++ascii_to_match) {
119773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
120773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return true;
121773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
122773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
123773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
124773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
125773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *ascii_to_match == 0;
126773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
127773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
128773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}  // anonymous namespace
129773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
130773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
131773979f92560dd1aead375c82fd75b584a141e5dJohn ReckAddressDetector::AddressDetector() {
132773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
133773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
134773979f92560dd1aead375c82fd75b584a141e5dJohn ReckAddressDetector::~AddressDetector() {
135773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
136773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
137773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstd::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
138773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Get the address and replace unicode bullets with commas.
139773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
140773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  std::replace(address_16.begin(), address_16.end(),
141773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      static_cast<char16>(0x2022), static_cast<char16>(','));
142773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return UTF16ToUTF8(address_16);
143773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
144773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
145773979f92560dd1aead375c82fd75b584a141e5dJohn ReckGURL AddressDetector::GetIntentURL(const std::string& content_text) {
146773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return GURL(kAddressSchemaPrefix +
147773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      EscapeQueryParamValue(content_text, true));
148773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
149773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
150773979f92560dd1aead375c82fd75b584a141e5dJohn Recksize_t AddressDetector::GetMaximumContentLength() {
151773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return kMaxAddressLength;
152773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
153773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
154773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::FindContent(const string16::const_iterator& begin,
155773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
156773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  HouseNumberParser house_number_parser;
157773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
158773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Keep going through the input string until a potential house number is
159773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // detected. Start tokenizing the following words to find a valid
160773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // street name within a word range. Then, find a state name followed
161773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // by a valid zip code for that state. Also keep a look for any other
162773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // possible house numbers to continue from in case of no match and for
163773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // state names not followed by a zip code (e.g. New York, NY 10000).
164773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  const string16 newline_delimiters = kNewlineDelimiters;
165773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
166773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = begin; it != end; ) {
167773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    Word house_number;
168773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!house_number_parser.Parse(it, end, &house_number))
169773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
170773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
171773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    String16Tokenizer tokenizer(house_number.end, end, delimiters);
172773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
173773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
174773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    std::vector<Word> words;
175773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    words.push_back(house_number);
176773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
177773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool found_location_name = false;
178773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool continue_on_house_number = true;
179773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t next_house_number_word = 0;
180773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t num_lines = 1;
181773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
182773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // Don't include the house number in the word count.
183773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t next_word = 1;
184773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
185773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
186773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Extract a new word from the tokenizer.
187773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_word == words.size()) {
188773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        do {
189773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!tokenizer.GetNext())
190773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            return false;
191773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
192773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Check the number of address lines.
193773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (tokenizer.token_is_delim() && newline_delimiters.find(
194773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              *tokenizer.token_begin()) != string16::npos) {
195773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            ++num_lines;
196773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
197773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        } while (tokenizer.token_is_delim());
198773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
199773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (num_lines > kMaxAddressLines)
200773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          break;
201773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
202773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
203773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
204773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
205773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Check the word length. If too long, don't try to continue from
206773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // the next house number as no address can hold this word.
207773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      const Word& current_word = words[next_word];
208773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
209773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      size_t current_word_length = std::distance(
210773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          current_word.begin, current_word.end);
211773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (current_word_length > kMaxAddressNameWordLength) {
212773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue_on_house_number = false;
213773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        break;
214773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
215773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
216773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Check if the new word is a valid house number.
217773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // This is used to properly resume parsing in case the maximum number
218773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // of words is exceeded.
219773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_house_number_word == 0 &&
220773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
221773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        next_house_number_word = next_word;
222773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue;
223773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
224773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
225773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Look for location names in the words after the house number.
226773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // A range limitation is introduced to avoid matching
227773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // anything that starts with a number before a legitimate address.
228773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_word <= kMaxLocationNameDistance &&
229773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          IsValidLocationName(current_word)) {
230773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        found_location_name = true;
231773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue;
232773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
233773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
234773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Don't count the house number.
235773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_word > kMinAddressWords) {
236773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // Looking for the state is likely to add new words to the list while
237773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // checking for multi-word state names.
238773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        size_t state_first_word = next_word;
239773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        size_t state_last_word, state_index;
240773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
241773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            &tokenizer, &state_index)) {
242773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
243773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // A location name should have been found at this point.
244773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!found_location_name)
245773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
246773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
247773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Explicitly exclude "et al", as "al" is a valid state code.
248773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (current_word_length == 2 && words.size() > 2) {
249773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            const Word& previous_word = words[state_first_word - 1];
250773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if (previous_word.end - previous_word.begin == 2 &&
251773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
252773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                    "et") &&
253773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                LowerCaseEqualsASCII(current_word.begin, current_word.end,
254773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                    "al"))
255773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              break;
256773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
257773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
258773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Extract one more word from the tokenizer if not already available.
259773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          size_t zip_word = state_last_word + 1;
260773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (zip_word == words.size()) {
261773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            do {
262773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              if (!tokenizer.GetNext())
263773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                return false;
264773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            } while (tokenizer.token_is_delim());
265773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            words.push_back(Word(tokenizer.token_begin(),
266773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                tokenizer.token_end()));
267773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
268773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
269773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Check the parsing validity and state range of the zip code.
270773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          next_word = state_last_word;
271773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!IsZipValid(words[zip_word], state_index))
272773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            continue;
273773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
274773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          *start_pos = words[0].begin - begin;
275773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          *end_pos = words[zip_word].end - begin;
276773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          return true;
277773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        }
278773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
279773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
280773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
281773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // Avoid skipping too many words because of a non-address number
282773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // at the beginning of the contents to parse.
283773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (continue_on_house_number && next_house_number_word > 0) {
284773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      it = words[next_house_number_word].begin;
285773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    } else {
286773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK(!words.empty());
287773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      next_word = std::min(next_word, words.size() - 1);
288773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      it = words[next_word].end;
289773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
290773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
291773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
292773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return false;
293773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
294773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
295773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::IsPreDelimiter(
296773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char16 character) {
297773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return character == ':' || IsPostDelimiter(character);
298773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
299773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
300773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::IsPostDelimiter(
301773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char16 character) {
302773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return IsWhitespace(character) || strchr(",\"'", character);
303773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
304773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
305773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
306773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  ResetState();
307773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
308773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
309773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
310773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
311773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
312773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      num_chars);
313773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  it_ += offset;
314773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  result_chars_ += offset;
315773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
316773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
317773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
318773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
319773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
320773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
321773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::ResetState() {
322773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  num_digits_ = 0;
323773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  result_chars_ = 0;
324773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
325773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
326773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
327773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // There should always be a number after a hyphen.
328773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
329773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
330773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
331773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (word) {
332773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    word->begin = it_ - result_chars_;
333773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    word->end = it_;
334773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
335773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return true;
336773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
337773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
338773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::Parse(
339773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& begin,
340773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& end, Word* word) {
341773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  it_ = begin_ = begin;
342773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  end_ = end;
343773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  ResetState();
344773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
345773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Iterations only used as a fail-safe against any buggy infinite loops.
346773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t iterations = 0;
347773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t max_iterations = end - begin + 1;
348773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (; it_ != end_ && iterations < max_iterations; ++iterations) {
349773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
350773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // Word finished case.
351773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsPostDelimiter(*it_)) {
352773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (CheckFinished(word))
353773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        return true;
354773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      else if (result_chars_)
355773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        ResetState();
356773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
357773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      SkipChars(1);
358773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
359773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
360773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
361773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // More digits. There should be no more after a letter was found.
362773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsAsciiDigit(*it_)) {
363773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (num_digits_ >= kMaxHouseDigits) {
364773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        RestartOnNextDelimiter();
365773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      } else {
366773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        AcceptChars(1);
367773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        ++num_digits_;
368773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
369773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
370773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
371773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
372773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsAsciiAlpha(*it_)) {
373773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Handle special case 'one'.
374773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (result_chars_ == 0) {
375773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
376773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          AcceptChars(3);
377773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        else
378773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          RestartOnNextDelimiter();
379773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue;
380773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
381773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
382773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // There should be more than 1 character because of result_chars.
383773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK_GT(result_chars_, 0U);
384773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK_NE(it_, begin_);
385773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      char16 previous = SafePreviousChar(it_, begin_);
386773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (IsAsciiDigit(previous)) {
387773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // Check cases like '12A'.
388773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        char16 next = SafeNextChar(it_, end_);
389773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (IsPostDelimiter(next)) {
390773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          AcceptChars(1);
391773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          continue;
392773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        }
393773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
394773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
395773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (IsAsciiAlpha(next)) {
396773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          char16 last_digit = previous;
397773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          char16 first_letter = base::ToLowerASCII(*it_);
398773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          char16 second_letter = base::ToLowerASCII(next);
399773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
400773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              num_digits_ == 2;
401773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
402773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          switch (last_digit - '0') {
403773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 1:
404773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if ((first_letter == 's' && second_letter == 't') ||
405773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                (first_letter == 't' && second_letter == 'h' && is_teen)) {
406773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
407773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
408773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
409773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
410773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
411773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 2:
412773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if ((first_letter == 'n' && second_letter == 'd') ||
413773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                (first_letter == 't' && second_letter == 'h' && is_teen)) {
414773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
415773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
416773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
417773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
418773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
419773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 3:
420773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if ((first_letter == 'r' && second_letter == 'd') ||
421773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                (first_letter == 't' && second_letter == 'h' && is_teen)) {
422773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
423773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
424773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
425773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
426773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
427773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 0:
428773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            // Explicitly exclude '0th'.
429773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if (num_digits_ == 1)
430773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              break;
431773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
432773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 4:
433773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 5:
434773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 6:
435773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 7:
436773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 8:
437773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 9:
438773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if (first_letter == 't' && second_letter == 'h') {
439773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
440773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
441773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
442773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
443773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
444773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          default:
445773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            NOTREACHED();
446773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
447773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        }
448773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
449773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
450773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      RestartOnNextDelimiter();
451773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
452773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
453773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
454773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (*it_ == '-' && num_digits_ > 0) {
455773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      AcceptChars(1);
456773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      ++num_digits_;
457773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
458773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
459773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
460773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    RestartOnNextDelimiter();
461773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    SkipChars(1);
462773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
463773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
464773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (iterations >= max_iterations)
465773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
466773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
467773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return CheckFinished(word);
468773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
469773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
470773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::FindStateStartingInWord(WordList* words,
471773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t state_first_word, size_t* state_last_word,
472773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    String16Tokenizer* tokenizer, size_t* state_index) {
473773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
474773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Bitmasks containing the allowed suffixes for 2-letter state codes.
475773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int state_two_letter_suffix[23] = {
476773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x02060c00,  // A followed by: [KLRSZ].
477773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // B.
478773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00084001,  // C followed by: [AOT].
479773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000014,  // D followed by: [CE].
480773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // E.
481773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00001800,  // F followed by: [LM].
482773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00100001,  // G followed by: [AU].
483773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000100,  // H followed by: [I].
484773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00002809,  // I followed by: [ADLN].
485773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // J.
486773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x01040000,  // K followed by: [SY].
487773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000001,  // L followed by: [A].
488773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x000ce199,  // M followed by: [ADEHINOPST].
489773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x0120129c,  // N followed by: [CDEHJMVY].
490773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00020480,  // O followed by: [HKR].
491773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00420001,  // P followed by: [ARW].
492773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // Q.
493773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000100,  // R followed by: [I].
494773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x0000000c,  // S followed by: [CD].
495773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00802000,  // T followed by: [NX].
496773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00080000,  // U followed by: [T].
497773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00080101,  // V followed by: [AIT].
498773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x01200101   // W followed by: [AIVY].
499773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
500773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
501773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Accumulative number of states for the 2-letter code indexed by the first.
502773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int state_two_letter_accumulative[24] = {
503773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     0,  5,  5,  8, 10, 10, 12, 14,
504773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    15, 19, 19, 21, 22, 32, 40, 43,
505773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    46, 46, 47, 49, 51, 52, 55, 59
506773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
507773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
508773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // State names sorted alphabetically with their lengths.
509773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // There can be more than one possible name for a same state if desired.
510773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const struct StateNameInfo {
511773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const char* string;
512773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char first_word_length;
513773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char length;
514773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char state_index; // Relative to two-character code alphabetical order.
515773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  } state_names[59] = {
516773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
517773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
518773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "arkansas", 8, 8, 2 },
519773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
520773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
521773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "district of columbia", 8, 20, 8 },
522773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
523773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
524773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "hawaii", 6, 6, 14 },
525773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
526773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "iowa", 4, 4, 15 },
527773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
528773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "louisiana", 9, 9, 21 },
529773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
530773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
531773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
532773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
533773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "montana", 7, 7, 31 },
534773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
535773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
536773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
537773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
538773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "northern mariana islands", 8, 24, 29 },
539773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
540773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
541773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "puerto rico", 6, 11, 44 },
542773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "rhode island", 5, 5, 46 },
543773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
544773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
545773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "utah", 4, 4, 51 },
546773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
547773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "virginia", 8, 8, 52 },
548773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
549773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
550773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
551773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
552773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Accumulative number of states for sorted names indexed by the first letter.
553773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Required a different one since there are codes that don't share their
554773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // first letter with the name of their state (MP = Northern Mariana Islands).
555773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int state_names_accumulative[24] = {
556773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     0,  5,  5,  8, 10, 10, 12, 14,
557773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    15, 19, 19, 21, 22, 31, 40, 43,
558773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    46, 46, 47, 49, 51, 52, 55, 59
559773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
560773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
561773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
562773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
563773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
564773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  const Word& first_word = words->at(state_first_word);
565773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int length = first_word.end - first_word.begin;
566773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (length < 2 || !IsAsciiAlpha(*first_word.begin))
567773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
568773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
569773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // No state names start with x, y, z.
570773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  char16 first_letter = base::ToLowerASCII(*first_word.begin);
571773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (first_letter > 'w')
572773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
573773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
574773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK(first_letter >= 'a');
575773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int first_index = first_letter - 'a';
576773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
577773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Look for two-letter state names.
578773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
579773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
580773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    DCHECK(second_letter >= 'a');
581773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
582773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    int second_index = second_letter - 'a';
583773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
584773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
585773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
586773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
587773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        ((1 << second_index) - 1);
588773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    *state_last_word = state_first_word;
589773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    *state_index = state_two_letter_accumulative[first_index] +
590773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        previous_suffixes.count();
591773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return true;
592773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
593773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
594773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Look for full state names by their first letter. Discard by length.
595773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (int state = state_names_accumulative[first_index];
596773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      state < state_names_accumulative[first_index + 1]; ++state) {
597773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (state_names[state].first_word_length != length)
598773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
599773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
600773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool state_match = false;
601773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t state_word = state_first_word;
602773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    for (int pos = 0; true; ) {
603773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
604773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          words->at(state_word).end, &state_names[state].string[pos]))
605773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        break;
606773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
607773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      pos += words->at(state_word).end - words->at(state_word).begin + 1;
608773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (pos >= state_names[state].length) {
609773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        state_match = true;
610773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        break;
611773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
612773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
613773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Ran out of words, extract more from the tokenizer.
614773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (++state_word == words->size()) {
615773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        do {
616773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!tokenizer->GetNext())
617773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
618773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        } while (tokenizer->token_is_delim());
619773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
620773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
621773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
622773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
623773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (state_match) {
624773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      *state_last_word = state_word;
625773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      *state_index = state_names[state].state_index;
626773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return true;
627773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
628773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
629773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
630773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return false;
631773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
632773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
633773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
634773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t length = word.end - word.begin;
635773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (length != kZipDigits && length != kZipPlus4Digits + 1)
636773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
637773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
638773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = word.begin; it != word.end; ++it) {
639773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t pos = it - word.begin;
640773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
641773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
642773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
643773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
644773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return IsZipValidForState(word, state_index);
645773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
646773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
647773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::IsZipValidForState(const Word& word, size_t state_index) {
648773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // List of valid zip code ranges.
649773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const struct {
650773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char low;
651773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char high;
652773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char exception1;
653773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char exception2;
654773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  } zip_range[] = {
655773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 99, 99, -1, -1 }, // AK Alaska.
656773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 35, 36, -1, -1 }, // AL Alabama.
657773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 71, 72, -1, -1 }, // AR Arkansas.
658773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // AS American Samoa.
659773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 85, 86, -1, -1 }, // AZ Arizona.
660773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 90, 96, -1, -1 }, // CA California.
661773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 80, 81, -1, -1 }, // CO Colorado.
662773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  6,  6, -1, -1 }, // CT Connecticut.
663773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 20, 20, -1, -1 }, // DC District of Columbia.
664773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 19, 19, -1, -1 }, // DE Delaware.
665773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 32, 34, -1, -1 }, // FL Florida.
666773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // FM Federated States of Micronesia.
667773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 30, 31, -1, -1 }, // GA Georgia.
668773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // GU Guam.
669773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // HI Hawaii.
670773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 50, 52, -1, -1 }, // IA Iowa.
671773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 83, 83, -1, -1 }, // ID Idaho.
672773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 60, 62, -1, -1 }, // IL Illinois.
673773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 46, 47, -1, -1 }, // IN Indiana.
674773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 66, 67, 73, -1 }, // KS Kansas.
675773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 40, 42, -1, -1 }, // KY Kentucky.
676773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 70, 71, -1, -1 }, // LA Louisiana.
677773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  1,  2, -1, -1 }, // MA Massachusetts.
678773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 20, 21, -1, -1 }, // MD Maryland.
679773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  3,  4, -1, -1 }, // ME Maine.
680773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // MH Marshall Islands.
681773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 48, 49, -1, -1 }, // MI Michigan.
682773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 55, 56, -1, -1 }, // MN Minnesota.
683773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 63, 65, -1, -1 }, // MO Missouri.
684773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // MP Northern Mariana Islands.
685773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 38, 39, -1, -1 }, // MS Mississippi.
686773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 55, 56, -1, -1 }, // MT Montana.
687773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 27, 28, -1, -1 }, // NC North Carolina.
688773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 58, 58, -1, -1 }, // ND North Dakota.
689773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 68, 69, -1, -1 }, // NE Nebraska.
690773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  3,  4, -1, -1 }, // NH New Hampshire.
691773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  7,  8, -1, -1 }, // NJ New Jersey.
692773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 87, 88, 86, -1 }, // NM New Mexico.
693773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 88, 89, 96, -1 }, // NV Nevada.
694773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 10, 14,  0,  6 }, // NY New York.
695773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 43, 45, -1, -1 }, // OH Ohio.
696773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 73, 74, -1, -1 }, // OK Oklahoma.
697773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 97, 97, -1, -1 }, // OR Oregon.
698773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 15, 19, -1, -1 }, // PA Pennsylvania.
699773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  6,  6,  0,  9 }, // PR Puerto Rico.
700773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 96, 96, -1, -1 }, // PW Palau.
701773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  2,  2, -1, -1 }, // RI Rhode Island.
702773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 29, 29, -1, -1 }, // SC South Carolina.
703773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 57, 57, -1, -1 }, // SD South Dakota.
704773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 37, 38, -1, -1 }, // TN Tennessee.
705773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 75, 79, 87, 88 }, // TX Texas.
706773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 84, 84, -1, -1 }, // UT Utah.
707773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 22, 24, 20, -1 }, // VA Virginia.
708773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  6,  9, -1, -1 }, // VI Virgin Islands.
709773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    {  5,  5, -1, -1 }, // VT Vermont.
710773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 98, 99, -1, -1 }, // WA Washington.
711773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 53, 54, -1, -1 }, // WI Wisconsin.
712773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 24, 26, -1, -1 }, // WV West Virginia.
713773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { 82, 83, -1, -1 }  // WY Wyoming.
714773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
715773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
716773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Zip numeric value for the first two characters.
717773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK(word.begin != word.end);
718773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK(IsAsciiDigit(*word.begin));
719773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK(IsAsciiDigit(*(word.begin + 1)));
720773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int zip_prefix = (*word.begin - '0') * 10 + (*(word.begin + 1) - '0');
721773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
722773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if ((zip_prefix >= zip_range[state_index].low &&
723773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      zip_prefix <= zip_range[state_index].high) ||
724773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      zip_prefix == zip_range[state_index].exception1 ||
725773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      zip_prefix == zip_range[state_index].exception2) {
726773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return true;
727773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
728773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return false;
729773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
730773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
731773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::IsValidLocationName(const Word& word) {
732773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Supported location names sorted alphabetically and grouped by first letter.
733773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const struct LocationNameInfo {
734773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const char* string;
735773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char length;
736773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool allow_plural;
737773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  } location_names[157] = {
738773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "alley", 5, false }, { "annex", 5, false }, { "arcade", 6, false },
739773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "ave", 3, false }, { "ave.", 4, false }, { "avenue", 6, false },
740773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "alameda", 7, false },
741773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "bayou", 5, false }, { "beach", 5, false }, { "bend", 4, false },
742773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "bluff", 5, true }, { "bottom", 6, false }, { "boulevard", 9, false },
743773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "branch", 6, false }, { "bridge", 6, false }, { "brook", 5, true },
744773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "burg", 4, true }, { "bypass", 6, false }, { "broadway", 8, false },
745773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "camino", 6, false }, { "camp", 4, false }, { "canyon", 6, false },
746773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "cape", 4, false }, { "causeway", 8, false }, { "center", 6, true },
747773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "circle", 6, true }, { "cliff", 5, true }, { "club", 4, false },
748773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "common", 6, false }, { "corner", 6, true }, { "course", 6, false },
749773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "court", 5, true }, { "cove", 4, true }, { "creek", 5, false },
750773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "crescent", 8, false }, { "crest", 5, false }, { "crossing", 8, false },
751773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "crossroad", 9, false }, { "curve", 5, false }, { "circulo", 7, false },
752773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "dale", 4, false }, { "dam", 3, false }, { "divide", 6, false },
753773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "drive", 5, true },
754773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "estate", 6, true }, { "expressway", 10, false },
755773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "extension", 9, true },
756773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "fall", 4, true }, { "ferry", 5, false }, { "field", 5, true },
757773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "flat", 4, true }, { "ford", 4, true }, { "forest", 6, false },
758773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "forge", 5, true }, { "fork", 4, true }, { "fort", 4, false },
759773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "freeway", 7, false },
760773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "garden", 6, true }, { "gateway", 7, false }, { "glen", 4, true },
761773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "green", 5, true }, { "grove", 5, true },
762773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "harbor", 6, true }, { "haven", 5, false }, { "heights", 7, false },
763773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "highway", 7, false }, { "hill", 4, true }, { "hollow", 6, false },
764773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "inlet", 5, false }, { "island", 6, true }, { "isle", 4, false },
765773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "junction", 8, true },
766773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "key", 3, true }, { "knoll", 5, true },
767773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "lake", 4, true }, { "land", 4, false }, { "landing", 7, false },
768773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "lane", 4, false }, { "light", 5, true }, { "loaf", 4, false },
769773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "lock", 4, true }, { "lodge", 5, false }, { "loop", 4, false },
770773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "mall", 4, false }, { "manor", 5, true }, { "meadow", 6, true },
771773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "mews", 4, false }, { "mill", 4, true }, { "mission", 7, false },
772773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "motorway", 8, false }, { "mount", 5, false }, { "mountain", 8, true },
773773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "neck", 4, false },
774773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "orchard", 7, false }, { "oval", 4, false }, { "overpass", 8, false },
775773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "park", 4, true }, { "parkway", 7, true }, { "pass", 4, false },
776773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "passage", 7, false }, { "path", 4, false }, { "pike", 4, false },
777773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "pine", 4, true }, { "plain", 5, true }, { "plaza", 5, false },
778773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "point", 5, true }, { "port", 4, true }, { "prairie", 7, false },
779773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "privada", 7, false },
780773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "radial", 6, false }, { "ramp", 4, false }, { "ranch", 5, false },
781773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "rapid", 5, true }, { "rest", 4, false }, { "ridge", 5, true },
782773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "river", 5, false }, { "road", 4, true }, { "route", 5, false },
783773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "row", 3, false }, { "rue", 3, false }, { "run", 3, false },
784773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "shoal", 5, true }, { "shore", 5, true }, { "skyway", 6, false },
785773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "spring", 6, true }, { "spur", 4, true }, { "square", 6, true },
786773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "station", 7, false }, { "stravenue", 9, false }, { "stream", 6, false },
787773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "st", 2, false }, { "st.", 3, false }, { "street", 6, true },
788773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "summit", 6, false }, { "speedway", 8, false },
789773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "terrace", 7, false }, { "throughway", 10, false }, { "trace", 5, false },
790773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "track", 5, false }, { "trafficway", 10, false }, { "trail", 5, false },
791773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "tunnel", 6, false }, { "turnpike", 8, false },
792773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "underpass", 9, false }, { "union", 5, true },
793773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "valley", 6, true }, { "viaduct", 7, false }, { "view", 4, true },
794773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "village", 7, true }, { "ville", 5, false }, { "vista", 5, false },
795773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "walk", 4, true }, { "wall", 4, false }, { "way", 3, true },
796773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "well", 4, true },
797773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "xing", 4, false }, { "xrd", 3, false }
798773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
799773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
800773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Accumulative number of location names for each starting letter.
801773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int location_names_accumulative[25] = {
802773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      0,   7,  19,  40,  44,
803773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     47,  57,  62,  68,  71,
804773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     72,  74,  83,  92,  93,
805773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     96, 109, 109, 121, 135,
806773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    143, 145, 151, 155, 157
807773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
808773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
809773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK_EQ(
810773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      location_names_accumulative[arraysize(location_names_accumulative) - 1],
811773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      static_cast<int>(ARRAYSIZE_UNSAFE(location_names)));
812773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
813773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (!IsAsciiAlpha(*word.begin))
814773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
815773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
816773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // No location names start with y, z.
817773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  char16 first_letter = base::ToLowerASCII(*word.begin);
818773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (first_letter > 'x')
819773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
820773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
821773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK(first_letter >= 'a');
822773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int index = first_letter - 'a';
823773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int length = std::distance(word.begin, word.end);
824773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (int i = location_names_accumulative[index];
825773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      i < location_names_accumulative[index + 1]; ++i) {
826773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (location_names[i].length != length &&
827773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        (location_names[i].allow_plural &&
828773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        location_names[i].length + 1 != length)) {
829773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
830773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
831773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
832773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (LowerCaseEqualsASCIIWithPlural(word.begin, word.end,
833773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        location_names[i].string, location_names[i].allow_plural)) {
834773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return true;
835773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
836773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
837773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
838773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return false;
839773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
840