address_detector.cpp revision 77974d2fe8f7fb9c421c8d4240e02e9d76ec2c27
1773979f92560dd1aead375c82fd75b584a141e5dJohn Reck/*
2773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * Copyright (C) 2012 Google Inc. All rights reserved.
3773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *
4773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * Redistribution and use in source and binary forms, with or without
5773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * modification, are permitted provided that the following conditions are
6773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * met:
7773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *
8773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *    * Redistributions of source code must retain the above copyright
9773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * notice, this list of conditions and the following disclaimer.
10773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *    * Redistributions in binary form must reproduce the above
11773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * copyright notice, this list of conditions and the following disclaimer
12773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * in the documentation and/or other materials provided with the
13773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * distribution.
14773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *    * Neither the name of Google Inc. nor the names of its
15773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * contributors may be used to endorse or promote products derived from
16773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * this software without specific prior written permission.
17773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *
18773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28773979f92560dd1aead375c82fd75b584a141e5dJohn Reck * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29773979f92560dd1aead375c82fd75b584a141e5dJohn Reck */
30773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
31773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "config.h"
32773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
33773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Magic pretend-to-be-a-chromium-build flags
34773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#undef WEBKIT_IMPLEMENTATION
35773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#undef LOG
36773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
37773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "content/address_detector.h"
38773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
39773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include <bitset>
40773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
41773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "base/utf_string_conversions.h"
42773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "net/base/escape.h"
43773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "WebString.h"
44773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
456bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/HashSet.h>
466bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/Noncopyable.h>
476bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/text/StringHash.h>
486bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/text/WTFString.h>
496bf2577653884795f04bbf9d8196ed9998896afeJohn Reck
50773979f92560dd1aead375c82fd75b584a141e5dJohn Recknamespace {
51773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
52773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Prefix used for geographical address intent URIs.
53773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstatic const char kAddressSchemaPrefix[] = "geo:0,0?q=";
54773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
55773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum text length to be searched for address detection.
56773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstatic const size_t kMaxAddressLength = 500;
57773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
58773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Minimum number of words in an address after the house number
59773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// before a state is expected to be found.
60773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// A value too high can miss short addresses.
61773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMinAddressWords = 3;
62773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
63773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of words allowed in an address between the house number
64773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// and the state, both not included.
65773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxAddressWords = 12;
66773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
67773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of lines allowed in an address between the house number
68773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// and the state, both not included.
69773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxAddressLines = 5;
70773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
71773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum length allowed for any address word between the house number
72773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// and the state, both not included.
73773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxAddressNameWordLength = 25;
74773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
75773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of words after the house number in which the location name
76773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// should be found.
77773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxLocationNameDistance = 4;
78773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
79773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Number of digits for a valid zip code.
80773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kZipDigits = 5;
81773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
82773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Number of digits for a valid zip code in the Zip Plus 4 format.
83773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kZipPlus4Digits = 9;
84773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
85773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Maximum number of digits of a house number, including possible hyphens.
86773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst size_t kMaxHouseDigits = 5;
87773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
88773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Additional characters used as new line delimiters.
89773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst char16 kNewlineDelimiters[] = {
90773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  ',',
91773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  '*',
92773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  0x2022,  // Unicode bullet
9377974d2fe8f7fb9c421c8d4240e02e9d76ec2c27Selim Gurun  0,
94773979f92560dd1aead375c82fd75b584a141e5dJohn Reck};
95773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
96773979f92560dd1aead375c82fd75b584a141e5dJohn Reckchar16 SafePreviousChar(const string16::const_iterator& it,
97773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& begin) {
98773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (it == begin)
99773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return ' ';
100773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *(it - 1);
101773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
102773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
103773979f92560dd1aead375c82fd75b584a141e5dJohn Reckchar16 SafeNextChar(const string16::const_iterator& it,
104773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& end) {
105773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (it == end)
106773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return ' ';
107773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *(it + 1);
108773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
109773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
110773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
111773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    string16::const_iterator word_end, const char* ascii_to_match) {
112773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = word_begin; it != word_end;
113773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      ++it, ++ascii_to_match) {
114773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
115773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
116773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
117773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *ascii_to_match == 0 || *ascii_to_match == ' ';
118773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
119773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
120773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
121773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    string16::const_iterator word_end, const char* ascii_to_match,
122773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool allow_plural) {
123773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = word_begin; it != word_end;
124773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      ++it, ++ascii_to_match) {
125773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
126773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return true;
127773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
128773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
129773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
130773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
131773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return *ascii_to_match == 0;
132773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
133773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
134773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}  // anonymous namespace
135773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
136773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
137773979f92560dd1aead375c82fd75b584a141e5dJohn ReckAddressDetector::AddressDetector() {
138773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
139773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
140773979f92560dd1aead375c82fd75b584a141e5dJohn ReckAddressDetector::~AddressDetector() {
141773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
142773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
143773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstd::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
144773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Get the address and replace unicode bullets with commas.
145773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
146773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  std::replace(address_16.begin(), address_16.end(),
147773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      static_cast<char16>(0x2022), static_cast<char16>(','));
148773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return UTF16ToUTF8(address_16);
149773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
150773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
151773979f92560dd1aead375c82fd75b584a141e5dJohn ReckGURL AddressDetector::GetIntentURL(const std::string& content_text) {
152773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return GURL(kAddressSchemaPrefix +
153773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      EscapeQueryParamValue(content_text, true));
154773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
155773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
156773979f92560dd1aead375c82fd75b584a141e5dJohn Recksize_t AddressDetector::GetMaximumContentLength() {
157773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return kMaxAddressLength;
158773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
159773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
160773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::FindContent(const string16::const_iterator& begin,
161773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
162773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  HouseNumberParser house_number_parser;
163773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
164773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Keep going through the input string until a potential house number is
165773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // detected. Start tokenizing the following words to find a valid
166773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // street name within a word range. Then, find a state name followed
167773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // by a valid zip code for that state. Also keep a look for any other
168773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // possible house numbers to continue from in case of no match and for
169773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // state names not followed by a zip code (e.g. New York, NY 10000).
170773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  const string16 newline_delimiters = kNewlineDelimiters;
171773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
172773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = begin; it != end; ) {
173773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    Word house_number;
174773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!house_number_parser.Parse(it, end, &house_number))
175773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
176773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
177773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    String16Tokenizer tokenizer(house_number.end, end, delimiters);
178773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
179773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
180773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    std::vector<Word> words;
181773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    words.push_back(house_number);
182773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
183773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool found_location_name = false;
184773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool continue_on_house_number = true;
185773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t next_house_number_word = 0;
186773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t num_lines = 1;
187773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
188773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // Don't include the house number in the word count.
189773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t next_word = 1;
190773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
191773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
192773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Extract a new word from the tokenizer.
193773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_word == words.size()) {
194773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        do {
195773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!tokenizer.GetNext())
196773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            return false;
197773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
198773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Check the number of address lines.
199773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (tokenizer.token_is_delim() && newline_delimiters.find(
200773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              *tokenizer.token_begin()) != string16::npos) {
201773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            ++num_lines;
202773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
203773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        } while (tokenizer.token_is_delim());
204773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
205773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (num_lines > kMaxAddressLines)
206773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          break;
207773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
208773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
209773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
210773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
211773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Check the word length. If too long, don't try to continue from
212773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // the next house number as no address can hold this word.
213773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      const Word& current_word = words[next_word];
214773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
215773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      size_t current_word_length = std::distance(
216773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          current_word.begin, current_word.end);
217773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (current_word_length > kMaxAddressNameWordLength) {
218773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue_on_house_number = false;
219773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        break;
220773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
221773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
222773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Check if the new word is a valid house number.
223773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // This is used to properly resume parsing in case the maximum number
224773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // of words is exceeded.
225773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_house_number_word == 0 &&
226773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
227773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        next_house_number_word = next_word;
228773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue;
229773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
230773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
231773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Look for location names in the words after the house number.
232773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // A range limitation is introduced to avoid matching
233773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // anything that starts with a number before a legitimate address.
234773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_word <= kMaxLocationNameDistance &&
235773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          IsValidLocationName(current_word)) {
236773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        found_location_name = true;
237773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue;
238773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
239773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
240773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Don't count the house number.
241773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (next_word > kMinAddressWords) {
242773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // Looking for the state is likely to add new words to the list while
243773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // checking for multi-word state names.
244773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        size_t state_first_word = next_word;
245773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        size_t state_last_word, state_index;
246773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
247773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            &tokenizer, &state_index)) {
248773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
249773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // A location name should have been found at this point.
250773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!found_location_name)
251773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
252773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
253773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Explicitly exclude "et al", as "al" is a valid state code.
254773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (current_word_length == 2 && words.size() > 2) {
255773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            const Word& previous_word = words[state_first_word - 1];
256773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if (previous_word.end - previous_word.begin == 2 &&
257773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
258773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                    "et") &&
259773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                LowerCaseEqualsASCII(current_word.begin, current_word.end,
260773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                    "al"))
261773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              break;
262773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
263773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
264773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Extract one more word from the tokenizer if not already available.
265773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          size_t zip_word = state_last_word + 1;
266773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (zip_word == words.size()) {
267773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            do {
2686bf2577653884795f04bbf9d8196ed9998896afeJohn Reck              if (!tokenizer.GetNext()) {
2696bf2577653884795f04bbf9d8196ed9998896afeJohn Reck                // Zip is optional
2706bf2577653884795f04bbf9d8196ed9998896afeJohn Reck                *start_pos = words[0].begin - begin;
2716bf2577653884795f04bbf9d8196ed9998896afeJohn Reck                *end_pos = words[state_last_word].end - begin;
2726bf2577653884795f04bbf9d8196ed9998896afeJohn Reck                return true;
2736bf2577653884795f04bbf9d8196ed9998896afeJohn Reck              }
274773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            } while (tokenizer.token_is_delim());
275773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            words.push_back(Word(tokenizer.token_begin(),
276773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                tokenizer.token_end()));
277773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
278773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
279773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          // Check the parsing validity and state range of the zip code.
280773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          next_word = state_last_word;
281773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!IsZipValid(words[zip_word], state_index))
282773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            continue;
283773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
284773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          *start_pos = words[0].begin - begin;
285773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          *end_pos = words[zip_word].end - begin;
286773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          return true;
287773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        }
288773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
289773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
290773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
291773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // Avoid skipping too many words because of a non-address number
292773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // at the beginning of the contents to parse.
293773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (continue_on_house_number && next_house_number_word > 0) {
294773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      it = words[next_house_number_word].begin;
295773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    } else {
296773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK(!words.empty());
297773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      next_word = std::min(next_word, words.size() - 1);
298773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      it = words[next_word].end;
299773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
300773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
301773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
302773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return false;
303773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
304773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
305773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::IsPreDelimiter(
306773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char16 character) {
307773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return character == ':' || IsPostDelimiter(character);
308773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
309773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
310773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::IsPostDelimiter(
311773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char16 character) {
312773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return IsWhitespace(character) || strchr(",\"'", character);
313773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
314773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
315773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
316773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  ResetState();
317773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
318773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
319773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
320773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
321773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
322773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      num_chars);
323773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  it_ += offset;
324773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  result_chars_ += offset;
325773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
326773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
327773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
328773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
329773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
330773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
331773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::ResetState() {
332773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  num_digits_ = 0;
333773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  result_chars_ = 0;
334773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
335773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
336773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
337773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // There should always be a number after a hyphen.
338773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
339773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
340773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
341773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (word) {
342773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    word->begin = it_ - result_chars_;
343773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    word->end = it_;
344773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
345773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return true;
346773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
347773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
348773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::Parse(
349773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& begin,
350773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const string16::const_iterator& end, Word* word) {
351773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  it_ = begin_ = begin;
352773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  end_ = end;
353773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  ResetState();
354773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
355773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Iterations only used as a fail-safe against any buggy infinite loops.
356773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t iterations = 0;
357773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t max_iterations = end - begin + 1;
358773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (; it_ != end_ && iterations < max_iterations; ++iterations) {
359773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
360773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // Word finished case.
361773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsPostDelimiter(*it_)) {
362773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (CheckFinished(word))
363773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        return true;
364773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      else if (result_chars_)
365773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        ResetState();
366773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
367773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      SkipChars(1);
368773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
369773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
370773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
371773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    // More digits. There should be no more after a letter was found.
372773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsAsciiDigit(*it_)) {
373773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (num_digits_ >= kMaxHouseDigits) {
374773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        RestartOnNextDelimiter();
375773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      } else {
376773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        AcceptChars(1);
377773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        ++num_digits_;
378773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
379773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
380773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
381773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
382773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsAsciiAlpha(*it_)) {
383773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Handle special case 'one'.
384773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (result_chars_ == 0) {
385773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
386773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          AcceptChars(3);
387773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        else
388773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          RestartOnNextDelimiter();
389773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        continue;
390773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
391773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
392773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // There should be more than 1 character because of result_chars.
393773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK_GT(result_chars_, 0U);
394773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      DCHECK_NE(it_, begin_);
395773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      char16 previous = SafePreviousChar(it_, begin_);
396773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (IsAsciiDigit(previous)) {
397773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // Check cases like '12A'.
398773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        char16 next = SafeNextChar(it_, end_);
399773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (IsPostDelimiter(next)) {
400773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          AcceptChars(1);
401773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          continue;
402773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        }
403773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
404773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
405773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        if (IsAsciiAlpha(next)) {
406773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          char16 last_digit = previous;
407773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          char16 first_letter = base::ToLowerASCII(*it_);
408773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          char16 second_letter = base::ToLowerASCII(next);
409773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
410773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              num_digits_ == 2;
411773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
412773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          switch (last_digit - '0') {
413773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 1:
414773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if ((first_letter == 's' && second_letter == 't') ||
415773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                (first_letter == 't' && second_letter == 'h' && is_teen)) {
416773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
417773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
418773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
419773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
420773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
421773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 2:
422773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if ((first_letter == 'n' && second_letter == 'd') ||
423773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                (first_letter == 't' && second_letter == 'h' && is_teen)) {
424773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
425773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
426773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
427773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
428773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
429773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 3:
430773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if ((first_letter == 'r' && second_letter == 'd') ||
431773979f92560dd1aead375c82fd75b584a141e5dJohn Reck                (first_letter == 't' && second_letter == 'h' && is_teen)) {
432773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
433773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
434773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
435773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
436773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
437773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 0:
438773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            // Explicitly exclude '0th'.
439773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if (num_digits_ == 1)
440773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              break;
441773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
442773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 4:
443773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 5:
444773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 6:
445773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 7:
446773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 8:
447773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          case 9:
448773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            if (first_letter == 't' && second_letter == 'h') {
449773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              AcceptChars(2);
450773979f92560dd1aead375c82fd75b584a141e5dJohn Reck              continue;
451773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            }
452773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
453773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
454773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          default:
455773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            NOTREACHED();
456773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          }
457773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        }
458773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
459773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
460773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      RestartOnNextDelimiter();
461773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
462773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
463773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
464773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (*it_ == '-' && num_digits_ > 0) {
465773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      AcceptChars(1);
466773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      ++num_digits_;
467773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
468773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
469773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
470773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    RestartOnNextDelimiter();
471773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    SkipChars(1);
472773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
473773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
474773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (iterations >= max_iterations)
475773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
476773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
477773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return CheckFinished(word);
478773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
479773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
480773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::FindStateStartingInWord(WordList* words,
481773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t state_first_word, size_t* state_last_word,
482773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    String16Tokenizer* tokenizer, size_t* state_index) {
483773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
484773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Bitmasks containing the allowed suffixes for 2-letter state codes.
485773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int state_two_letter_suffix[23] = {
486773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x02060c00,  // A followed by: [KLRSZ].
487773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // B.
488773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00084001,  // C followed by: [AOT].
489773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000014,  // D followed by: [CE].
490773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // E.
491773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00001800,  // F followed by: [LM].
492773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00100001,  // G followed by: [AU].
493773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000100,  // H followed by: [I].
494773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00002809,  // I followed by: [ADLN].
495773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // J.
496773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x01040000,  // K followed by: [SY].
497773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000001,  // L followed by: [A].
498773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x000ce199,  // M followed by: [ADEHINOPST].
499773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x0120129c,  // N followed by: [CDEHJMVY].
500773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00020480,  // O followed by: [HKR].
501773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00420001,  // P followed by: [ARW].
502773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000000,  // Q.
503773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00000100,  // R followed by: [I].
504773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x0000000c,  // S followed by: [CD].
505773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00802000,  // T followed by: [NX].
506773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00080000,  // U followed by: [T].
507773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x00080101,  // V followed by: [AIT].
508773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    0x01200101   // W followed by: [AIVY].
509773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
510773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
511773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Accumulative number of states for the 2-letter code indexed by the first.
512773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int state_two_letter_accumulative[24] = {
513773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     0,  5,  5,  8, 10, 10, 12, 14,
514773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    15, 19, 19, 21, 22, 32, 40, 43,
515773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    46, 46, 47, 49, 51, 52, 55, 59
516773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
517773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
518773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // State names sorted alphabetically with their lengths.
519773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // There can be more than one possible name for a same state if desired.
520773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const struct StateNameInfo {
521773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    const char* string;
522773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char first_word_length;
523773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char length;
524773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char state_index; // Relative to two-character code alphabetical order.
525773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  } state_names[59] = {
526773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
527773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
528773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "arkansas", 8, 8, 2 },
529773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
530773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
531773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "district of columbia", 8, 20, 8 },
532773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
533773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
534773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "hawaii", 6, 6, 14 },
535773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
536773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "iowa", 4, 4, 15 },
537773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
538773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "louisiana", 9, 9, 21 },
539773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
540773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
541773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
542773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
543773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "montana", 7, 7, 31 },
544773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
545773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
546773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
547773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
548773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "northern mariana islands", 8, 24, 29 },
549773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
550773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
551773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "puerto rico", 6, 11, 44 },
552773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "rhode island", 5, 5, 46 },
553773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
554773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
555773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "utah", 4, 4, 51 },
556773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
557773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "virginia", 8, 8, 52 },
558773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
559773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
560773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
561773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
562773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Accumulative number of states for sorted names indexed by the first letter.
563773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Required a different one since there are codes that don't share their
564773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // first letter with the name of their state (MP = Northern Mariana Islands).
565773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  static const int state_names_accumulative[24] = {
566773979f92560dd1aead375c82fd75b584a141e5dJohn Reck     0,  5,  5,  8, 10, 10, 12, 14,
567773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    15, 19, 19, 21, 22, 31, 40, 43,
568773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    46, 46, 47, 49, 51, 52, 55, 59
569773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  };
570773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
571773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
572773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
573773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
574773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  const Word& first_word = words->at(state_first_word);
575773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int length = first_word.end - first_word.begin;
576773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (length < 2 || !IsAsciiAlpha(*first_word.begin))
577773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
578773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
579773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // No state names start with x, y, z.
580773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  char16 first_letter = base::ToLowerASCII(*first_word.begin);
581773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (first_letter > 'w')
582773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
583773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
584773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  DCHECK(first_letter >= 'a');
585773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  int first_index = first_letter - 'a';
586773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
587773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Look for two-letter state names.
588773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
589773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
590773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    DCHECK(second_letter >= 'a');
591773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
592773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    int second_index = second_letter - 'a';
593773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
594773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return false;
595773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
596773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
597773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        ((1 << second_index) - 1);
598773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    *state_last_word = state_first_word;
599773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    *state_index = state_two_letter_accumulative[first_index] +
600773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        previous_suffixes.count();
601773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return true;
602773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
603773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
604773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  // Look for full state names by their first letter. Discard by length.
605773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (int state = state_names_accumulative[first_index];
606773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      state < state_names_accumulative[first_index + 1]; ++state) {
607773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (state_names[state].first_word_length != length)
608773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
609773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
610773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    bool state_match = false;
611773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t state_word = state_first_word;
612773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    for (int pos = 0; true; ) {
613773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
614773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          words->at(state_word).end, &state_names[state].string[pos]))
615773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        break;
616773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
617773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      pos += words->at(state_word).end - words->at(state_word).begin + 1;
618773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (pos >= state_names[state].length) {
619773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        state_match = true;
620773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        break;
621773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
622773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
623773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      // Ran out of words, extract more from the tokenizer.
624773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      if (++state_word == words->size()) {
625773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        do {
626773979f92560dd1aead375c82fd75b584a141e5dJohn Reck          if (!tokenizer->GetNext())
627773979f92560dd1aead375c82fd75b584a141e5dJohn Reck            break;
628773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        } while (tokenizer->token_is_delim());
629773979f92560dd1aead375c82fd75b584a141e5dJohn Reck        words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
630773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      }
631773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
632773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
633773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (state_match) {
634773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      *state_last_word = state_word;
635773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      *state_index = state_names[state].state_index;
636773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      return true;
637773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
638773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
639773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
640773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return false;
641773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
642773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
643773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
644773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  size_t length = word.end - word.begin;
645773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  if (length != kZipDigits && length != kZipPlus4Digits + 1)
646773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
647773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
648773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  for (string16::const_iterator it = word.begin; it != word.end; ++it) {
649773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    size_t pos = it - word.begin;
650773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
651773979f92560dd1aead375c82fd75b584a141e5dJohn Reck      continue;
652773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    return false;
653773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  }
654773979f92560dd1aead375c82fd75b584a141e5dJohn Reck  return IsZipValidForState(word, state_index);
655773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
656773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
6576bf2577653884795f04bbf9d8196ed9998896afeJohn Reckbool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
6586bf2577653884795f04bbf9d8196ed9998896afeJohn Reck{
6596bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    enum USState {
6606bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AP = -4, // AP (military base in the Pacific)
6616bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AA = -3, // AA (military base inside the US)
6626bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AE = -2, // AE (military base outside the US)
6636bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        XX = -1, // (not in use)
6646bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AK =  0, // AK Alaska
6656bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AL =  1, // AL Alabama
6666bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AR =  2, // AR Arkansas
6676bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AS =  3, // AS American Samoa
6686bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AZ =  4, // AZ Arizona
6696bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA =  5, // CA California
6706bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CO =  6, // CO Colorado
6716bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CT =  7, // CT Connecticut
6726bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        DC =  8, // DC District of Columbia
6736bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        DE =  9, // DE Delaware
6746bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        FL = 10, // FL Florida
6756bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        FM = 11, // FM Federated States of Micronesia
6766bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        GA = 12, // GA Georgia
6776bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        GU = 13, // GU Guam
6786bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        HI = 14, // HI Hawaii
6796bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IA = 15, // IA Iowa
6806bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        ID = 16, // ID Idaho
6816bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IL = 17, // IL Illinois
6826bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IN = 18, // IN Indiana
6836bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KS = 19, // KS Kansas
6846bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KY = 20, // KY Kentucky
6856bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        LA = 21, // LA Louisiana
6866bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MA = 22, // MA Massachusetts
6876bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MD = 23, // MD Maryland
6886bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        ME = 24, // ME Maine
6896bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MH = 25, // MH Marshall Islands
6906bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MI = 26, // MI Michigan
6916bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MN = 27, // MN Minnesota
6926bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MO = 28, // MO Missouri
6936bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MP = 29, // MP Northern Mariana Islands
6946bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MS = 30, // MS Mississippi
6956bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MT = 31, // MT Montana
6966bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NC = 32, // NC North Carolina
6976bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        ND = 33, // ND North Dakota
6986bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NE = 34, // NE Nebraska
6996bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NH = 35, // NH New Hampshire
7006bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NJ = 36, // NJ New Jersey
7016bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NM = 37, // NM New Mexico
7026bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NV = 38, // NV Nevada
7036bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NY = 39, // NY New York
7046bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OH = 40, // OH Ohio
7056bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OK = 41, // OK Oklahoma
7066bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OR = 42, // OR Oregon
7076bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PA = 43, // PA Pennsylvania
7086bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PR = 44, // PR Puerto Rico
7096bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PW = 45, // PW Palau
7106bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        RI = 46, // RI Rhode Island
7116bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        SC = 47, // SC South Carolina
7126bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        SD = 48, // SD South Dakota
7136bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TN = 49, // TN Tennessee
7146bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TX = 50, // TX Texas
7156bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        UT = 51, // UT Utah
7166bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VA = 52, // VA Virginia
7176bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VI = 53, // VI Virgin Islands
7186bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VT = 54, // VT Vermont
7196bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WA = 55, // WA Washington
7206bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WI = 56, // WI Wisconsin
7216bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WV = 57, // WV West Virginia
7226bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WY = 58, // WY Wyoming
7236bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    };
7246bf2577653884795f04bbf9d8196ed9998896afeJohn Reck
7256bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    static const USState stateForZipPrefix[] = {
7266bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    //   0   1   2   3   4   5   6   7   8   9
7276bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
7286bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
7296bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
7306bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
7316bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
7326bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
7336bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
7346bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
7356bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
7366bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
7376bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
7386bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
7396bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
7406bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
7416bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
7426bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
7436bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
7446bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
7456bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
7466bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
7476bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
7486bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
7496bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
7506bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
7516bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
7526bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
7536bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
7546bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
7556bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
7566bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
7576bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
7586bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
7596bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
7606bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
7616bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
7626bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
7636bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
7646bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
7656bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
7666bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
7676bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
7686bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
7696bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
7706bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
7716bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
7726bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
7736bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
7746bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
7756bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
7766bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
7776bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
7786bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
7796bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
7806bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
7816bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
7826bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
7836bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
7846bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
7856bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
7866bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
7876bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
7886bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
7896bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
7906bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
7916bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
7926bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
7936bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
7946bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
7956bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
7966bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
7976bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
7986bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
7996bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
8006bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
8016bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
8026bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
8036bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
8046bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
8056bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
8066bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
8076bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
8086bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
8096bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
8106bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
8116bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
8126bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
8136bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
8146bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
8156bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
8166bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
8176bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
8186bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
8196bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
8206bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
8216bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
8226bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
8236bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
8246bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
8256bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
8266bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
8276bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    };
8286bf2577653884795f04bbf9d8196ed9998896afeJohn Reck
8296bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    if (!word.begin || !word.end || (word.end - word.begin) < 3)
8306bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        return false;
8316bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    const char16* zipPtr = word.begin;
8326bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
8336bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        zipPtr[1] < '0' || zipPtr[1] > '9' ||
8346bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        zipPtr[2] < '0' || zipPtr[2] > '9')
8356bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        return false;
8366bf2577653884795f04bbf9d8196ed9998896afeJohn Reck
8376bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    int zip = zipPtr[0] - '0';
8386bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    zip *= 10;
8396bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    zip += zipPtr[1] - '0';
8406bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    zip *= 10;
8416bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    zip += zipPtr[2] - '0';
8426bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    return stateForZipPrefix[zip] == (int) state_index;
843773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
844773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
// Street-suffix words and abbreviations (all lowercase) that the address
// detector accepts as a location name. Entries are laid out by initial
// letter; the table is terminated by a null pointer so it can be walked
// without a separate length.
static const char* s_rawStreetSuffixes[] = {
    // a
    "allee", "alley", "ally", "aly", "anex", "annex", "anx", "arc", "arcade",
    "av", "ave", "aven", "avenu", "avenue", "avn", "avnue",
    // b
    "bayoo", "bayou", "bch", "beach", "bend", "bg", "bgs", "blf", "blfs",
    "bluf", "bluff", "bluffs", "blvd", "bnd", "bot", "bottm", "bottom", "boul",
    "boulevard", "boulv", "br", "branch", "brdge", "brg", "bridge", "brk",
    "brks", "brnch", "brook", "brooks", "btm", "burg", "burgs", "byp", "bypa",
    "bypas", "bypass", "byps", "byu",
    // c
    "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
    "center", "centers", "centr", "centre", "cir", "circ", "circl", "circle",
    "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff", "cliffs", "club",
    "cmn", "cmp", "cnter", "cntr", "cnyn", "common", "cor", "corner", "corners",
    "cors", "course", "court", "courts", "cove", "coves", "cp", "cpe", "cr",
    "crcl", "crcle", "crecent", "creek", "cres", "crescent", "cresent", "crest",
    "crk", "crossing", "crossroad", "crscnt", "crse", "crsent", "crsnt",
    "crssing", "crssng", "crst", "crt", "cswy", "ct", "ctr", "ctrs", "cts",
    "curv", "curve", "cv", "cvs", "cyn",
    // d
    "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive", "drives",
    "drs", "drv", "dv", "dvd",
    // e
    "est", "estate", "estates", "ests", "exp", "expr", "express",
    "expressway", "expw", "expy", "ext", "extension", "extensions", "extn",
    "extnsn", "exts",
    // f
    "fall", "falls", "ferry", "field", "fields", "flat", "flats", "fld", "flds",
    "fls", "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
    "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy", "frg",
    "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy", "fry", "ft",
    "fwy",
    // g
    "garden", "gardens", "gardn", "gateway", "gatewy", "gatway", "gdn", "gdns",
    "glen", "glens", "gln", "glns", "grden", "grdn", "grdns", "green", "greens",
    "grn", "grns", "grov", "grove", "groves", "grv", "grvs", "gtway", "gtwy",
    // h
    "harb", "harbor", "harbors", "harbr", "haven", "havn", "hbr", "hbrs",
    "height", "heights", "hgts", "highway", "highwy", "hill", "hills", "hiway",
    "hiwy", "hl", "hllw", "hls", "hollow", "hollows", "holw", "holws", "hrbor",
    "ht", "hts", "hvn", "hway", "hwy",
    // i
    "inlet", "inlt", "is", "island", "islands", "isle", "isles", "islnd",
    "islnds", "iss",
    // j
    "jct", "jction", "jctn", "jctns", "jcts", "junction", "junctions",
    "junctn", "juncton",
    // k
    "key", "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys",
    // l
    "la", "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
    "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks", "ln",
    "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop", "loops",
    // m
    "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows", "medows",
    "mews", "mill", "mills", "mission", "missn", "ml", "mls", "mnr", "mnrs",
    "mnt", "mntain", "mntn", "mntns", "motorway", "mount", "mountain",
    "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn", "mtns", "mtwy",
    // n
    "nck", "neck",
    // o
    "opas", "orch", "orchard", "orchrd", "oval", "overpass", "ovl",
    // p
    "park", "parks", "parkway", "parkways", "parkwy", "pass", "passage",
    "path", "paths", "pike", "pikes", "pine", "pines", "pk", "pkway", "pkwy",
    "pkwys", "pky", "pl", "place", "plain", "plaines", "plains", "plaza", "pln",
    "plns", "plz", "plza", "pne", "pnes", "point", "points", "port", "ports",
    "pr", "prairie", "prarie", "prk", "prr", "prt", "prts", "psge", "pt", "pts",
    // r
    "rad", "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rapid",
    "rapids", "rd", "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge",
    "ridges", "riv", "river", "rivr", "rnch", "rnchs", "road", "roads", "route",
    "row", "rpd", "rpds", "rst", "rte", "rue", "run", "rvr",
    // s
    "shl", "shls", "shoal", "shoals", "shoar", "shoars", "shore", "shores",
    "shr", "shrs", "skwy", "skyway", "smt", "spg", "spgs", "spng", "spngs",
    "spring", "springs", "sprng", "sprngs", "spur", "spurs", "sq", "sqr", "sqre",
    "sqrs", "sqs", "squ", "square", "squares", "st", "sta", "station", "statn",
    "stn", "str", "stra", "strav", "strave", "straven", "stravenue", "stravn",
    "stream", "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
    "sts", "sumit", "sumitt", "summit",
    // t
    "ter", "terr", "terrace", "throughway", "tpk", "tpke", "tr", "trace",
    "traces", "track", "tracks", "trafficway", "trail", "trails", "trak", "trce",
    "trfy", "trk", "trks", "trl", "trls", "trnpk", "trpk", "trwy", "tunel",
    "tunl", "tunls", "tunnel", "tunnels", "tunnl", "turnpike", "turnpk",
    // u
    "un", "underpass", "union", "unions", "uns", "upas",
    // v
    "valley", "valleys", "vally", "vdct", "via", "viadct", "viaduct", "view",
    "views", "vill", "villag", "village", "villages", "ville", "villg",
    "villiage", "vis", "vist", "vista", "vl", "vlg", "vlgs", "vlly", "vly",
    "vlys", "vst", "vsta", "vw", "vws",
    // w
    "walk", "walks", "wall", "way", "ways", "well", "wells", "wl", "wls", "wy",
    // x
    "xing", "xrd",
    // End-of-table sentinel.
    0,
};

918773979f92560dd1aead375c82fd75b584a141e5dJohn Reck
9196bf2577653884795f04bbf9d8196ed9998896afeJohn Reckbool AddressDetector::IsValidLocationName(const Word& word) {
9206bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    using namespace WTF;
9216bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    static HashSet<String> streetNames;
9226bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    if (!streetNames.size()) {
9236bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        const char** suffixes = s_rawStreetSuffixes;
9246bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        while (const char* suffix = *suffixes) {
9256bf2577653884795f04bbf9d8196ed9998896afeJohn Reck            int index = suffix[0] - 'a';
9266bf2577653884795f04bbf9d8196ed9998896afeJohn Reck            streetNames.add(suffix);
9276bf2577653884795f04bbf9d8196ed9998896afeJohn Reck            suffixes++;
9286bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        }
929773979f92560dd1aead375c82fd75b584a141e5dJohn Reck    }
9306bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    char16 first_letter = base::ToLowerASCII(*word.begin);
9316bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    if (first_letter > 'z' || first_letter < 'a')
9326bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        return false;
9336bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    int index = first_letter - 'a';
9346bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    int length = std::distance(word.begin, word.end);
9356bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    if (*word.end == '.')
9366bf2577653884795f04bbf9d8196ed9998896afeJohn Reck        length--;
9376bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    String value(word.begin, length);
9386bf2577653884795f04bbf9d8196ed9998896afeJohn Reck    return streetNames.contains(value.lower());
939773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}
940