/*
 * Copyright (C) 2012 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29773979f92560dd1aead375c82fd75b584a141e5dJohn Reck */ 30773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 31773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "config.h" 32773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 33773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Magic pretend-to-be-a-chromium-build flags 34773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#undef WEBKIT_IMPLEMENTATION 35773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#undef LOG 36773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 37773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "content/address_detector.h" 38773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 39773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include <bitset> 40773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 41773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "base/utf_string_conversions.h" 42773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "net/base/escape.h" 43917ab176521f67983bd1c7cdb99f55ce8fc412f5John Reck#include "Settings.h" 44773979f92560dd1aead375c82fd75b584a141e5dJohn Reck#include "WebString.h" 45773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 466bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/HashSet.h> 476bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/Noncopyable.h> 486bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/text/StringHash.h> 496bf2577653884795f04bbf9d8196ed9998896afeJohn Reck#include <wtf/text/WTFString.h> 506bf2577653884795f04bbf9d8196ed9998896afeJohn Reck 51773979f92560dd1aead375c82fd75b584a141e5dJohn Recknamespace { 52773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 53773979f92560dd1aead375c82fd75b584a141e5dJohn Reck// Prefix used for geographical address intent URIs. 
static const char kAddressSchemaPrefix[] = "geo:0,0?q=";

// Maximum text length to be searched for address detection.
static const size_t kMaxAddressLength = 500;

// Minimum number of words in an address after the house number
// before a state is expected to be found.
// A value too high can miss short addresses.
const size_t kMinAddressWords = 3;

// Maximum number of words allowed in an address between the house number
// and the state, both not included.
const size_t kMaxAddressWords = 12;

// Maximum number of lines allowed in an address between the house number
// and the state, both not included.
const size_t kMaxAddressLines = 5;

// Maximum length allowed for any address word between the house number
// and the state, both not included.
const size_t kMaxAddressNameWordLength = 25;

// Maximum number of words after the house number in which the location name
// should be found.
const size_t kMaxLocationNameDistance = 4;

// Number of digits for a valid zip code.
const size_t kZipDigits = 5;

// Number of digits for a valid zip code in the Zip Plus 4 format.
const size_t kZipPlus4Digits = 9;

// Maximum number of digits of a house number, including possible hyphens.
const size_t kMaxHouseDigits = 5;

// Additional characters used as new line delimiters.
90773979f92560dd1aead375c82fd75b584a141e5dJohn Reckconst char16 kNewlineDelimiters[] = { 91773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ',', 92773979f92560dd1aead375c82fd75b584a141e5dJohn Reck '*', 93773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x2022, // Unicode bullet 9477974d2fe8f7fb9c421c8d4240e02e9d76ec2c27Selim Gurun 0, 95773979f92560dd1aead375c82fd75b584a141e5dJohn Reck}; 96773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 97773979f92560dd1aead375c82fd75b584a141e5dJohn Reckchar16 SafePreviousChar(const string16::const_iterator& it, 98773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16::const_iterator& begin) { 99773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (it == begin) 100773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return ' '; 101773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return *(it - 1); 102773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 103773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 104773979f92560dd1aead375c82fd75b584a141e5dJohn Reckchar16 SafeNextChar(const string16::const_iterator& it, 105773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16::const_iterator& end) { 106773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (it == end) 107773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return ' '; 108773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return *(it + 1); 109773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 110773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 111773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool WordLowerCaseEqualsASCII(string16::const_iterator word_begin, 112773979f92560dd1aead375c82fd75b584a141e5dJohn Reck string16::const_iterator word_end, const char* ascii_to_match) { 113773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (string16::const_iterator it = word_begin; it != word_end; 114773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ++it, ++ascii_to_match) { 115773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!*ascii_to_match || 
base::ToLowerASCII(*it) != *ascii_to_match) 116773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 117773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 118773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return *ascii_to_match == 0 || *ascii_to_match == ' '; 119773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 120773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 121773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin, 122773979f92560dd1aead375c82fd75b584a141e5dJohn Reck string16::const_iterator word_end, const char* ascii_to_match, 123773979f92560dd1aead375c82fd75b584a141e5dJohn Reck bool allow_plural) { 124773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (string16::const_iterator it = word_begin; it != word_end; 125773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ++it, ++ascii_to_match) { 126773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end) 127773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return true; 128773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 129773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 130773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 131773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 132773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return *ascii_to_match == 0; 133773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 134773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 135773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} // anonymous namespace 136773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 137773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 138773979f92560dd1aead375c82fd75b584a141e5dJohn ReckAddressDetector::AddressDetector() { 139773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 140773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 141773979f92560dd1aead375c82fd75b584a141e5dJohn 
ReckAddressDetector::~AddressDetector() { 142773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 143773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 144773979f92560dd1aead375c82fd75b584a141e5dJohn Reckstd::string AddressDetector::GetContentText(const WebKit::WebRange& range) { 145773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Get the address and replace unicode bullets with commas. 146773979f92560dd1aead375c82fd75b584a141e5dJohn Reck string16 address_16 = CollapseWhitespace(range.toPlainText(), false); 147773979f92560dd1aead375c82fd75b584a141e5dJohn Reck std::replace(address_16.begin(), address_16.end(), 148773979f92560dd1aead375c82fd75b584a141e5dJohn Reck static_cast<char16>(0x2022), static_cast<char16>(',')); 149773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return UTF16ToUTF8(address_16); 150773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 151773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 152773979f92560dd1aead375c82fd75b584a141e5dJohn ReckGURL AddressDetector::GetIntentURL(const std::string& content_text) { 153773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return GURL(kAddressSchemaPrefix + 154773979f92560dd1aead375c82fd75b584a141e5dJohn Reck EscapeQueryParamValue(content_text, true)); 155773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 156773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 157773979f92560dd1aead375c82fd75b584a141e5dJohn Recksize_t AddressDetector::GetMaximumContentLength() { 158773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return kMaxAddressLength; 159773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 160773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 161917ab176521f67983bd1c7cdb99f55ce8fc412f5John Reckbool AddressDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) { 162917ab176521f67983bd1c7cdb99f55ce8fc412f5John Reck WebCore::Settings* settings = GetSettings(hit_test); 163917ab176521f67983bd1c7cdb99f55ce8fc412f5John Reck return settings && settings->formatDetectionAddress(); 
164917ab176521f67983bd1c7cdb99f55ce8fc412f5John Reck} 165917ab176521f67983bd1c7cdb99f55ce8fc412f5John Reck 166773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::FindContent(const string16::const_iterator& begin, 167773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) { 168773979f92560dd1aead375c82fd75b584a141e5dJohn Reck HouseNumberParser house_number_parser; 169773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 170773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Keep going through the input string until a potential house number is 171773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // detected. Start tokenizing the following words to find a valid 172773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // street name within a word range. Then, find a state name followed 173773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // by a valid zip code for that state. Also keep a look for any other 174773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // possible house numbers to continue from in case of no match and for 175773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // state names not followed by a zip code (e.g. New York, NY 10000). 
176773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16 newline_delimiters = kNewlineDelimiters; 177773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16 delimiters = kWhitespaceUTF16 + newline_delimiters; 178773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (string16::const_iterator it = begin; it != end; ) { 179773979f92560dd1aead375c82fd75b584a141e5dJohn Reck Word house_number; 180773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!house_number_parser.Parse(it, end, &house_number)) 181773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 182773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 183773979f92560dd1aead375c82fd75b584a141e5dJohn Reck String16Tokenizer tokenizer(house_number.end, end, delimiters); 184773979f92560dd1aead375c82fd75b584a141e5dJohn Reck tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); 185773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 186773979f92560dd1aead375c82fd75b584a141e5dJohn Reck std::vector<Word> words; 187773979f92560dd1aead375c82fd75b584a141e5dJohn Reck words.push_back(house_number); 188773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 189773979f92560dd1aead375c82fd75b584a141e5dJohn Reck bool found_location_name = false; 190773979f92560dd1aead375c82fd75b584a141e5dJohn Reck bool continue_on_house_number = true; 191773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t next_house_number_word = 0; 192773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t num_lines = 1; 193773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 194773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Don't include the house number in the word count. 195773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t next_word = 1; 196773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (; next_word <= kMaxAddressWords + 1; ++next_word) { 197773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 198773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Extract a new word from the tokenizer. 
199773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (next_word == words.size()) { 200773979f92560dd1aead375c82fd75b584a141e5dJohn Reck do { 201773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!tokenizer.GetNext()) 202773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 203773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 204773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Check the number of address lines. 205773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (tokenizer.token_is_delim() && newline_delimiters.find( 206773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *tokenizer.token_begin()) != string16::npos) { 207773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ++num_lines; 208773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 209773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } while (tokenizer.token_is_delim()); 210773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 211773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (num_lines > kMaxAddressLines) 212773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 213773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 214773979f92560dd1aead375c82fd75b584a141e5dJohn Reck words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); 215773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 216773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 217773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Check the word length. If too long, don't try to continue from 218773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // the next house number as no address can hold this word. 
219773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const Word& current_word = words[next_word]; 220773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); 221773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t current_word_length = std::distance( 222773979f92560dd1aead375c82fd75b584a141e5dJohn Reck current_word.begin, current_word.end); 223773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (current_word_length > kMaxAddressNameWordLength) { 224773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue_on_house_number = false; 225773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 226773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 227773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 228773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Check if the new word is a valid house number. 229773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // This is used to properly resume parsing in case the maximum number 230773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // of words is exceeded. 231773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (next_house_number_word == 0 && 232773979f92560dd1aead375c82fd75b584a141e5dJohn Reck house_number_parser.Parse(current_word.begin, current_word.end, NULL)) { 233773979f92560dd1aead375c82fd75b584a141e5dJohn Reck next_house_number_word = next_word; 234773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 235773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 236773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 237773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Look for location names in the words after the house number. 238773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // A range limitation is introduced to avoid matching 239773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // anything that starts with a number before a legitimate address. 
240773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (next_word <= kMaxLocationNameDistance && 241773979f92560dd1aead375c82fd75b584a141e5dJohn Reck IsValidLocationName(current_word)) { 242773979f92560dd1aead375c82fd75b584a141e5dJohn Reck found_location_name = true; 243773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 244773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 245773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 246773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Don't count the house number. 247773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (next_word > kMinAddressWords) { 248773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Looking for the state is likely to add new words to the list while 249773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // checking for multi-word state names. 250773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t state_first_word = next_word; 251773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t state_last_word, state_index; 252773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (FindStateStartingInWord(&words, state_first_word, &state_last_word, 253773979f92560dd1aead375c82fd75b584a141e5dJohn Reck &tokenizer, &state_index)) { 254773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 255773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // A location name should have been found at this point. 256773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!found_location_name) 257773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 258773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 259773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Explicitly exclude "et al", as "al" is a valid state code. 
260773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (current_word_length == 2 && words.size() > 2) { 261773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const Word& previous_word = words[state_first_word - 1]; 262773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (previous_word.end - previous_word.begin == 2 && 263773979f92560dd1aead375c82fd75b584a141e5dJohn Reck LowerCaseEqualsASCII(previous_word.begin, previous_word.end, 264773979f92560dd1aead375c82fd75b584a141e5dJohn Reck "et") && 265773979f92560dd1aead375c82fd75b584a141e5dJohn Reck LowerCaseEqualsASCII(current_word.begin, current_word.end, 266773979f92560dd1aead375c82fd75b584a141e5dJohn Reck "al")) 267773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 268773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 269773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 270773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Extract one more word from the tokenizer if not already available. 271773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t zip_word = state_last_word + 1; 272773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (zip_word == words.size()) { 273773979f92560dd1aead375c82fd75b584a141e5dJohn Reck do { 2746bf2577653884795f04bbf9d8196ed9998896afeJohn Reck if (!tokenizer.GetNext()) { 2756bf2577653884795f04bbf9d8196ed9998896afeJohn Reck // Zip is optional 2766bf2577653884795f04bbf9d8196ed9998896afeJohn Reck *start_pos = words[0].begin - begin; 2776bf2577653884795f04bbf9d8196ed9998896afeJohn Reck *end_pos = words[state_last_word].end - begin; 2786bf2577653884795f04bbf9d8196ed9998896afeJohn Reck return true; 2796bf2577653884795f04bbf9d8196ed9998896afeJohn Reck } 280773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } while (tokenizer.token_is_delim()); 281773979f92560dd1aead375c82fd75b584a141e5dJohn Reck words.push_back(Word(tokenizer.token_begin(), 282773979f92560dd1aead375c82fd75b584a141e5dJohn Reck tokenizer.token_end())); 283773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 
284773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 285773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Check the parsing validity and state range of the zip code. 286773979f92560dd1aead375c82fd75b584a141e5dJohn Reck next_word = state_last_word; 287773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!IsZipValid(words[zip_word], state_index)) 288773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 289773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 290773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *start_pos = words[0].begin - begin; 291773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *end_pos = words[zip_word].end - begin; 292773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return true; 293773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 294773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 295773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 296773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 297773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Avoid skipping too many words because of a non-address number 298773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // at the beginning of the contents to parse. 
299773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (continue_on_house_number && next_house_number_word > 0) { 300773979f92560dd1aead375c82fd75b584a141e5dJohn Reck it = words[next_house_number_word].begin; 301773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } else { 302773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK(!words.empty()); 303773979f92560dd1aead375c82fd75b584a141e5dJohn Reck next_word = std::min(next_word, words.size() - 1); 304773979f92560dd1aead375c82fd75b584a141e5dJohn Reck it = words[next_word].end; 305773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 306773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 307773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 308773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 309773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 310773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 311773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::IsPreDelimiter( 312773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 character) { 313773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return character == ':' || IsPostDelimiter(character); 314773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 315773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 316773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::IsPostDelimiter( 317773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 character) { 318773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return IsWhitespace(character) || strchr(",\"'", character); 319773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 320773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 321773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::RestartOnNextDelimiter() { 322773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ResetState(); 323773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {} 
324773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 325773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 326773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) { 327773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)), 328773979f92560dd1aead375c82fd75b584a141e5dJohn Reck num_chars); 329773979f92560dd1aead375c82fd75b584a141e5dJohn Reck it_ += offset; 330773979f92560dd1aead375c82fd75b584a141e5dJohn Reck result_chars_ += offset; 331773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 332773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 333773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) { 334773979f92560dd1aead375c82fd75b584a141e5dJohn Reck it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars); 335773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 336773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 337773979f92560dd1aead375c82fd75b584a141e5dJohn Reckvoid AddressDetector::HouseNumberParser::ResetState() { 338773979f92560dd1aead375c82fd75b584a141e5dJohn Reck num_digits_ = 0; 339773979f92560dd1aead375c82fd75b584a141e5dJohn Reck result_chars_ = 0; 340773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 341773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 342773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const { 343773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // There should always be a number after a hyphen. 
344773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-') 345773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 346773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 347773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (word) { 348773979f92560dd1aead375c82fd75b584a141e5dJohn Reck word->begin = it_ - result_chars_; 349773979f92560dd1aead375c82fd75b584a141e5dJohn Reck word->end = it_; 350773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 351773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return true; 352773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 353773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 354773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::HouseNumberParser::Parse( 355773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16::const_iterator& begin, 356773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const string16::const_iterator& end, Word* word) { 357773979f92560dd1aead375c82fd75b584a141e5dJohn Reck it_ = begin_ = begin; 358773979f92560dd1aead375c82fd75b584a141e5dJohn Reck end_ = end; 359773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ResetState(); 360773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 361773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Iterations only used as a fail-safe against any buggy infinite loops. 362773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t iterations = 0; 363773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t max_iterations = end - begin + 1; 364773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (; it_ != end_ && iterations < max_iterations; ++iterations) { 365773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 366773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Word finished case. 
367773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsPostDelimiter(*it_)) { 368773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (CheckFinished(word)) 369773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return true; 370773979f92560dd1aead375c82fd75b584a141e5dJohn Reck else if (result_chars_) 371773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ResetState(); 372773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 373773979f92560dd1aead375c82fd75b584a141e5dJohn Reck SkipChars(1); 374773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 375773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 376773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 377773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // More digits. There should be no more after a letter was found. 378773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsAsciiDigit(*it_)) { 379773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (num_digits_ >= kMaxHouseDigits) { 380773979f92560dd1aead375c82fd75b584a141e5dJohn Reck RestartOnNextDelimiter(); 381773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } else { 382773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(1); 383773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ++num_digits_; 384773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 385773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 386773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 387773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 388773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsAsciiAlpha(*it_)) { 389773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Handle special case 'one'. 
390773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (result_chars_ == 0) { 391773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one")) 392773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(3); 393773979f92560dd1aead375c82fd75b584a141e5dJohn Reck else 394773979f92560dd1aead375c82fd75b584a141e5dJohn Reck RestartOnNextDelimiter(); 395773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 396773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 397773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 398773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // There should be more than 1 character because of result_chars. 399773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK_GT(result_chars_, 0U); 400773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK_NE(it_, begin_); 401773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 previous = SafePreviousChar(it_, begin_); 402773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsAsciiDigit(previous)) { 403773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Check cases like '12A'. 404773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 next = SafeNextChar(it_, end_); 405773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsPostDelimiter(next)) { 406773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(1); 407773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 408773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 409773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 410773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Handle cases like 12a, 1st, 2nd, 3rd, 7th. 
411773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsAsciiAlpha(next)) { 412773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 last_digit = previous; 413773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 first_letter = base::ToLowerASCII(*it_); 414773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 second_letter = base::ToLowerASCII(next); 415773979f92560dd1aead375c82fd75b584a141e5dJohn Reck bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' && 416773979f92560dd1aead375c82fd75b584a141e5dJohn Reck num_digits_ == 2; 417773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 418773979f92560dd1aead375c82fd75b584a141e5dJohn Reck switch (last_digit - '0') { 419773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 1: 420773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if ((first_letter == 's' && second_letter == 't') || 421773979f92560dd1aead375c82fd75b584a141e5dJohn Reck (first_letter == 't' && second_letter == 'h' && is_teen)) { 422773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(2); 423773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 424773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 425773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 426773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 427773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 2: 428773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if ((first_letter == 'n' && second_letter == 'd') || 429773979f92560dd1aead375c82fd75b584a141e5dJohn Reck (first_letter == 't' && second_letter == 'h' && is_teen)) { 430773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(2); 431773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 432773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 433773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 434773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 435773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 3: 436773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if ((first_letter == 'r' && second_letter 
== 'd') || 437773979f92560dd1aead375c82fd75b584a141e5dJohn Reck (first_letter == 't' && second_letter == 'h' && is_teen)) { 438773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(2); 439773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 440773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 441773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 442773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 443773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 0: 444773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Explicitly exclude '0th'. 445773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (num_digits_ == 1) 446773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 447773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 448773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 4: 449773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 5: 450773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 6: 451773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 7: 452773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 8: 453773979f92560dd1aead375c82fd75b584a141e5dJohn Reck case 9: 454773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (first_letter == 't' && second_letter == 'h') { 455773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(2); 456773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 457773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 458773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 459773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 460773979f92560dd1aead375c82fd75b584a141e5dJohn Reck default: 461773979f92560dd1aead375c82fd75b584a141e5dJohn Reck NOTREACHED(); 462773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 463773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 464773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 465773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 466773979f92560dd1aead375c82fd75b584a141e5dJohn Reck RestartOnNextDelimiter(); 467773979f92560dd1aead375c82fd75b584a141e5dJohn 
Reck continue; 468773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 469773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 470773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (*it_ == '-' && num_digits_ > 0) { 471773979f92560dd1aead375c82fd75b584a141e5dJohn Reck AcceptChars(1); 472773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ++num_digits_; 473773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 474773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 475773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 476773979f92560dd1aead375c82fd75b584a141e5dJohn Reck RestartOnNextDelimiter(); 477773979f92560dd1aead375c82fd75b584a141e5dJohn Reck SkipChars(1); 478773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 479773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 480773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (iterations >= max_iterations) 481773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 482773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 483773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return CheckFinished(word); 484773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 485773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 486773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::FindStateStartingInWord(WordList* words, 487773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t state_first_word, size_t* state_last_word, 488773979f92560dd1aead375c82fd75b584a141e5dJohn Reck String16Tokenizer* tokenizer, size_t* state_index) { 489773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 490773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Bitmasks containing the allowed suffixes for 2-letter state codes. 491773979f92560dd1aead375c82fd75b584a141e5dJohn Reck static const int state_two_letter_suffix[23] = { 492773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x02060c00, // A followed by: [KLRSZ]. 493773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000000, // B. 
494773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00084001, // C followed by: [AOT]. 495773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000014, // D followed by: [CE]. 496773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000000, // E. 497773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00001800, // F followed by: [LM]. 498773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00100001, // G followed by: [AU]. 499773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000100, // H followed by: [I]. 500773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00002809, // I followed by: [ADLN]. 501773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000000, // J. 502773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x01040000, // K followed by: [SY]. 503773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000001, // L followed by: [A]. 504773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x000ce199, // M followed by: [ADEHINOPST]. 505773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x0120129c, // N followed by: [CDEHJMVY]. 506773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00020480, // O followed by: [HKR]. 507773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00420001, // P followed by: [ARW]. 508773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000000, // Q. 509773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00000100, // R followed by: [I]. 510773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x0000000c, // S followed by: [CD]. 511773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00802000, // T followed by: [NX]. 512773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00080000, // U followed by: [T]. 513773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x00080101, // V followed by: [AIT]. 514773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0x01200101 // W followed by: [AIVY]. 
515773979f92560dd1aead375c82fd75b584a141e5dJohn Reck }; 516773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 517773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Accumulative number of states for the 2-letter code indexed by the first. 518773979f92560dd1aead375c82fd75b584a141e5dJohn Reck static const int state_two_letter_accumulative[24] = { 519773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0, 5, 5, 8, 10, 10, 12, 14, 520773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 15, 19, 19, 21, 22, 32, 40, 43, 521773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 46, 46, 47, 49, 51, 52, 55, 59 522773979f92560dd1aead375c82fd75b584a141e5dJohn Reck }; 523773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 524773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // State names sorted alphabetically with their lengths. 525773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // There can be more than one possible name for a same state if desired. 526773979f92560dd1aead375c82fd75b584a141e5dJohn Reck static const struct StateNameInfo { 527773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const char* string; 528773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char first_word_length; 529773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char length; 530773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char state_index; // Relative to two-character code alphabetical order. 
531773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } state_names[59] = { 532773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 }, 533773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 }, 534773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "arkansas", 8, 8, 2 }, 535773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 }, 536773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 }, 537773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "district of columbia", 8, 20, 8 }, 538773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 }, 539773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 }, 540773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "hawaii", 6, 6, 14 }, 541773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 }, 542773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "iowa", 4, 4, 15 }, 543773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 }, 544773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "louisiana", 9, 9, 21 }, 545773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 }, 546773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 }, 547773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 }, 548773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 }, 549773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "montana", 7, 7, 31 }, 550773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 }, 
551773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 }, 552773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 }, 553773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 }, 554773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "northern mariana islands", 8, 24, 29 }, 555773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 }, 556773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 }, 557773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "puerto rico", 6, 11, 44 }, 558773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "rhode island", 5, 5, 46 }, 559773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 }, 560773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 }, 561773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "utah", 4, 4, 51 }, 562773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 }, 563773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "virginia", 8, 8, 52 }, 564773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 }, 565773979f92560dd1aead375c82fd75b584a141e5dJohn Reck { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 } 566773979f92560dd1aead375c82fd75b584a141e5dJohn Reck }; 567773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 568773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Accumulative number of states for sorted names indexed by the first letter. 
569773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Required a different one since there are codes that don't share their 570773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // first letter with the name of their state (MP = Northern Mariana Islands). 571773979f92560dd1aead375c82fd75b584a141e5dJohn Reck static const int state_names_accumulative[24] = { 572773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 0, 5, 5, 8, 10, 10, 12, 14, 573773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 15, 19, 19, 21, 22, 31, 40, 43, 574773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 46, 46, 47, 49, 51, 52, 55, 59 575773979f92560dd1aead375c82fd75b584a141e5dJohn Reck }; 576773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 577773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1], 578773979f92560dd1aead375c82fd75b584a141e5dJohn Reck static_cast<int>(ARRAYSIZE_UNSAFE(state_names))); 579773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 580773979f92560dd1aead375c82fd75b584a141e5dJohn Reck const Word& first_word = words->at(state_first_word); 581773979f92560dd1aead375c82fd75b584a141e5dJohn Reck int length = first_word.end - first_word.begin; 582773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (length < 2 || !IsAsciiAlpha(*first_word.begin)) 583773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 584773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 585773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // No state names start with x, y, z. 
586773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 first_letter = base::ToLowerASCII(*first_word.begin); 587773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (first_letter > 'w') 588773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 589773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 590773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK(first_letter >= 'a'); 591773979f92560dd1aead375c82fd75b584a141e5dJohn Reck int first_index = first_letter - 'a'; 592773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 593773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Look for two-letter state names. 594773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) { 595773979f92560dd1aead375c82fd75b584a141e5dJohn Reck char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1)); 596773979f92560dd1aead375c82fd75b584a141e5dJohn Reck DCHECK(second_letter >= 'a'); 597773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 598773979f92560dd1aead375c82fd75b584a141e5dJohn Reck int second_index = second_letter - 'a'; 599773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!(state_two_letter_suffix[first_index] & (1 << second_index))) 600773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 601773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 602773979f92560dd1aead375c82fd75b584a141e5dJohn Reck std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] & 603773979f92560dd1aead375c82fd75b584a141e5dJohn Reck ((1 << second_index) - 1); 604773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *state_last_word = state_first_word; 605773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *state_index = state_two_letter_accumulative[first_index] + 606773979f92560dd1aead375c82fd75b584a141e5dJohn Reck previous_suffixes.count(); 607773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return true; 608773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 609773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 
610773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Look for full state names by their first letter. Discard by length. 611773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (int state = state_names_accumulative[first_index]; 612773979f92560dd1aead375c82fd75b584a141e5dJohn Reck state < state_names_accumulative[first_index + 1]; ++state) { 613773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (state_names[state].first_word_length != length) 614773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 615773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 616773979f92560dd1aead375c82fd75b584a141e5dJohn Reck bool state_match = false; 617773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t state_word = state_first_word; 618773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (int pos = 0; true; ) { 619773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!WordLowerCaseEqualsASCII(words->at(state_word).begin, 620773979f92560dd1aead375c82fd75b584a141e5dJohn Reck words->at(state_word).end, &state_names[state].string[pos])) 621773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 622773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 623773979f92560dd1aead375c82fd75b584a141e5dJohn Reck pos += words->at(state_word).end - words->at(state_word).begin + 1; 624773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (pos >= state_names[state].length) { 625773979f92560dd1aead375c82fd75b584a141e5dJohn Reck state_match = true; 626773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 627773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 628773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 629773979f92560dd1aead375c82fd75b584a141e5dJohn Reck // Ran out of words, extract more from the tokenizer. 
630773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (++state_word == words->size()) { 631773979f92560dd1aead375c82fd75b584a141e5dJohn Reck do { 632773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (!tokenizer->GetNext()) 633773979f92560dd1aead375c82fd75b584a141e5dJohn Reck break; 634773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } while (tokenizer->token_is_delim()); 635773979f92560dd1aead375c82fd75b584a141e5dJohn Reck words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end())); 636773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 637773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 638773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 639773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (state_match) { 640773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *state_last_word = state_word; 641773979f92560dd1aead375c82fd75b584a141e5dJohn Reck *state_index = state_names[state].state_index; 642773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return true; 643773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 644773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 645773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 646773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 647773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 648773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 649773979f92560dd1aead375c82fd75b584a141e5dJohn Reckbool AddressDetector::IsZipValid(const Word& word, size_t state_index) { 650773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t length = word.end - word.begin; 651773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (length != kZipDigits && length != kZipPlus4Digits + 1) 652773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 653773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 654773979f92560dd1aead375c82fd75b584a141e5dJohn Reck for (string16::const_iterator it = word.begin; it != word.end; ++it) { 655773979f92560dd1aead375c82fd75b584a141e5dJohn Reck size_t pos = it - word.begin; 
656773979f92560dd1aead375c82fd75b584a141e5dJohn Reck if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits)) 657773979f92560dd1aead375c82fd75b584a141e5dJohn Reck continue; 658773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return false; 659773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 660773979f92560dd1aead375c82fd75b584a141e5dJohn Reck return IsZipValidForState(word, state_index); 661773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 662773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 6636bf2577653884795f04bbf9d8196ed9998896afeJohn Reckbool AddressDetector::IsZipValidForState(const Word& word, size_t state_index) 6646bf2577653884795f04bbf9d8196ed9998896afeJohn Reck{ 6656bf2577653884795f04bbf9d8196ed9998896afeJohn Reck enum USState { 6666bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AP = -4, // AP (military base in the Pacific) 6676bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AA = -3, // AA (military base inside the US) 6686bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AE = -2, // AE (military base outside the US) 6696bf2577653884795f04bbf9d8196ed9998896afeJohn Reck XX = -1, // (not in use) 6706bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AK = 0, // AK Alaska 6716bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AL = 1, // AL Alabama 6726bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AR = 2, // AR Arkansas 6736bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AS = 3, // AS American Samoa 6746bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AZ = 4, // AZ Arizona 6756bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA = 5, // CA California 6766bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CO = 6, // CO Colorado 6776bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CT = 7, // CT Connecticut 6786bf2577653884795f04bbf9d8196ed9998896afeJohn Reck DC = 8, // DC District of Columbia 6796bf2577653884795f04bbf9d8196ed9998896afeJohn Reck DE = 9, // DE Delaware 6806bf2577653884795f04bbf9d8196ed9998896afeJohn Reck FL = 10, // FL Florida 
6816bf2577653884795f04bbf9d8196ed9998896afeJohn Reck FM = 11, // FM Federated States of Micronesia 6826bf2577653884795f04bbf9d8196ed9998896afeJohn Reck GA = 12, // GA Georgia 6836bf2577653884795f04bbf9d8196ed9998896afeJohn Reck GU = 13, // GU Guam 6846bf2577653884795f04bbf9d8196ed9998896afeJohn Reck HI = 14, // HI Hawaii 6856bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IA = 15, // IA Iowa 6866bf2577653884795f04bbf9d8196ed9998896afeJohn Reck ID = 16, // ID Idaho 6876bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IL = 17, // IL Illinois 6886bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IN = 18, // IN Indiana 6896bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KS = 19, // KS Kansas 6906bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KY = 20, // KY Kentucky 6916bf2577653884795f04bbf9d8196ed9998896afeJohn Reck LA = 21, // LA Louisiana 6926bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MA = 22, // MA Massachusetts 6936bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MD = 23, // MD Maryland 6946bf2577653884795f04bbf9d8196ed9998896afeJohn Reck ME = 24, // ME Maine 6956bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MH = 25, // MH Marshall Islands 6966bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MI = 26, // MI Michigan 6976bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MN = 27, // MN Minnesota 6986bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MO = 28, // MO Missouri 6996bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MP = 29, // MP Northern Mariana Islands 7006bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MS = 30, // MS Mississippi 7016bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MT = 31, // MT Montana 7026bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NC = 32, // NC North Carolina 7036bf2577653884795f04bbf9d8196ed9998896afeJohn Reck ND = 33, // ND North Dakota 7046bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NE = 34, // NE Nebraska 7056bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NH = 35, // NH New Hampshire 
7066bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NJ = 36, // NJ New Jersey 7076bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NM = 37, // NM New Mexico 7086bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NV = 38, // NV Nevada 7096bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NY = 39, // NY New York 7106bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OH = 40, // OH Ohio 7116bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OK = 41, // OK Oklahoma 7126bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OR = 42, // OR Oregon 7136bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PA = 43, // PA Pennsylvania 7146bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PR = 44, // PR Puerto Rico 7156bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PW = 45, // PW Palau 7166bf2577653884795f04bbf9d8196ed9998896afeJohn Reck RI = 46, // RI Rhode Island 7176bf2577653884795f04bbf9d8196ed9998896afeJohn Reck SC = 47, // SC South Carolina 7186bf2577653884795f04bbf9d8196ed9998896afeJohn Reck SD = 48, // SD South Dakota 7196bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TN = 49, // TN Tennessee 7206bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TX = 50, // TX Texas 7216bf2577653884795f04bbf9d8196ed9998896afeJohn Reck UT = 51, // UT Utah 7226bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VA = 52, // VA Virginia 7236bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VI = 53, // VI Virgin Islands 7246bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VT = 54, // VT Vermont 7256bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WA = 55, // WA Washington 7266bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WI = 56, // WI Wisconsin 7276bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WV = 57, // WV West Virginia 7286bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WY = 58, // WY Wyoming 7296bf2577653884795f04bbf9d8196ed9998896afeJohn Reck }; 7306bf2577653884795f04bbf9d8196ed9998896afeJohn Reck 7316bf2577653884795f04bbf9d8196ed9998896afeJohn Reck static const USState stateForZipPrefix[] = { 
7326bf2577653884795f04bbf9d8196ed9998896afeJohn Reck // 0 1 2 3 4 5 6 7 8 9 7336bf2577653884795f04bbf9d8196ed9998896afeJohn Reck XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009 7346bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019 7356bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029 7366bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039 7376bf2577653884795f04bbf9d8196ed9998896afeJohn Reck ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049 7386bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059 7396bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069 7406bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079 7416bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089 7426bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099 7436bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109 7446bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119 7456bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129 7466bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139 7476bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149 7486bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159 7496bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169 7506bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179 7516bf2577653884795f04bbf9d8196ed9998896afeJohn 
Reck PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189 7526bf2577653884795f04bbf9d8196ed9998896afeJohn Reck PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199 7536bf2577653884795f04bbf9d8196ed9998896afeJohn Reck DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209 7546bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219 7556bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229 7566bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239 7576bf2577653884795f04bbf9d8196ed9998896afeJohn Reck VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249 7586bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259 7596bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269 7606bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279 7616bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289 7626bf2577653884795f04bbf9d8196ed9998896afeJohn Reck SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299 7636bf2577653884795f04bbf9d8196ed9998896afeJohn Reck GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309 7646bf2577653884795f04bbf9d8196ed9998896afeJohn Reck GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319 7656bf2577653884795f04bbf9d8196ed9998896afeJohn Reck FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329 7666bf2577653884795f04bbf9d8196ed9998896afeJohn Reck FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339 7676bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349 7686bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359 7696bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369 7706bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TN, TN, TN, TN, 
TN, TN, TN, TN, TN, TN, // 370-379 7716bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389 7726bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399 7736bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409 7746bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419 7756bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429 7766bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439 7776bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449 7786bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459 7796bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469 7806bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479 7816bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489 7826bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499 7836bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509 7846bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519 7856bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529 7866bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539 7876bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549 7886bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559 7896bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, 
// 560-569 7906bf2577653884795f04bbf9d8196ed9998896afeJohn Reck SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579 7916bf2577653884795f04bbf9d8196ed9998896afeJohn Reck ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589 7926bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599 7936bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609 7946bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619 7956bf2577653884795f04bbf9d8196ed9998896afeJohn Reck IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629 7966bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639 7976bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649 7986bf2577653884795f04bbf9d8196ed9998896afeJohn Reck MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659 7996bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669 8006bf2577653884795f04bbf9d8196ed9998896afeJohn Reck KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679 8016bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689 8026bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699 8036bf2577653884795f04bbf9d8196ed9998896afeJohn Reck LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709 8046bf2577653884795f04bbf9d8196ed9998896afeJohn Reck LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719 8056bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729 8066bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739 8076bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749 8086bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759 
8096bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769 8106bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779 8116bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789 8126bf2577653884795f04bbf9d8196ed9998896afeJohn Reck TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799 8136bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809 8146bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819 8156bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829 8166bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839 8176bf2577653884795f04bbf9d8196ed9998896afeJohn Reck UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849 8186bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859 8196bf2577653884795f04bbf9d8196ed9998896afeJohn Reck AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869 8206bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879 8216bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889 8226bf2577653884795f04bbf9d8196ed9998896afeJohn Reck NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899 8236bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909 8246bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919 8256bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929 8266bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939 8276bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949 
8286bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959 8296bf2577653884795f04bbf9d8196ed9998896afeJohn Reck CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969 8306bf2577653884795f04bbf9d8196ed9998896afeJohn Reck OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979 8316bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989 8326bf2577653884795f04bbf9d8196ed9998896afeJohn Reck WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999 8336bf2577653884795f04bbf9d8196ed9998896afeJohn Reck }; 8346bf2577653884795f04bbf9d8196ed9998896afeJohn Reck 8356bf2577653884795f04bbf9d8196ed9998896afeJohn Reck if (!word.begin || !word.end || (word.end - word.begin) < 3) 8366bf2577653884795f04bbf9d8196ed9998896afeJohn Reck return false; 8376bf2577653884795f04bbf9d8196ed9998896afeJohn Reck const char16* zipPtr = word.begin; 8386bf2577653884795f04bbf9d8196ed9998896afeJohn Reck if (zipPtr[0] < '0' || zipPtr[0] > '9' || 8396bf2577653884795f04bbf9d8196ed9998896afeJohn Reck zipPtr[1] < '0' || zipPtr[1] > '9' || 8406bf2577653884795f04bbf9d8196ed9998896afeJohn Reck zipPtr[2] < '0' || zipPtr[2] > '9') 8416bf2577653884795f04bbf9d8196ed9998896afeJohn Reck return false; 8426bf2577653884795f04bbf9d8196ed9998896afeJohn Reck 8436bf2577653884795f04bbf9d8196ed9998896afeJohn Reck int zip = zipPtr[0] - '0'; 8446bf2577653884795f04bbf9d8196ed9998896afeJohn Reck zip *= 10; 8456bf2577653884795f04bbf9d8196ed9998896afeJohn Reck zip += zipPtr[1] - '0'; 8466bf2577653884795f04bbf9d8196ed9998896afeJohn Reck zip *= 10; 8476bf2577653884795f04bbf9d8196ed9998896afeJohn Reck zip += zipPtr[2] - '0'; 8486bf2577653884795f04bbf9d8196ed9998896afeJohn Reck return stateForZipPrefix[zip] == (int) state_index; 849773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 850773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 8516bf2577653884795f04bbf9d8196ed9998896afeJohn Reckstatic const char* s_rawStreetSuffixes[] = { 
8526bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "allee", "alley", "ally", "aly", 8536bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu", 8546bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend", 8556bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd", 8566bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch", 8576bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks", 8586bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu", 8596bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent", 8606bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "center", "centers", "centr", "centre", "cir", "circ", "circl", 8616bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff", 8626bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common", 8636bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "cor", "corner", "corners", "cors", "course", "court", "courts", "cove", 8646bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres", 8656bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "crescent", "cresent", "crest", "crk", "crossing", "crossroad", 8666bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt", 8676bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn", 8686bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "dale", "dam", 
"div", "divide", "dl", "dm", "dr", "driv", "drive", 8696bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests", 8706bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "exp", "expr", "express", "expressway", "expw", "expy", "ext", 8716bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls", 8726bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls", 8736bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge", 8746bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy", 8756bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy", 8766bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy", 8776bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden", 8786bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove", 8796bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors", 8806bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts", 8816bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw", 8826bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts", 8836bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands", 8846bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn", 
8856bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key", 8866bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la", 8876bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks", 8886bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks", 8896bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop", 8906bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows", 8916bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls", 8926bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount", 8936bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn", 8946bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd", 8956bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways", 8966bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine", 8976bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain", 8986bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne", 8996bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie", 9006bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial", 
9016bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd", 902bf0d5c6dc816bca8f00bde31ddda7ba41e740ccdJohn Reck "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river", 9036bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds", 9046bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals", 9056bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway", 9066bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng", 9076bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ", 9086bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "square", "squares", "st", "sta", "station", "statn", "stn", "str", 9096bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream", 9106bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "street", "streets", "streme", "strm", "strt", "strvn", "strvnue", 9116bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace", 9126bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks", 9136bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks", 9146bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls", 9156bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass", 9166bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct", 
9176bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "via", "viadct", "viaduct", "view", "views", "vill", "villag", 9186bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "village", "villages", "ville", "villg", "villiage", "vis", "vist", 9196bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta", 9206bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells", 9216bf2577653884795f04bbf9d8196ed9998896afeJohn Reck "wl", "wls", "wy", "xing", "xrd", 9226bf2577653884795f04bbf9d8196ed9998896afeJohn Reck 0, 9236bf2577653884795f04bbf9d8196ed9998896afeJohn Reck}; 924773979f92560dd1aead375c82fd75b584a141e5dJohn Reck 9256bf2577653884795f04bbf9d8196ed9998896afeJohn Reckbool AddressDetector::IsValidLocationName(const Word& word) { 9266bf2577653884795f04bbf9d8196ed9998896afeJohn Reck using namespace WTF; 9276bf2577653884795f04bbf9d8196ed9998896afeJohn Reck static HashSet<String> streetNames; 9286bf2577653884795f04bbf9d8196ed9998896afeJohn Reck if (!streetNames.size()) { 9296bf2577653884795f04bbf9d8196ed9998896afeJohn Reck const char** suffixes = s_rawStreetSuffixes; 9306bf2577653884795f04bbf9d8196ed9998896afeJohn Reck while (const char* suffix = *suffixes) { 9316bf2577653884795f04bbf9d8196ed9998896afeJohn Reck int index = suffix[0] - 'a'; 9326bf2577653884795f04bbf9d8196ed9998896afeJohn Reck streetNames.add(suffix); 9336bf2577653884795f04bbf9d8196ed9998896afeJohn Reck suffixes++; 9346bf2577653884795f04bbf9d8196ed9998896afeJohn Reck } 935773979f92560dd1aead375c82fd75b584a141e5dJohn Reck } 9366bf2577653884795f04bbf9d8196ed9998896afeJohn Reck char16 first_letter = base::ToLowerASCII(*word.begin); 9376bf2577653884795f04bbf9d8196ed9998896afeJohn Reck if (first_letter > 'z' || first_letter < 'a') 9386bf2577653884795f04bbf9d8196ed9998896afeJohn Reck return false; 9396bf2577653884795f04bbf9d8196ed9998896afeJohn Reck int index = first_letter - 'a'; 
9406bf2577653884795f04bbf9d8196ed9998896afeJohn Reck int length = std::distance(word.begin, word.end); 9416bf2577653884795f04bbf9d8196ed9998896afeJohn Reck if (*word.end == '.') 9426bf2577653884795f04bbf9d8196ed9998896afeJohn Reck length--; 9436bf2577653884795f04bbf9d8196ed9998896afeJohn Reck String value(word.begin, length); 9446bf2577653884795f04bbf9d8196ed9998896afeJohn Reck return streetNames.contains(value.lower()); 945773979f92560dd1aead375c82fd75b584a141e5dJohn Reck} 946