address_detector.cpp revision bf0d5c6dc816bca8f00bde31ddda7ba41e740ccd
1/* 2 * Copyright (C) 2012 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32 33// Magic pretend-to-be-a-chromium-build flags 34#undef WEBKIT_IMPLEMENTATION 35#undef LOG 36 37#include "content/address_detector.h" 38 39#include <bitset> 40 41#include "base/utf_string_conversions.h" 42#include "net/base/escape.h" 43#include "WebString.h" 44 45#include <wtf/HashSet.h> 46#include <wtf/Noncopyable.h> 47#include <wtf/text/StringHash.h> 48#include <wtf/text/WTFString.h> 49 50namespace { 51 52// Prefix used for geographical address intent URIs. 53static const char kAddressSchemaPrefix[] = "geo:0,0?q="; 54 55// Maximum text length to be searched for address detection. 56static const size_t kMaxAddressLength = 500; 57 58// Minimum number of words in an address after the house number 59// before a state is expected to be found. 60// A value too high can miss short addresses. 61const size_t kMinAddressWords = 3; 62 63// Maximum number of words allowed in an address between the house number 64// and the state, both not included. 65const size_t kMaxAddressWords = 12; 66 67// Maximum number of lines allowed in an address between the house number 68// and the state, both not included. 69const size_t kMaxAddressLines = 5; 70 71// Maximum length allowed for any address word between the house number 72// and the state, both not included. 73const size_t kMaxAddressNameWordLength = 25; 74 75// Maximum number of words after the house number in which the location name 76// should be found. 77const size_t kMaxLocationNameDistance = 4; 78 79// Number of digits for a valid zip code. 80const size_t kZipDigits = 5; 81 82// Number of digits for a valid zip code in the Zip Plus 4 format. 83const size_t kZipPlus4Digits = 9; 84 85// Maximum number of digits of a house number, including possible hyphens. 86const size_t kMaxHouseDigits = 5; 87 88// Additional characters used as new line delimiters. 89const char16 kNewlineDelimiters[] = { 90 ',', 91 '*', 92 0x2022, // Unicode bullet 93}; 94 95char16 SafePreviousChar(const string16::const_iterator& it, 96 const string16::const_iterator& begin) { 97 if (it == begin) 98 return ' '; 99 return *(it - 1); 100} 101 102char16 SafeNextChar(const string16::const_iterator& it, 103 const string16::const_iterator& end) { 104 if (it == end) 105 return ' '; 106 return *(it + 1); 107} 108 109bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin, 110 string16::const_iterator word_end, const char* ascii_to_match) { 111 for (string16::const_iterator it = word_begin; it != word_end; 112 ++it, ++ascii_to_match) { 113 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 114 return false; 115 } 116 return *ascii_to_match == 0 || *ascii_to_match == ' '; 117} 118 119bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin, 120 string16::const_iterator word_end, const char* ascii_to_match, 121 bool allow_plural) { 122 for (string16::const_iterator it = word_begin; it != word_end; 123 ++it, ++ascii_to_match) { 124 if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end) 125 return true; 126 127 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 128 return false; 129 } 130 return *ascii_to_match == 0; 131} 132 133} // anonymous namespace 134 135 136AddressDetector::AddressDetector() { 137} 138 139AddressDetector::~AddressDetector() { 140} 141 142std::string AddressDetector::GetContentText(const WebKit::WebRange& range) { 143 // Get the address and replace unicode bullets with commas. 144 string16 address_16 = CollapseWhitespace(range.toPlainText(), false); 145 std::replace(address_16.begin(), address_16.end(), 146 static_cast<char16>(0x2022), static_cast<char16>(',')); 147 return UTF16ToUTF8(address_16); 148} 149 150GURL AddressDetector::GetIntentURL(const std::string& content_text) { 151 return GURL(kAddressSchemaPrefix + 152 EscapeQueryParamValue(content_text, true)); 153} 154 155size_t AddressDetector::GetMaximumContentLength() { 156 return kMaxAddressLength; 157} 158 159bool AddressDetector::FindContent(const string16::const_iterator& begin, 160 const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) { 161 HouseNumberParser house_number_parser; 162 163 // Keep going through the input string until a potential house number is 164 // detected. Start tokenizing the following words to find a valid 165 // street name within a word range. Then, find a state name followed 166 // by a valid zip code for that state. Also keep a look for any other 167 // possible house numbers to continue from in case of no match and for 168 // state names not followed by a zip code (e.g. New York, NY 10000). 169 const string16 newline_delimiters = kNewlineDelimiters; 170 const string16 delimiters = kWhitespaceUTF16 + newline_delimiters; 171 for (string16::const_iterator it = begin; it != end; ) { 172 Word house_number; 173 if (!house_number_parser.Parse(it, end, &house_number)) 174 return false; 175 176 String16Tokenizer tokenizer(house_number.end, end, delimiters); 177 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); 178 179 std::vector<Word> words; 180 words.push_back(house_number); 181 182 bool found_location_name = false; 183 bool continue_on_house_number = true; 184 size_t next_house_number_word = 0; 185 size_t num_lines = 1; 186 187 // Don't include the house number in the word count. 188 size_t next_word = 1; 189 for (; next_word <= kMaxAddressWords + 1; ++next_word) { 190 191 // Extract a new word from the tokenizer. 192 if (next_word == words.size()) { 193 do { 194 if (!tokenizer.GetNext()) 195 return false; 196 197 // Check the number of address lines. 198 if (tokenizer.token_is_delim() && newline_delimiters.find( 199 *tokenizer.token_begin()) != string16::npos) { 200 ++num_lines; 201 } 202 } while (tokenizer.token_is_delim()); 203 204 if (num_lines > kMaxAddressLines) 205 break; 206 207 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); 208 } 209 210 // Check the word length. If too long, don't try to continue from 211 // the next house number as no address can hold this word. 212 const Word& current_word = words[next_word]; 213 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); 214 size_t current_word_length = std::distance( 215 current_word.begin, current_word.end); 216 if (current_word_length > kMaxAddressNameWordLength) { 217 continue_on_house_number = false; 218 break; 219 } 220 221 // Check if the new word is a valid house number. 222 // This is used to properly resume parsing in case the maximum number 223 // of words is exceeded. 224 if (next_house_number_word == 0 && 225 house_number_parser.Parse(current_word.begin, current_word.end, NULL)) { 226 next_house_number_word = next_word; 227 continue; 228 } 229 230 // Look for location names in the words after the house number. 231 // A range limitation is introduced to avoid matching 232 // anything that starts with a number before a legitimate address. 233 if (next_word <= kMaxLocationNameDistance && 234 IsValidLocationName(current_word)) { 235 found_location_name = true; 236 continue; 237 } 238 239 // Don't count the house number. 240 if (next_word > kMinAddressWords) { 241 // Looking for the state is likely to add new words to the list while 242 // checking for multi-word state names. 243 size_t state_first_word = next_word; 244 size_t state_last_word, state_index; 245 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, 246 &tokenizer, &state_index)) { 247 248 // A location name should have been found at this point. 249 if (!found_location_name) 250 break; 251 252 // Explicitly exclude "et al", as "al" is a valid state code. 253 if (current_word_length == 2 && words.size() > 2) { 254 const Word& previous_word = words[state_first_word - 1]; 255 if (previous_word.end - previous_word.begin == 2 && 256 LowerCaseEqualsASCII(previous_word.begin, previous_word.end, 257 "et") && 258 LowerCaseEqualsASCII(current_word.begin, current_word.end, 259 "al")) 260 break; 261 } 262 263 // Extract one more word from the tokenizer if not already available. 264 size_t zip_word = state_last_word + 1; 265 if (zip_word == words.size()) { 266 do { 267 if (!tokenizer.GetNext()) { 268 // Zip is optional 269 *start_pos = words[0].begin - begin; 270 *end_pos = words[state_last_word].end - begin; 271 return true; 272 } 273 } while (tokenizer.token_is_delim()); 274 words.push_back(Word(tokenizer.token_begin(), 275 tokenizer.token_end())); 276 } 277 278 // Check the parsing validity and state range of the zip code. 279 next_word = state_last_word; 280 if (!IsZipValid(words[zip_word], state_index)) 281 continue; 282 283 *start_pos = words[0].begin - begin; 284 *end_pos = words[zip_word].end - begin; 285 return true; 286 } 287 } 288 } 289 290 // Avoid skipping too many words because of a non-address number 291 // at the beginning of the contents to parse. 292 if (continue_on_house_number && next_house_number_word > 0) { 293 it = words[next_house_number_word].begin; 294 } else { 295 DCHECK(!words.empty()); 296 next_word = std::min(next_word, words.size() - 1); 297 it = words[next_word].end; 298 } 299 } 300 301 return false; 302} 303 304bool AddressDetector::HouseNumberParser::IsPreDelimiter( 305 char16 character) { 306 return character == ':' || IsPostDelimiter(character); 307} 308 309bool AddressDetector::HouseNumberParser::IsPostDelimiter( 310 char16 character) { 311 return IsWhitespace(character) || strchr(",\"'", character); 312} 313 314void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() { 315 ResetState(); 316 for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {} 317} 318 319void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) { 320 size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)), 321 num_chars); 322 it_ += offset; 323 result_chars_ += offset; 324} 325 326void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) { 327 it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars); 328} 329 330void AddressDetector::HouseNumberParser::ResetState() { 331 num_digits_ = 0; 332 result_chars_ = 0; 333} 334 335bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const { 336 // There should always be a number after a hyphen. 337 if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-') 338 return false; 339 340 if (word) { 341 word->begin = it_ - result_chars_; 342 word->end = it_; 343 } 344 return true; 345} 346 347bool AddressDetector::HouseNumberParser::Parse( 348 const string16::const_iterator& begin, 349 const string16::const_iterator& end, Word* word) { 350 it_ = begin_ = begin; 351 end_ = end; 352 ResetState(); 353 354 // Iterations only used as a fail-safe against any buggy infinite loops. 355 size_t iterations = 0; 356 size_t max_iterations = end - begin + 1; 357 for (; it_ != end_ && iterations < max_iterations; ++iterations) { 358 359 // Word finished case. 360 if (IsPostDelimiter(*it_)) { 361 if (CheckFinished(word)) 362 return true; 363 else if (result_chars_) 364 ResetState(); 365 366 SkipChars(1); 367 continue; 368 } 369 370 // More digits. There should be no more after a letter was found. 371 if (IsAsciiDigit(*it_)) { 372 if (num_digits_ >= kMaxHouseDigits) { 373 RestartOnNextDelimiter(); 374 } else { 375 AcceptChars(1); 376 ++num_digits_; 377 } 378 continue; 379 } 380 381 if (IsAsciiAlpha(*it_)) { 382 // Handle special case 'one'. 383 if (result_chars_ == 0) { 384 if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one")) 385 AcceptChars(3); 386 else 387 RestartOnNextDelimiter(); 388 continue; 389 } 390 391 // There should be more than 1 character because of result_chars. 392 DCHECK_GT(result_chars_, 0U); 393 DCHECK_NE(it_, begin_); 394 char16 previous = SafePreviousChar(it_, begin_); 395 if (IsAsciiDigit(previous)) { 396 // Check cases like '12A'. 397 char16 next = SafeNextChar(it_, end_); 398 if (IsPostDelimiter(next)) { 399 AcceptChars(1); 400 continue; 401 } 402 403 // Handle cases like 12a, 1st, 2nd, 3rd, 7th. 404 if (IsAsciiAlpha(next)) { 405 char16 last_digit = previous; 406 char16 first_letter = base::ToLowerASCII(*it_); 407 char16 second_letter = base::ToLowerASCII(next); 408 bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' && 409 num_digits_ == 2; 410 411 switch (last_digit - '0') { 412 case 1: 413 if ((first_letter == 's' && second_letter == 't') || 414 (first_letter == 't' && second_letter == 'h' && is_teen)) { 415 AcceptChars(2); 416 continue; 417 } 418 break; 419 420 case 2: 421 if ((first_letter == 'n' && second_letter == 'd') || 422 (first_letter == 't' && second_letter == 'h' && is_teen)) { 423 AcceptChars(2); 424 continue; 425 } 426 break; 427 428 case 3: 429 if ((first_letter == 'r' && second_letter == 'd') || 430 (first_letter == 't' && second_letter == 'h' && is_teen)) { 431 AcceptChars(2); 432 continue; 433 } 434 break; 435 436 case 0: 437 // Explicitly exclude '0th'. 438 if (num_digits_ == 1) 439 break; 440 441 case 4: 442 case 5: 443 case 6: 444 case 7: 445 case 8: 446 case 9: 447 if (first_letter == 't' && second_letter == 'h') { 448 AcceptChars(2); 449 continue; 450 } 451 break; 452 453 default: 454 NOTREACHED(); 455 } 456 } 457 } 458 459 RestartOnNextDelimiter(); 460 continue; 461 } 462 463 if (*it_ == '-' && num_digits_ > 0) { 464 AcceptChars(1); 465 ++num_digits_; 466 continue; 467 } 468 469 RestartOnNextDelimiter(); 470 SkipChars(1); 471 } 472 473 if (iterations >= max_iterations) 474 return false; 475 476 return CheckFinished(word); 477} 478 479bool AddressDetector::FindStateStartingInWord(WordList* words, 480 size_t state_first_word, size_t* state_last_word, 481 String16Tokenizer* tokenizer, size_t* state_index) { 482 483 // Bitmasks containing the allowed suffixes for 2-letter state codes. 484 static const int state_two_letter_suffix[23] = { 485 0x02060c00, // A followed by: [KLRSZ]. 486 0x00000000, // B. 487 0x00084001, // C followed by: [AOT]. 488 0x00000014, // D followed by: [CE]. 489 0x00000000, // E. 490 0x00001800, // F followed by: [LM]. 491 0x00100001, // G followed by: [AU]. 492 0x00000100, // H followed by: [I]. 493 0x00002809, // I followed by: [ADLN]. 494 0x00000000, // J. 495 0x01040000, // K followed by: [SY]. 496 0x00000001, // L followed by: [A]. 497 0x000ce199, // M followed by: [ADEHINOPST]. 498 0x0120129c, // N followed by: [CDEHJMVY]. 499 0x00020480, // O followed by: [HKR]. 500 0x00420001, // P followed by: [ARW]. 501 0x00000000, // Q. 502 0x00000100, // R followed by: [I]. 503 0x0000000c, // S followed by: [CD]. 504 0x00802000, // T followed by: [NX]. 505 0x00080000, // U followed by: [T]. 506 0x00080101, // V followed by: [AIT]. 507 0x01200101 // W followed by: [AIVY]. 508 }; 509 510 // Accumulative number of states for the 2-letter code indexed by the first. 511 static const int state_two_letter_accumulative[24] = { 512 0, 5, 5, 8, 10, 10, 12, 14, 513 15, 19, 19, 21, 22, 32, 40, 43, 514 46, 46, 47, 49, 51, 52, 55, 59 515 }; 516 517 // State names sorted alphabetically with their lengths. 518 // There can be more than one possible name for a same state if desired. 519 static const struct StateNameInfo { 520 const char* string; 521 char first_word_length; 522 char length; 523 char state_index; // Relative to two-character code alphabetical order. 524 } state_names[59] = { 525 { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 }, 526 { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 }, 527 { "arkansas", 8, 8, 2 }, 528 { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 }, 529 { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 }, 530 { "district of columbia", 8, 20, 8 }, 531 { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 }, 532 { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 }, 533 { "hawaii", 6, 6, 14 }, 534 { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 }, 535 { "iowa", 4, 4, 15 }, 536 { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 }, 537 { "louisiana", 9, 9, 21 }, 538 { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 }, 539 { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 }, 540 { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 }, 541 { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 }, 542 { "montana", 7, 7, 31 }, 543 { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 }, 544 { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 }, 545 { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 }, 546 { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 }, 547 { "northern mariana islands", 8, 24, 29 }, 548 { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 }, 549 { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 }, 550 { "puerto rico", 6, 11, 44 }, 551 { "rhode island", 5, 5, 46 }, 552 { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 }, 553 { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 }, 554 { "utah", 4, 4, 51 }, 555 { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 }, 556 { "virginia", 8, 8, 52 }, 557 { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 }, 558 { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 } 559 }; 560 561 // Accumulative number of states for sorted names indexed by the first letter. 562 // Required a different one since there are codes that don't share their 563 // first letter with the name of their state (MP = Northern Mariana Islands). 564 static const int state_names_accumulative[24] = { 565 0, 5, 5, 8, 10, 10, 12, 14, 566 15, 19, 19, 21, 22, 31, 40, 43, 567 46, 46, 47, 49, 51, 52, 55, 59 568 }; 569 570 DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1], 571 static_cast<int>(ARRAYSIZE_UNSAFE(state_names))); 572 573 const Word& first_word = words->at(state_first_word); 574 int length = first_word.end - first_word.begin; 575 if (length < 2 || !IsAsciiAlpha(*first_word.begin)) 576 return false; 577 578 // No state names start with x, y, z. 579 char16 first_letter = base::ToLowerASCII(*first_word.begin); 580 if (first_letter > 'w') 581 return false; 582 583 DCHECK(first_letter >= 'a'); 584 int first_index = first_letter - 'a'; 585 586 // Look for two-letter state names. 587 if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) { 588 char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1)); 589 DCHECK(second_letter >= 'a'); 590 591 int second_index = second_letter - 'a'; 592 if (!(state_two_letter_suffix[first_index] & (1 << second_index))) 593 return false; 594 595 std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] & 596 ((1 << second_index) - 1); 597 *state_last_word = state_first_word; 598 *state_index = state_two_letter_accumulative[first_index] + 599 previous_suffixes.count(); 600 return true; 601 } 602 603 // Look for full state names by their first letter. Discard by length. 604 for (int state = state_names_accumulative[first_index]; 605 state < state_names_accumulative[first_index + 1]; ++state) { 606 if (state_names[state].first_word_length != length) 607 continue; 608 609 bool state_match = false; 610 size_t state_word = state_first_word; 611 for (int pos = 0; true; ) { 612 if (!WordLowerCaseEqualsASCII(words->at(state_word).begin, 613 words->at(state_word).end, &state_names[state].string[pos])) 614 break; 615 616 pos += words->at(state_word).end - words->at(state_word).begin + 1; 617 if (pos >= state_names[state].length) { 618 state_match = true; 619 break; 620 } 621 622 // Ran out of words, extract more from the tokenizer. 623 if (++state_word == words->size()) { 624 do { 625 if (!tokenizer->GetNext()) 626 break; 627 } while (tokenizer->token_is_delim()); 628 words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end())); 629 } 630 } 631 632 if (state_match) { 633 *state_last_word = state_word; 634 *state_index = state_names[state].state_index; 635 return true; 636 } 637 } 638 639 return false; 640} 641 642bool AddressDetector::IsZipValid(const Word& word, size_t state_index) { 643 size_t length = word.end - word.begin; 644 if (length != kZipDigits && length != kZipPlus4Digits + 1) 645 return false; 646 647 for (string16::const_iterator it = word.begin; it != word.end; ++it) { 648 size_t pos = it - word.begin; 649 if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits)) 650 continue; 651 return false; 652 } 653 return IsZipValidForState(word, state_index); 654} 655 656bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index) 657{ 658 enum USState { 659 AP = -4, // AP (military base in the Pacific) 660 AA = -3, // AA (military base inside the US) 661 AE = -2, // AE (military base outside the US) 662 XX = -1, // (not in use) 663 AK = 0, // AK Alaska 664 AL = 1, // AL Alabama 665 AR = 2, // AR Arkansas 666 AS = 3, // AS American Samoa 667 AZ = 4, // AZ Arizona 668 CA = 5, // CA California 669 CO = 6, // CO Colorado 670 CT = 7, // CT Connecticut 671 DC = 8, // DC District of Columbia 672 DE = 9, // DE Delaware 673 FL = 10, // FL Florida 674 FM = 11, // FM Federated States of Micronesia 675 GA = 12, // GA Georgia 676 GU = 13, // GU Guam 677 HI = 14, // HI Hawaii 678 IA = 15, // IA Iowa 679 ID = 16, // ID Idaho 680 IL = 17, // IL Illinois 681 IN = 18, // IN Indiana 682 KS = 19, // KS Kansas 683 KY = 20, // KY Kentucky 684 LA = 21, // LA Louisiana 685 MA = 22, // MA Massachusetts 686 MD = 23, // MD Maryland 687 ME = 24, // ME Maine 688 MH = 25, // MH Marshall Islands 689 MI = 26, // MI Michigan 690 MN = 27, // MN Minnesota 691 MO = 28, // MO Missouri 692 MP = 29, // MP Northern Mariana Islands 693 MS = 30, // MS Mississippi 694 MT = 31, // MT Montana 695 NC = 32, // NC North Carolina 696 ND = 33, // ND North Dakota 697 NE = 34, // NE Nebraska 698 NH = 35, // NH New Hampshire 699 NJ = 36, // NJ New Jersey 700 NM = 37, // NM New Mexico 701 NV = 38, // NV Nevada 702 NY = 39, // NY New York 703 OH = 40, // OH Ohio 704 OK = 41, // OK Oklahoma 705 OR = 42, // OR Oregon 706 PA = 43, // PA Pennsylvania 707 PR = 44, // PR Puerto Rico 708 PW = 45, // PW Palau 709 RI = 46, // RI Rhode Island 710 SC = 47, // SC South Carolina 711 SD = 48, // SD South Dakota 712 TN = 49, // TN Tennessee 713 TX = 50, // TX Texas 714 UT = 51, // UT Utah 715 VA = 52, // VA Virginia 716 VI = 53, // VI Virgin Islands 717 VT = 54, // VT Vermont 718 WA = 55, // WA Washington 719 WI = 56, // WI Wisconsin 720 WV = 57, // WV West Virginia 721 WY = 58, // WY Wyoming 722 }; 723 724 static const USState stateForZipPrefix[] = { 725 // 0 1 2 3 4 5 6 7 8 9 726 XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009 727 MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019 728 MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029 729 NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039 730 ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049 731 VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059 732 CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069 733 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079 734 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089 735 AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099 736 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109 737 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119 738 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129 739 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139 740 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149 741 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159 742 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169 743 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179 744 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189 745 PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199 746 DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209 747 MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219 748 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229 749 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239 750 VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249 751 WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259 752 WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269 753 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279 754 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289 755 SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299 756 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309 757 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319 758 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329 759 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339 760 AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349 761 AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359 762 AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369 763 TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379 764 TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389 765 MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399 766 KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409 767 KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419 768 KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429 769 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439 770 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449 771 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459 772 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469 773 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479 774 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489 775 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499 776 IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509 777 IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519 778 IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529 779 WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539 780 WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549 781 MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559 782 MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569 783 SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579 784 ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589 785 MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599 786 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609 787 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619 788 IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629 789 MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639 790 MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649 791 MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659 792 KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669 793 KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679 794 NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689 795 NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699 796 LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709 797 LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719 798 AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729 799 OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739 800 OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749 801 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759 802 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769 803 TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779 804 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789 805 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799 806 CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809 807 CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819 808 WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829 809 WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839 810 UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849 811 AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859 812 AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869 813 NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879 814 NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889 815 NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899 816 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909 817 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919 818 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929 819 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939 820 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949 821 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959 822 CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969 823 OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979 824 WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989 825 WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999 826 }; 827 828 if (!word.begin || !word.end || (word.end - word.begin) < 3) 829 return false; 830 const char16* zipPtr = word.begin; 831 if (zipPtr[0] < '0' || zipPtr[0] > '9' || 832 zipPtr[1] < '0' || zipPtr[1] > '9' || 833 zipPtr[2] < '0' || zipPtr[2] > '9') 834 return false; 835 836 int zip = zipPtr[0] - '0'; 837 zip *= 10; 838 zip += zipPtr[1] - '0'; 839 zip *= 10; 840 zip += zipPtr[2] - '0'; 841 return stateForZipPrefix[zip] == (int) state_index; 842} 843 844static const char* s_rawStreetSuffixes[] = { 845 "allee", "alley", "ally", "aly", 846 "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu", 847 "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend", 848 "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd", 849 "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch", 850 "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks", 851 "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu", 852 "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent", 853 "center", "centers", "centr", "centre", "cir", "circ", "circl", 854 "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff", 855 "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common", 856 "cor", "corner", "corners", "cors", "course", "court", "courts", "cove", 857 "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres", 858 "crescent", "cresent", "crest", "crk", "crossing", "crossroad", 859 "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt", 860 "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn", 861 "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive", 862 "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests", 863 "exp", "expr", "express", "expressway", "expw", "expy", "ext", 864 "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls", 865 "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls", 866 "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge", 867 "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy", 868 "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy", 869 "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy", 870 "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden", 871 "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove", 872 "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors", 873 "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts", 874 "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw", 875 "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts", 876 "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands", 877 "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn", 878 "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key", 879 "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la", 880 "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks", 881 "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks", 882 "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop", 883 "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows", 884 "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls", 885 "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount", 886 "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn", 887 "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd", 888 "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways", 889 "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine", 890 "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain", 891 "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne", 892 "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie", 893 "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial", 894 "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd", 895 "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river", 896 "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds", 897 "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals", 898 "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway", 899 "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng", 900 "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ", 901 "square", "squares", "st", "sta", "station", "statn", "stn", "str", 902 "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream", 903 "street", "streets", "streme", "strm", "strt", "strvn", "strvnue", 904 "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace", 905 "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks", 906 "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks", 907 "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls", 908 "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass", 909 "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct", 910 "via", "viadct", "viaduct", "view", "views", "vill", "villag", 911 "village", "villages", "ville", "villg", "villiage", "vis", "vist", 912 "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta", 913 "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells", 914 "wl", "wls", "wy", "xing", "xrd", 915 0, 916}; 917 918bool AddressDetector::IsValidLocationName(const Word& word) { 919 using namespace WTF; 920 static HashSet<String> streetNames; 921 if (!streetNames.size()) { 922 const char** suffixes = s_rawStreetSuffixes; 923 while (const char* suffix = *suffixes) { 924 int index = suffix[0] - 'a'; 925 streetNames.add(suffix); 926 suffixes++; 927 } 928 } 929 char16 first_letter = base::ToLowerASCII(*word.begin); 930 if (first_letter > 'z' || first_letter < 'a') 931 return false; 932 int index = first_letter - 'a'; 933 int length = std::distance(word.begin, word.end); 934 if (*word.end == '.') 935 length--; 936 String value(word.begin, length); 937 return streetNames.contains(value.lower()); 938} 939