address_detector.cpp revision 77974d2fe8f7fb9c421c8d4240e02e9d76ec2c27
1/* 2 * Copyright (C) 2012 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32 33// Magic pretend-to-be-a-chromium-build flags 34#undef WEBKIT_IMPLEMENTATION 35#undef LOG 36 37#include "content/address_detector.h" 38 39#include <bitset> 40 41#include "base/utf_string_conversions.h" 42#include "net/base/escape.h" 43#include "WebString.h" 44 45#include <wtf/HashSet.h> 46#include <wtf/Noncopyable.h> 47#include <wtf/text/StringHash.h> 48#include <wtf/text/WTFString.h> 49 50namespace { 51 52// Prefix used for geographical address intent URIs. 53static const char kAddressSchemaPrefix[] = "geo:0,0?q="; 54 55// Maximum text length to be searched for address detection. 56static const size_t kMaxAddressLength = 500; 57 58// Minimum number of words in an address after the house number 59// before a state is expected to be found. 60// A value too high can miss short addresses. 61const size_t kMinAddressWords = 3; 62 63// Maximum number of words allowed in an address between the house number 64// and the state, both not included. 65const size_t kMaxAddressWords = 12; 66 67// Maximum number of lines allowed in an address between the house number 68// and the state, both not included. 69const size_t kMaxAddressLines = 5; 70 71// Maximum length allowed for any address word between the house number 72// and the state, both not included. 73const size_t kMaxAddressNameWordLength = 25; 74 75// Maximum number of words after the house number in which the location name 76// should be found. 77const size_t kMaxLocationNameDistance = 4; 78 79// Number of digits for a valid zip code. 80const size_t kZipDigits = 5; 81 82// Number of digits for a valid zip code in the Zip Plus 4 format. 83const size_t kZipPlus4Digits = 9; 84 85// Maximum number of digits of a house number, including possible hyphens. 86const size_t kMaxHouseDigits = 5; 87 88// Additional characters used as new line delimiters. 89const char16 kNewlineDelimiters[] = { 90 ',', 91 '*', 92 0x2022, // Unicode bullet 93 0, 94}; 95 96char16 SafePreviousChar(const string16::const_iterator& it, 97 const string16::const_iterator& begin) { 98 if (it == begin) 99 return ' '; 100 return *(it - 1); 101} 102 103char16 SafeNextChar(const string16::const_iterator& it, 104 const string16::const_iterator& end) { 105 if (it == end) 106 return ' '; 107 return *(it + 1); 108} 109 110bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin, 111 string16::const_iterator word_end, const char* ascii_to_match) { 112 for (string16::const_iterator it = word_begin; it != word_end; 113 ++it, ++ascii_to_match) { 114 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 115 return false; 116 } 117 return *ascii_to_match == 0 || *ascii_to_match == ' '; 118} 119 120bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin, 121 string16::const_iterator word_end, const char* ascii_to_match, 122 bool allow_plural) { 123 for (string16::const_iterator it = word_begin; it != word_end; 124 ++it, ++ascii_to_match) { 125 if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end) 126 return true; 127 128 if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match) 129 return false; 130 } 131 return *ascii_to_match == 0; 132} 133 134} // anonymous namespace 135 136 137AddressDetector::AddressDetector() { 138} 139 140AddressDetector::~AddressDetector() { 141} 142 143std::string AddressDetector::GetContentText(const WebKit::WebRange& range) { 144 // Get the address and replace unicode bullets with commas. 145 string16 address_16 = CollapseWhitespace(range.toPlainText(), false); 146 std::replace(address_16.begin(), address_16.end(), 147 static_cast<char16>(0x2022), static_cast<char16>(',')); 148 return UTF16ToUTF8(address_16); 149} 150 151GURL AddressDetector::GetIntentURL(const std::string& content_text) { 152 return GURL(kAddressSchemaPrefix + 153 EscapeQueryParamValue(content_text, true)); 154} 155 156size_t AddressDetector::GetMaximumContentLength() { 157 return kMaxAddressLength; 158} 159 160bool AddressDetector::FindContent(const string16::const_iterator& begin, 161 const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) { 162 HouseNumberParser house_number_parser; 163 164 // Keep going through the input string until a potential house number is 165 // detected. Start tokenizing the following words to find a valid 166 // street name within a word range. Then, find a state name followed 167 // by a valid zip code for that state. Also keep a look for any other 168 // possible house numbers to continue from in case of no match and for 169 // state names not followed by a zip code (e.g. New York, NY 10000). 170 const string16 newline_delimiters = kNewlineDelimiters; 171 const string16 delimiters = kWhitespaceUTF16 + newline_delimiters; 172 for (string16::const_iterator it = begin; it != end; ) { 173 Word house_number; 174 if (!house_number_parser.Parse(it, end, &house_number)) 175 return false; 176 177 String16Tokenizer tokenizer(house_number.end, end, delimiters); 178 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); 179 180 std::vector<Word> words; 181 words.push_back(house_number); 182 183 bool found_location_name = false; 184 bool continue_on_house_number = true; 185 size_t next_house_number_word = 0; 186 size_t num_lines = 1; 187 188 // Don't include the house number in the word count. 189 size_t next_word = 1; 190 for (; next_word <= kMaxAddressWords + 1; ++next_word) { 191 192 // Extract a new word from the tokenizer. 193 if (next_word == words.size()) { 194 do { 195 if (!tokenizer.GetNext()) 196 return false; 197 198 // Check the number of address lines. 199 if (tokenizer.token_is_delim() && newline_delimiters.find( 200 *tokenizer.token_begin()) != string16::npos) { 201 ++num_lines; 202 } 203 } while (tokenizer.token_is_delim()); 204 205 if (num_lines > kMaxAddressLines) 206 break; 207 208 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); 209 } 210 211 // Check the word length. If too long, don't try to continue from 212 // the next house number as no address can hold this word. 213 const Word& current_word = words[next_word]; 214 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); 215 size_t current_word_length = std::distance( 216 current_word.begin, current_word.end); 217 if (current_word_length > kMaxAddressNameWordLength) { 218 continue_on_house_number = false; 219 break; 220 } 221 222 // Check if the new word is a valid house number. 223 // This is used to properly resume parsing in case the maximum number 224 // of words is exceeded. 225 if (next_house_number_word == 0 && 226 house_number_parser.Parse(current_word.begin, current_word.end, NULL)) { 227 next_house_number_word = next_word; 228 continue; 229 } 230 231 // Look for location names in the words after the house number. 232 // A range limitation is introduced to avoid matching 233 // anything that starts with a number before a legitimate address. 234 if (next_word <= kMaxLocationNameDistance && 235 IsValidLocationName(current_word)) { 236 found_location_name = true; 237 continue; 238 } 239 240 // Don't count the house number. 241 if (next_word > kMinAddressWords) { 242 // Looking for the state is likely to add new words to the list while 243 // checking for multi-word state names. 244 size_t state_first_word = next_word; 245 size_t state_last_word, state_index; 246 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, 247 &tokenizer, &state_index)) { 248 249 // A location name should have been found at this point. 250 if (!found_location_name) 251 break; 252 253 // Explicitly exclude "et al", as "al" is a valid state code. 254 if (current_word_length == 2 && words.size() > 2) { 255 const Word& previous_word = words[state_first_word - 1]; 256 if (previous_word.end - previous_word.begin == 2 && 257 LowerCaseEqualsASCII(previous_word.begin, previous_word.end, 258 "et") && 259 LowerCaseEqualsASCII(current_word.begin, current_word.end, 260 "al")) 261 break; 262 } 263 264 // Extract one more word from the tokenizer if not already available. 265 size_t zip_word = state_last_word + 1; 266 if (zip_word == words.size()) { 267 do { 268 if (!tokenizer.GetNext()) { 269 // Zip is optional 270 *start_pos = words[0].begin - begin; 271 *end_pos = words[state_last_word].end - begin; 272 return true; 273 } 274 } while (tokenizer.token_is_delim()); 275 words.push_back(Word(tokenizer.token_begin(), 276 tokenizer.token_end())); 277 } 278 279 // Check the parsing validity and state range of the zip code. 280 next_word = state_last_word; 281 if (!IsZipValid(words[zip_word], state_index)) 282 continue; 283 284 *start_pos = words[0].begin - begin; 285 *end_pos = words[zip_word].end - begin; 286 return true; 287 } 288 } 289 } 290 291 // Avoid skipping too many words because of a non-address number 292 // at the beginning of the contents to parse. 293 if (continue_on_house_number && next_house_number_word > 0) { 294 it = words[next_house_number_word].begin; 295 } else { 296 DCHECK(!words.empty()); 297 next_word = std::min(next_word, words.size() - 1); 298 it = words[next_word].end; 299 } 300 } 301 302 return false; 303} 304 305bool AddressDetector::HouseNumberParser::IsPreDelimiter( 306 char16 character) { 307 return character == ':' || IsPostDelimiter(character); 308} 309 310bool AddressDetector::HouseNumberParser::IsPostDelimiter( 311 char16 character) { 312 return IsWhitespace(character) || strchr(",\"'", character); 313} 314 315void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() { 316 ResetState(); 317 for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {} 318} 319 320void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) { 321 size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)), 322 num_chars); 323 it_ += offset; 324 result_chars_ += offset; 325} 326 327void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) { 328 it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars); 329} 330 331void AddressDetector::HouseNumberParser::ResetState() { 332 num_digits_ = 0; 333 result_chars_ = 0; 334} 335 336bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const { 337 // There should always be a number after a hyphen. 338 if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-') 339 return false; 340 341 if (word) { 342 word->begin = it_ - result_chars_; 343 word->end = it_; 344 } 345 return true; 346} 347 348bool AddressDetector::HouseNumberParser::Parse( 349 const string16::const_iterator& begin, 350 const string16::const_iterator& end, Word* word) { 351 it_ = begin_ = begin; 352 end_ = end; 353 ResetState(); 354 355 // Iterations only used as a fail-safe against any buggy infinite loops. 356 size_t iterations = 0; 357 size_t max_iterations = end - begin + 1; 358 for (; it_ != end_ && iterations < max_iterations; ++iterations) { 359 360 // Word finished case. 361 if (IsPostDelimiter(*it_)) { 362 if (CheckFinished(word)) 363 return true; 364 else if (result_chars_) 365 ResetState(); 366 367 SkipChars(1); 368 continue; 369 } 370 371 // More digits. There should be no more after a letter was found. 372 if (IsAsciiDigit(*it_)) { 373 if (num_digits_ >= kMaxHouseDigits) { 374 RestartOnNextDelimiter(); 375 } else { 376 AcceptChars(1); 377 ++num_digits_; 378 } 379 continue; 380 } 381 382 if (IsAsciiAlpha(*it_)) { 383 // Handle special case 'one'. 384 if (result_chars_ == 0) { 385 if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one")) 386 AcceptChars(3); 387 else 388 RestartOnNextDelimiter(); 389 continue; 390 } 391 392 // There should be more than 1 character because of result_chars. 393 DCHECK_GT(result_chars_, 0U); 394 DCHECK_NE(it_, begin_); 395 char16 previous = SafePreviousChar(it_, begin_); 396 if (IsAsciiDigit(previous)) { 397 // Check cases like '12A'. 398 char16 next = SafeNextChar(it_, end_); 399 if (IsPostDelimiter(next)) { 400 AcceptChars(1); 401 continue; 402 } 403 404 // Handle cases like 12a, 1st, 2nd, 3rd, 7th. 405 if (IsAsciiAlpha(next)) { 406 char16 last_digit = previous; 407 char16 first_letter = base::ToLowerASCII(*it_); 408 char16 second_letter = base::ToLowerASCII(next); 409 bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' && 410 num_digits_ == 2; 411 412 switch (last_digit - '0') { 413 case 1: 414 if ((first_letter == 's' && second_letter == 't') || 415 (first_letter == 't' && second_letter == 'h' && is_teen)) { 416 AcceptChars(2); 417 continue; 418 } 419 break; 420 421 case 2: 422 if ((first_letter == 'n' && second_letter == 'd') || 423 (first_letter == 't' && second_letter == 'h' && is_teen)) { 424 AcceptChars(2); 425 continue; 426 } 427 break; 428 429 case 3: 430 if ((first_letter == 'r' && second_letter == 'd') || 431 (first_letter == 't' && second_letter == 'h' && is_teen)) { 432 AcceptChars(2); 433 continue; 434 } 435 break; 436 437 case 0: 438 // Explicitly exclude '0th'. 439 if (num_digits_ == 1) 440 break; 441 442 case 4: 443 case 5: 444 case 6: 445 case 7: 446 case 8: 447 case 9: 448 if (first_letter == 't' && second_letter == 'h') { 449 AcceptChars(2); 450 continue; 451 } 452 break; 453 454 default: 455 NOTREACHED(); 456 } 457 } 458 } 459 460 RestartOnNextDelimiter(); 461 continue; 462 } 463 464 if (*it_ == '-' && num_digits_ > 0) { 465 AcceptChars(1); 466 ++num_digits_; 467 continue; 468 } 469 470 RestartOnNextDelimiter(); 471 SkipChars(1); 472 } 473 474 if (iterations >= max_iterations) 475 return false; 476 477 return CheckFinished(word); 478} 479 480bool AddressDetector::FindStateStartingInWord(WordList* words, 481 size_t state_first_word, size_t* state_last_word, 482 String16Tokenizer* tokenizer, size_t* state_index) { 483 484 // Bitmasks containing the allowed suffixes for 2-letter state codes. 485 static const int state_two_letter_suffix[23] = { 486 0x02060c00, // A followed by: [KLRSZ]. 487 0x00000000, // B. 488 0x00084001, // C followed by: [AOT]. 489 0x00000014, // D followed by: [CE]. 490 0x00000000, // E. 491 0x00001800, // F followed by: [LM]. 492 0x00100001, // G followed by: [AU]. 493 0x00000100, // H followed by: [I]. 494 0x00002809, // I followed by: [ADLN]. 495 0x00000000, // J. 496 0x01040000, // K followed by: [SY]. 497 0x00000001, // L followed by: [A]. 498 0x000ce199, // M followed by: [ADEHINOPST]. 499 0x0120129c, // N followed by: [CDEHJMVY]. 500 0x00020480, // O followed by: [HKR]. 501 0x00420001, // P followed by: [ARW]. 502 0x00000000, // Q. 503 0x00000100, // R followed by: [I]. 504 0x0000000c, // S followed by: [CD]. 505 0x00802000, // T followed by: [NX]. 506 0x00080000, // U followed by: [T]. 507 0x00080101, // V followed by: [AIT]. 508 0x01200101 // W followed by: [AIVY]. 509 }; 510 511 // Accumulative number of states for the 2-letter code indexed by the first. 512 static const int state_two_letter_accumulative[24] = { 513 0, 5, 5, 8, 10, 10, 12, 14, 514 15, 19, 19, 21, 22, 32, 40, 43, 515 46, 46, 47, 49, 51, 52, 55, 59 516 }; 517 518 // State names sorted alphabetically with their lengths. 519 // There can be more than one possible name for a same state if desired. 520 static const struct StateNameInfo { 521 const char* string; 522 char first_word_length; 523 char length; 524 char state_index; // Relative to two-character code alphabetical order. 525 } state_names[59] = { 526 { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 }, 527 { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 }, 528 { "arkansas", 8, 8, 2 }, 529 { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 }, 530 { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 }, 531 { "district of columbia", 8, 20, 8 }, 532 { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 }, 533 { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 }, 534 { "hawaii", 6, 6, 14 }, 535 { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 }, 536 { "iowa", 4, 4, 15 }, 537 { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 }, 538 { "louisiana", 9, 9, 21 }, 539 { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 }, 540 { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 }, 541 { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 }, 542 { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 }, 543 { "montana", 7, 7, 31 }, 544 { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 }, 545 { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 }, 546 { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 }, 547 { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 }, 548 { "northern mariana islands", 8, 24, 29 }, 549 { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 }, 550 { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 }, 551 { "puerto rico", 6, 11, 44 }, 552 { "rhode island", 5, 5, 46 }, 553 { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 }, 554 { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 }, 555 { "utah", 4, 4, 51 }, 556 { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 }, 557 { "virginia", 8, 8, 52 }, 558 { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 }, 559 { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 } 560 }; 561 562 // Accumulative number of states for sorted names indexed by the first letter. 563 // Required a different one since there are codes that don't share their 564 // first letter with the name of their state (MP = Northern Mariana Islands). 565 static const int state_names_accumulative[24] = { 566 0, 5, 5, 8, 10, 10, 12, 14, 567 15, 19, 19, 21, 22, 31, 40, 43, 568 46, 46, 47, 49, 51, 52, 55, 59 569 }; 570 571 DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1], 572 static_cast<int>(ARRAYSIZE_UNSAFE(state_names))); 573 574 const Word& first_word = words->at(state_first_word); 575 int length = first_word.end - first_word.begin; 576 if (length < 2 || !IsAsciiAlpha(*first_word.begin)) 577 return false; 578 579 // No state names start with x, y, z. 580 char16 first_letter = base::ToLowerASCII(*first_word.begin); 581 if (first_letter > 'w') 582 return false; 583 584 DCHECK(first_letter >= 'a'); 585 int first_index = first_letter - 'a'; 586 587 // Look for two-letter state names. 588 if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) { 589 char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1)); 590 DCHECK(second_letter >= 'a'); 591 592 int second_index = second_letter - 'a'; 593 if (!(state_two_letter_suffix[first_index] & (1 << second_index))) 594 return false; 595 596 std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] & 597 ((1 << second_index) - 1); 598 *state_last_word = state_first_word; 599 *state_index = state_two_letter_accumulative[first_index] + 600 previous_suffixes.count(); 601 return true; 602 } 603 604 // Look for full state names by their first letter. Discard by length. 605 for (int state = state_names_accumulative[first_index]; 606 state < state_names_accumulative[first_index + 1]; ++state) { 607 if (state_names[state].first_word_length != length) 608 continue; 609 610 bool state_match = false; 611 size_t state_word = state_first_word; 612 for (int pos = 0; true; ) { 613 if (!WordLowerCaseEqualsASCII(words->at(state_word).begin, 614 words->at(state_word).end, &state_names[state].string[pos])) 615 break; 616 617 pos += words->at(state_word).end - words->at(state_word).begin + 1; 618 if (pos >= state_names[state].length) { 619 state_match = true; 620 break; 621 } 622 623 // Ran out of words, extract more from the tokenizer. 624 if (++state_word == words->size()) { 625 do { 626 if (!tokenizer->GetNext()) 627 break; 628 } while (tokenizer->token_is_delim()); 629 words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end())); 630 } 631 } 632 633 if (state_match) { 634 *state_last_word = state_word; 635 *state_index = state_names[state].state_index; 636 return true; 637 } 638 } 639 640 return false; 641} 642 643bool AddressDetector::IsZipValid(const Word& word, size_t state_index) { 644 size_t length = word.end - word.begin; 645 if (length != kZipDigits && length != kZipPlus4Digits + 1) 646 return false; 647 648 for (string16::const_iterator it = word.begin; it != word.end; ++it) { 649 size_t pos = it - word.begin; 650 if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits)) 651 continue; 652 return false; 653 } 654 return IsZipValidForState(word, state_index); 655} 656 657bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index) 658{ 659 enum USState { 660 AP = -4, // AP (military base in the Pacific) 661 AA = -3, // AA (military base inside the US) 662 AE = -2, // AE (military base outside the US) 663 XX = -1, // (not in use) 664 AK = 0, // AK Alaska 665 AL = 1, // AL Alabama 666 AR = 2, // AR Arkansas 667 AS = 3, // AS American Samoa 668 AZ = 4, // AZ Arizona 669 CA = 5, // CA California 670 CO = 6, // CO Colorado 671 CT = 7, // CT Connecticut 672 DC = 8, // DC District of Columbia 673 DE = 9, // DE Delaware 674 FL = 10, // FL Florida 675 FM = 11, // FM Federated States of Micronesia 676 GA = 12, // GA Georgia 677 GU = 13, // GU Guam 678 HI = 14, // HI Hawaii 679 IA = 15, // IA Iowa 680 ID = 16, // ID Idaho 681 IL = 17, // IL Illinois 682 IN = 18, // IN Indiana 683 KS = 19, // KS Kansas 684 KY = 20, // KY Kentucky 685 LA = 21, // LA Louisiana 686 MA = 22, // MA Massachusetts 687 MD = 23, // MD Maryland 688 ME = 24, // ME Maine 689 MH = 25, // MH Marshall Islands 690 MI = 26, // MI Michigan 691 MN = 27, // MN Minnesota 692 MO = 28, // MO Missouri 693 MP = 29, // MP Northern Mariana Islands 694 MS = 30, // MS Mississippi 695 MT = 31, // MT Montana 696 NC = 32, // NC North Carolina 697 ND = 33, // ND North Dakota 698 NE = 34, // NE Nebraska 699 NH = 35, // NH New Hampshire 700 NJ = 36, // NJ New Jersey 701 NM = 37, // NM New Mexico 702 NV = 38, // NV Nevada 703 NY = 39, // NY New York 704 OH = 40, // OH Ohio 705 OK = 41, // OK Oklahoma 706 OR = 42, // OR Oregon 707 PA = 43, // PA Pennsylvania 708 PR = 44, // PR Puerto Rico 709 PW = 45, // PW Palau 710 RI = 46, // RI Rhode Island 711 SC = 47, // SC South Carolina 712 SD = 48, // SD South Dakota 713 TN = 49, // TN Tennessee 714 TX = 50, // TX Texas 715 UT = 51, // UT Utah 716 VA = 52, // VA Virginia 717 VI = 53, // VI Virgin Islands 718 VT = 54, // VT Vermont 719 WA = 55, // WA Washington 720 WI = 56, // WI Wisconsin 721 WV = 57, // WV West Virginia 722 WY = 58, // WY Wyoming 723 }; 724 725 static const USState stateForZipPrefix[] = { 726 // 0 1 2 3 4 5 6 7 8 9 727 XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009 728 MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019 729 MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029 730 NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039 731 ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049 732 VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059 733 CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069 734 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079 735 NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089 736 AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099 737 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109 738 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119 739 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129 740 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139 741 NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149 742 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159 743 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169 744 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179 745 PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189 746 PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199 747 DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209 748 MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219 749 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229 750 VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239 751 VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249 752 WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259 753 WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269 754 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279 755 NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289 756 SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299 757 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309 758 GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319 759 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329 760 FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339 761 AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349 762 AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359 763 AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369 764 TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379 765 TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389 766 MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399 767 KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409 768 KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419 769 KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429 770 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439 771 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449 772 OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459 773 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469 774 IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479 775 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489 776 MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499 777 IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509 778 IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519 779 IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529 780 WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539 781 WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549 782 MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559 783 MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569 784 SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579 785 ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589 786 MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599 787 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609 788 IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619 789 IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629 790 MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639 791 MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649 792 MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659 793 KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669 794 KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679 795 NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689 796 NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699 797 LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709 798 LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719 799 AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729 800 OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739 801 OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749 802 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759 803 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769 804 TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779 805 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789 806 TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799 807 CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809 808 CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819 809 WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829 810 WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839 811 UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849 812 AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859 813 AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869 814 NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879 815 NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889 816 NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899 817 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909 818 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919 819 CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929 820 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939 821 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949 822 CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959 823 CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969 824 OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979 825 WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989 826 WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999 827 }; 828 829 if (!word.begin || !word.end || (word.end - word.begin) < 3) 830 return false; 831 const char16* zipPtr = word.begin; 832 if (zipPtr[0] < '0' || zipPtr[0] > '9' || 833 zipPtr[1] < '0' || zipPtr[1] > '9' || 834 zipPtr[2] < '0' || zipPtr[2] > '9') 835 return false; 836 837 int zip = zipPtr[0] - '0'; 838 zip *= 10; 839 zip += zipPtr[1] - '0'; 840 zip *= 10; 841 zip += zipPtr[2] - '0'; 842 return stateForZipPrefix[zip] == (int) state_index; 843} 844 845static const char* s_rawStreetSuffixes[] = { 846 "allee", "alley", "ally", "aly", 847 "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu", 848 "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend", 849 "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd", 850 "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch", 851 "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks", 852 "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu", 853 "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent", 854 "center", "centers", "centr", "centre", "cir", "circ", "circl", 855 "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff", 856 "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common", 857 "cor", "corner", "corners", "cors", "course", "court", "courts", "cove", 858 "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres", 859 "crescent", "cresent", "crest", "crk", "crossing", "crossroad", 860 "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt", 861 "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn", 862 "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive", 863 "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests", 864 "exp", "expr", "express", "expressway", "expw", "expy", "ext", 865 "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls", 866 "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls", 867 "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge", 868 "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy", 869 "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy", 870 "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy", 871 "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden", 872 "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove", 873 "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors", 874 "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts", 875 "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw", 876 "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts", 877 "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands", 878 "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn", 879 "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key", 880 "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la", 881 "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks", 882 "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks", 883 "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop", 884 "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows", 885 "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls", 886 "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount", 887 "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn", 888 "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd", 889 "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways", 890 "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine", 891 "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain", 892 "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne", 893 "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie", 894 "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial", 895 "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd", 896 "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river", 897 "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds", 898 "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals", 899 "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway", 900 "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng", 901 "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ", 902 "square", "squares", "st", "sta", "station", "statn", "stn", "str", 903 "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream", 904 "street", "streets", "streme", "strm", "strt", "strvn", "strvnue", 905 "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace", 906 "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks", 907 "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks", 908 "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls", 909 "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass", 910 "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct", 911 "via", "viadct", "viaduct", "view", "views", "vill", "villag", 912 "village", "villages", "ville", "villg", "villiage", "vis", "vist", 913 "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta", 914 "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells", 915 "wl", "wls", "wy", "xing", "xrd", 916 0, 917}; 918 919bool AddressDetector::IsValidLocationName(const Word& word) { 920 using namespace WTF; 921 static HashSet<String> streetNames; 922 if (!streetNames.size()) { 923 const char** suffixes = s_rawStreetSuffixes; 924 while (const char* suffix = *suffixes) { 925 int index = suffix[0] - 'a'; 926 streetNames.add(suffix); 927 suffixes++; 928 } 929 } 930 char16 first_letter = base::ToLowerASCII(*word.begin); 931 if (first_letter > 'z' || first_letter < 'a') 932 return false; 933 int index = first_letter - 'a'; 934 int length = std::distance(word.begin, word.end); 935 if (*word.end == '.') 936 length--; 937 String value(word.begin, length); 938 return streetNames.contains(value.lower()); 939} 940