address_detector.cpp revision 77974d2fe8f7fb9c421c8d4240e02e9d76ec2c27
1/*
2 * Copyright (C) 2012 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *    * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *    * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *    * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33// Magic pretend-to-be-a-chromium-build flags
34#undef WEBKIT_IMPLEMENTATION
35#undef LOG
36
37#include "content/address_detector.h"
38
39#include <bitset>
40
41#include "base/utf_string_conversions.h"
42#include "net/base/escape.h"
43#include "WebString.h"
44
45#include <wtf/HashSet.h>
46#include <wtf/Noncopyable.h>
47#include <wtf/text/StringHash.h>
48#include <wtf/text/WTFString.h>
49
50namespace {
51
52// Prefix used for geographical address intent URIs.
53static const char kAddressSchemaPrefix[] = "geo:0,0?q=";
54
55// Maximum text length to be searched for address detection.
56static const size_t kMaxAddressLength = 500;
57
58// Minimum number of words in an address after the house number
59// before a state is expected to be found.
60// A value too high can miss short addresses.
61const size_t kMinAddressWords = 3;
62
63// Maximum number of words allowed in an address between the house number
64// and the state, both not included.
65const size_t kMaxAddressWords = 12;
66
67// Maximum number of lines allowed in an address between the house number
68// and the state, both not included.
69const size_t kMaxAddressLines = 5;
70
71// Maximum length allowed for any address word between the house number
72// and the state, both not included.
73const size_t kMaxAddressNameWordLength = 25;
74
75// Maximum number of words after the house number in which the location name
76// should be found.
77const size_t kMaxLocationNameDistance = 4;
78
79// Number of digits for a valid zip code.
80const size_t kZipDigits = 5;
81
82// Number of digits for a valid zip code in the Zip Plus 4 format.
83const size_t kZipPlus4Digits = 9;
84
85// Maximum number of digits of a house number, including possible hyphens.
86const size_t kMaxHouseDigits = 5;
87
88// Additional characters used as new line delimiters.
89const char16 kNewlineDelimiters[] = {
90  ',',
91  '*',
92  0x2022,  // Unicode bullet
93  0,
94};
95
96char16 SafePreviousChar(const string16::const_iterator& it,
97    const string16::const_iterator& begin) {
98  if (it == begin)
99    return ' ';
100  return *(it - 1);
101}
102
103char16 SafeNextChar(const string16::const_iterator& it,
104    const string16::const_iterator& end) {
105  if (it == end)
106    return ' ';
107  return *(it + 1);
108}
109
110bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
111    string16::const_iterator word_end, const char* ascii_to_match) {
112  for (string16::const_iterator it = word_begin; it != word_end;
113      ++it, ++ascii_to_match) {
114    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
115      return false;
116  }
117  return *ascii_to_match == 0 || *ascii_to_match == ' ';
118}
119
120bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
121    string16::const_iterator word_end, const char* ascii_to_match,
122    bool allow_plural) {
123  for (string16::const_iterator it = word_begin; it != word_end;
124      ++it, ++ascii_to_match) {
125    if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
126      return true;
127
128    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
129      return false;
130  }
131  return *ascii_to_match == 0;
132}
133
134}  // anonymous namespace
135
136
137AddressDetector::AddressDetector() {
138}
139
140AddressDetector::~AddressDetector() {
141}
142
143std::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
144  // Get the address and replace unicode bullets with commas.
145  string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
146  std::replace(address_16.begin(), address_16.end(),
147      static_cast<char16>(0x2022), static_cast<char16>(','));
148  return UTF16ToUTF8(address_16);
149}
150
151GURL AddressDetector::GetIntentURL(const std::string& content_text) {
152  return GURL(kAddressSchemaPrefix +
153      EscapeQueryParamValue(content_text, true));
154}
155
156size_t AddressDetector::GetMaximumContentLength() {
157  return kMaxAddressLength;
158}
159
160bool AddressDetector::FindContent(const string16::const_iterator& begin,
161    const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
162  HouseNumberParser house_number_parser;
163
164  // Keep going through the input string until a potential house number is
165  // detected. Start tokenizing the following words to find a valid
166  // street name within a word range. Then, find a state name followed
167  // by a valid zip code for that state. Also keep a look for any other
168  // possible house numbers to continue from in case of no match and for
169  // state names not followed by a zip code (e.g. New York, NY 10000).
170  const string16 newline_delimiters = kNewlineDelimiters;
171  const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
172  for (string16::const_iterator it = begin; it != end; ) {
173    Word house_number;
174    if (!house_number_parser.Parse(it, end, &house_number))
175      return false;
176
177    String16Tokenizer tokenizer(house_number.end, end, delimiters);
178    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
179
180    std::vector<Word> words;
181    words.push_back(house_number);
182
183    bool found_location_name = false;
184    bool continue_on_house_number = true;
185    size_t next_house_number_word = 0;
186    size_t num_lines = 1;
187
188    // Don't include the house number in the word count.
189    size_t next_word = 1;
190    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
191
192      // Extract a new word from the tokenizer.
193      if (next_word == words.size()) {
194        do {
195          if (!tokenizer.GetNext())
196            return false;
197
198          // Check the number of address lines.
199          if (tokenizer.token_is_delim() && newline_delimiters.find(
200              *tokenizer.token_begin()) != string16::npos) {
201            ++num_lines;
202          }
203        } while (tokenizer.token_is_delim());
204
205        if (num_lines > kMaxAddressLines)
206          break;
207
208        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
209      }
210
211      // Check the word length. If too long, don't try to continue from
212      // the next house number as no address can hold this word.
213      const Word& current_word = words[next_word];
214      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
215      size_t current_word_length = std::distance(
216          current_word.begin, current_word.end);
217      if (current_word_length > kMaxAddressNameWordLength) {
218        continue_on_house_number = false;
219        break;
220      }
221
222      // Check if the new word is a valid house number.
223      // This is used to properly resume parsing in case the maximum number
224      // of words is exceeded.
225      if (next_house_number_word == 0 &&
226          house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
227        next_house_number_word = next_word;
228        continue;
229      }
230
231      // Look for location names in the words after the house number.
232      // A range limitation is introduced to avoid matching
233      // anything that starts with a number before a legitimate address.
234      if (next_word <= kMaxLocationNameDistance &&
235          IsValidLocationName(current_word)) {
236        found_location_name = true;
237        continue;
238      }
239
240      // Don't count the house number.
241      if (next_word > kMinAddressWords) {
242        // Looking for the state is likely to add new words to the list while
243        // checking for multi-word state names.
244        size_t state_first_word = next_word;
245        size_t state_last_word, state_index;
246        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
247            &tokenizer, &state_index)) {
248
249          // A location name should have been found at this point.
250          if (!found_location_name)
251            break;
252
253          // Explicitly exclude "et al", as "al" is a valid state code.
254          if (current_word_length == 2 && words.size() > 2) {
255            const Word& previous_word = words[state_first_word - 1];
256            if (previous_word.end - previous_word.begin == 2 &&
257                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
258                    "et") &&
259                LowerCaseEqualsASCII(current_word.begin, current_word.end,
260                    "al"))
261              break;
262          }
263
264          // Extract one more word from the tokenizer if not already available.
265          size_t zip_word = state_last_word + 1;
266          if (zip_word == words.size()) {
267            do {
268              if (!tokenizer.GetNext()) {
269                // Zip is optional
270                *start_pos = words[0].begin - begin;
271                *end_pos = words[state_last_word].end - begin;
272                return true;
273              }
274            } while (tokenizer.token_is_delim());
275            words.push_back(Word(tokenizer.token_begin(),
276                tokenizer.token_end()));
277          }
278
279          // Check the parsing validity and state range of the zip code.
280          next_word = state_last_word;
281          if (!IsZipValid(words[zip_word], state_index))
282            continue;
283
284          *start_pos = words[0].begin - begin;
285          *end_pos = words[zip_word].end - begin;
286          return true;
287        }
288      }
289    }
290
291    // Avoid skipping too many words because of a non-address number
292    // at the beginning of the contents to parse.
293    if (continue_on_house_number && next_house_number_word > 0) {
294      it = words[next_house_number_word].begin;
295    } else {
296      DCHECK(!words.empty());
297      next_word = std::min(next_word, words.size() - 1);
298      it = words[next_word].end;
299    }
300  }
301
302  return false;
303}
304
305bool AddressDetector::HouseNumberParser::IsPreDelimiter(
306    char16 character) {
307  return character == ':' || IsPostDelimiter(character);
308}
309
310bool AddressDetector::HouseNumberParser::IsPostDelimiter(
311    char16 character) {
312  return IsWhitespace(character) || strchr(",\"'", character);
313}
314
315void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
316  ResetState();
317  for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
318}
319
320void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
321  size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
322      num_chars);
323  it_ += offset;
324  result_chars_ += offset;
325}
326
327void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
328  it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
329}
330
331void AddressDetector::HouseNumberParser::ResetState() {
332  num_digits_ = 0;
333  result_chars_ = 0;
334}
335
336bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
337  // There should always be a number after a hyphen.
338  if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
339    return false;
340
341  if (word) {
342    word->begin = it_ - result_chars_;
343    word->end = it_;
344  }
345  return true;
346}
347
348bool AddressDetector::HouseNumberParser::Parse(
349    const string16::const_iterator& begin,
350    const string16::const_iterator& end, Word* word) {
351  it_ = begin_ = begin;
352  end_ = end;
353  ResetState();
354
355  // Iterations only used as a fail-safe against any buggy infinite loops.
356  size_t iterations = 0;
357  size_t max_iterations = end - begin + 1;
358  for (; it_ != end_ && iterations < max_iterations; ++iterations) {
359
360    // Word finished case.
361    if (IsPostDelimiter(*it_)) {
362      if (CheckFinished(word))
363        return true;
364      else if (result_chars_)
365        ResetState();
366
367      SkipChars(1);
368      continue;
369    }
370
371    // More digits. There should be no more after a letter was found.
372    if (IsAsciiDigit(*it_)) {
373      if (num_digits_ >= kMaxHouseDigits) {
374        RestartOnNextDelimiter();
375      } else {
376        AcceptChars(1);
377        ++num_digits_;
378      }
379      continue;
380    }
381
382    if (IsAsciiAlpha(*it_)) {
383      // Handle special case 'one'.
384      if (result_chars_ == 0) {
385        if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
386          AcceptChars(3);
387        else
388          RestartOnNextDelimiter();
389        continue;
390      }
391
392      // There should be more than 1 character because of result_chars.
393      DCHECK_GT(result_chars_, 0U);
394      DCHECK_NE(it_, begin_);
395      char16 previous = SafePreviousChar(it_, begin_);
396      if (IsAsciiDigit(previous)) {
397        // Check cases like '12A'.
398        char16 next = SafeNextChar(it_, end_);
399        if (IsPostDelimiter(next)) {
400          AcceptChars(1);
401          continue;
402        }
403
404        // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
405        if (IsAsciiAlpha(next)) {
406          char16 last_digit = previous;
407          char16 first_letter = base::ToLowerASCII(*it_);
408          char16 second_letter = base::ToLowerASCII(next);
409          bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
410              num_digits_ == 2;
411
412          switch (last_digit - '0') {
413          case 1:
414            if ((first_letter == 's' && second_letter == 't') ||
415                (first_letter == 't' && second_letter == 'h' && is_teen)) {
416              AcceptChars(2);
417              continue;
418            }
419            break;
420
421          case 2:
422            if ((first_letter == 'n' && second_letter == 'd') ||
423                (first_letter == 't' && second_letter == 'h' && is_teen)) {
424              AcceptChars(2);
425              continue;
426            }
427            break;
428
429          case 3:
430            if ((first_letter == 'r' && second_letter == 'd') ||
431                (first_letter == 't' && second_letter == 'h' && is_teen)) {
432              AcceptChars(2);
433              continue;
434            }
435            break;
436
437          case 0:
438            // Explicitly exclude '0th'.
439            if (num_digits_ == 1)
440              break;
441
442          case 4:
443          case 5:
444          case 6:
445          case 7:
446          case 8:
447          case 9:
448            if (first_letter == 't' && second_letter == 'h') {
449              AcceptChars(2);
450              continue;
451            }
452            break;
453
454          default:
455            NOTREACHED();
456          }
457        }
458      }
459
460      RestartOnNextDelimiter();
461      continue;
462    }
463
464    if (*it_ == '-' && num_digits_ > 0) {
465      AcceptChars(1);
466      ++num_digits_;
467      continue;
468    }
469
470    RestartOnNextDelimiter();
471    SkipChars(1);
472  }
473
474  if (iterations >= max_iterations)
475    return false;
476
477  return CheckFinished(word);
478}
479
480bool AddressDetector::FindStateStartingInWord(WordList* words,
481    size_t state_first_word, size_t* state_last_word,
482    String16Tokenizer* tokenizer, size_t* state_index) {
483
484  // Bitmasks containing the allowed suffixes for 2-letter state codes.
485  static const int state_two_letter_suffix[23] = {
486    0x02060c00,  // A followed by: [KLRSZ].
487    0x00000000,  // B.
488    0x00084001,  // C followed by: [AOT].
489    0x00000014,  // D followed by: [CE].
490    0x00000000,  // E.
491    0x00001800,  // F followed by: [LM].
492    0x00100001,  // G followed by: [AU].
493    0x00000100,  // H followed by: [I].
494    0x00002809,  // I followed by: [ADLN].
495    0x00000000,  // J.
496    0x01040000,  // K followed by: [SY].
497    0x00000001,  // L followed by: [A].
498    0x000ce199,  // M followed by: [ADEHINOPST].
499    0x0120129c,  // N followed by: [CDEHJMVY].
500    0x00020480,  // O followed by: [HKR].
501    0x00420001,  // P followed by: [ARW].
502    0x00000000,  // Q.
503    0x00000100,  // R followed by: [I].
504    0x0000000c,  // S followed by: [CD].
505    0x00802000,  // T followed by: [NX].
506    0x00080000,  // U followed by: [T].
507    0x00080101,  // V followed by: [AIT].
508    0x01200101   // W followed by: [AIVY].
509  };
510
511  // Accumulative number of states for the 2-letter code indexed by the first.
512  static const int state_two_letter_accumulative[24] = {
513     0,  5,  5,  8, 10, 10, 12, 14,
514    15, 19, 19, 21, 22, 32, 40, 43,
515    46, 46, 47, 49, 51, 52, 55, 59
516  };
517
518  // State names sorted alphabetically with their lengths.
519  // There can be more than one possible name for a same state if desired.
520  static const struct StateNameInfo {
521    const char* string;
522    char first_word_length;
523    char length;
524    char state_index; // Relative to two-character code alphabetical order.
525  } state_names[59] = {
526    { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
527    { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
528    { "arkansas", 8, 8, 2 },
529    { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
530    { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
531    { "district of columbia", 8, 20, 8 },
532    { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
533    { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
534    { "hawaii", 6, 6, 14 },
535    { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
536    { "iowa", 4, 4, 15 },
537    { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
538    { "louisiana", 9, 9, 21 },
539    { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
540    { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
541    { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
542    { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
543    { "montana", 7, 7, 31 },
544    { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
545    { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
546    { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
547    { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
548    { "northern mariana islands", 8, 24, 29 },
549    { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
550    { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
551    { "puerto rico", 6, 11, 44 },
552    { "rhode island", 5, 5, 46 },
553    { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
554    { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
555    { "utah", 4, 4, 51 },
556    { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
557    { "virginia", 8, 8, 52 },
558    { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
559    { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
560  };
561
562  // Accumulative number of states for sorted names indexed by the first letter.
563  // Required a different one since there are codes that don't share their
564  // first letter with the name of their state (MP = Northern Mariana Islands).
565  static const int state_names_accumulative[24] = {
566     0,  5,  5,  8, 10, 10, 12, 14,
567    15, 19, 19, 21, 22, 31, 40, 43,
568    46, 46, 47, 49, 51, 52, 55, 59
569  };
570
571  DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
572      static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
573
574  const Word& first_word = words->at(state_first_word);
575  int length = first_word.end - first_word.begin;
576  if (length < 2 || !IsAsciiAlpha(*first_word.begin))
577    return false;
578
579  // No state names start with x, y, z.
580  char16 first_letter = base::ToLowerASCII(*first_word.begin);
581  if (first_letter > 'w')
582    return false;
583
584  DCHECK(first_letter >= 'a');
585  int first_index = first_letter - 'a';
586
587  // Look for two-letter state names.
588  if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
589    char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
590    DCHECK(second_letter >= 'a');
591
592    int second_index = second_letter - 'a';
593    if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
594      return false;
595
596    std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
597        ((1 << second_index) - 1);
598    *state_last_word = state_first_word;
599    *state_index = state_two_letter_accumulative[first_index] +
600        previous_suffixes.count();
601    return true;
602  }
603
604  // Look for full state names by their first letter. Discard by length.
605  for (int state = state_names_accumulative[first_index];
606      state < state_names_accumulative[first_index + 1]; ++state) {
607    if (state_names[state].first_word_length != length)
608      continue;
609
610    bool state_match = false;
611    size_t state_word = state_first_word;
612    for (int pos = 0; true; ) {
613      if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
614          words->at(state_word).end, &state_names[state].string[pos]))
615        break;
616
617      pos += words->at(state_word).end - words->at(state_word).begin + 1;
618      if (pos >= state_names[state].length) {
619        state_match = true;
620        break;
621      }
622
623      // Ran out of words, extract more from the tokenizer.
624      if (++state_word == words->size()) {
625        do {
626          if (!tokenizer->GetNext())
627            break;
628        } while (tokenizer->token_is_delim());
629        words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
630      }
631    }
632
633    if (state_match) {
634      *state_last_word = state_word;
635      *state_index = state_names[state].state_index;
636      return true;
637    }
638  }
639
640  return false;
641}
642
643bool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
644  size_t length = word.end - word.begin;
645  if (length != kZipDigits && length != kZipPlus4Digits + 1)
646    return false;
647
648  for (string16::const_iterator it = word.begin; it != word.end; ++it) {
649    size_t pos = it - word.begin;
650    if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
651      continue;
652    return false;
653  }
654  return IsZipValidForState(word, state_index);
655}
656
657bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
658{
659    enum USState {
660        AP = -4, // AP (military base in the Pacific)
661        AA = -3, // AA (military base inside the US)
662        AE = -2, // AE (military base outside the US)
663        XX = -1, // (not in use)
664        AK =  0, // AK Alaska
665        AL =  1, // AL Alabama
666        AR =  2, // AR Arkansas
667        AS =  3, // AS American Samoa
668        AZ =  4, // AZ Arizona
669        CA =  5, // CA California
670        CO =  6, // CO Colorado
671        CT =  7, // CT Connecticut
672        DC =  8, // DC District of Columbia
673        DE =  9, // DE Delaware
674        FL = 10, // FL Florida
675        FM = 11, // FM Federated States of Micronesia
676        GA = 12, // GA Georgia
677        GU = 13, // GU Guam
678        HI = 14, // HI Hawaii
679        IA = 15, // IA Iowa
680        ID = 16, // ID Idaho
681        IL = 17, // IL Illinois
682        IN = 18, // IN Indiana
683        KS = 19, // KS Kansas
684        KY = 20, // KY Kentucky
685        LA = 21, // LA Louisiana
686        MA = 22, // MA Massachusetts
687        MD = 23, // MD Maryland
688        ME = 24, // ME Maine
689        MH = 25, // MH Marshall Islands
690        MI = 26, // MI Michigan
691        MN = 27, // MN Minnesota
692        MO = 28, // MO Missouri
693        MP = 29, // MP Northern Mariana Islands
694        MS = 30, // MS Mississippi
695        MT = 31, // MT Montana
696        NC = 32, // NC North Carolina
697        ND = 33, // ND North Dakota
698        NE = 34, // NE Nebraska
699        NH = 35, // NH New Hampshire
700        NJ = 36, // NJ New Jersey
701        NM = 37, // NM New Mexico
702        NV = 38, // NV Nevada
703        NY = 39, // NY New York
704        OH = 40, // OH Ohio
705        OK = 41, // OK Oklahoma
706        OR = 42, // OR Oregon
707        PA = 43, // PA Pennsylvania
708        PR = 44, // PR Puerto Rico
709        PW = 45, // PW Palau
710        RI = 46, // RI Rhode Island
711        SC = 47, // SC South Carolina
712        SD = 48, // SD South Dakota
713        TN = 49, // TN Tennessee
714        TX = 50, // TX Texas
715        UT = 51, // UT Utah
716        VA = 52, // VA Virginia
717        VI = 53, // VI Virgin Islands
718        VT = 54, // VT Vermont
719        WA = 55, // WA Washington
720        WI = 56, // WI Wisconsin
721        WV = 57, // WV West Virginia
722        WY = 58, // WY Wyoming
723    };
724
725    static const USState stateForZipPrefix[] = {
726    //   0   1   2   3   4   5   6   7   8   9
727        XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
728        MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
729        MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
730        NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
731        ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
732        VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
733        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
734        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
735        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
736        AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
737        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
738        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
739        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
740        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
741        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
742        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
743        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
744        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
745        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
746        PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
747        DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
748        MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
749        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
750        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
751        VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
752        WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
753        WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
754        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
755        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
756        SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
757        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
758        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
759        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
760        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
761        AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
762        AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
763        AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
764        TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
765        TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
766        MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
767        KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
768        KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
769        KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
770        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
771        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
772        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
773        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
774        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
775        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
776        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
777        IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
778        IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
779        IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
780        WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
781        WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
782        MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
783        MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
784        SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
785        ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
786        MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
787        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
788        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
789        IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
790        MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
791        MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
792        MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
793        KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
794        KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
795        NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
796        NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
797        LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
798        LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
799        AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
800        OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
801        OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
802        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
803        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
804        TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
805        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
806        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
807        CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
808        CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
809        WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
810        WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
811        UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
812        AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
813        AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
814        NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
815        NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
816        NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
817        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
818        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
819        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
820        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
821        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
822        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
823        CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
824        OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
825        WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
826        WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
827    };
828
829    if (!word.begin || !word.end || (word.end - word.begin) < 3)
830        return false;
831    const char16* zipPtr = word.begin;
832    if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
833        zipPtr[1] < '0' || zipPtr[1] > '9' ||
834        zipPtr[2] < '0' || zipPtr[2] > '9')
835        return false;
836
837    int zip = zipPtr[0] - '0';
838    zip *= 10;
839    zip += zipPtr[1] - '0';
840    zip *= 10;
841    zip += zipPtr[2] - '0';
842    return stateForZipPrefix[zip] == (int) state_index;
843}
844
845static const char* s_rawStreetSuffixes[] = {
846    "allee", "alley", "ally", "aly",
847    "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu",
848    "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend",
849    "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd",
850    "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch",
851    "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks",
852    "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu",
853    "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
854    "center", "centers", "centr", "centre", "cir", "circ", "circl",
855    "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff",
856    "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common",
857    "cor", "corner", "corners", "cors", "course", "court", "courts", "cove",
858    "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres",
859    "crescent", "cresent", "crest", "crk", "crossing", "crossroad",
860    "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt",
861    "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn",
862    "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive",
863    "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests",
864    "exp", "expr", "express", "expressway", "expw", "expy", "ext",
865    "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls",
866    "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls",
867    "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
868    "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy",
869    "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy",
870    "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy",
871    "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden",
872    "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove",
873    "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors",
874    "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts",
875    "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw",
876    "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts",
877    "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands",
878    "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn",
879    "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key",
880    "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la",
881    "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
882    "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks",
883    "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop",
884    "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows",
885    "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls",
886    "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount",
887    "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn",
888    "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd",
889    "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways",
890    "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine",
891    "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain",
892    "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne",
893    "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie",
894    "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial",
895    "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd",
896    "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river",
897    "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds",
898    "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals",
899    "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway",
900    "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng",
901    "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ",
902    "square", "squares", "st", "sta", "station", "statn", "stn", "str",
903    "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream",
904    "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
905    "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace",
906    "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks",
907    "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks",
908    "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls",
909    "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass",
910    "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct",
911    "via", "viadct", "viaduct", "view", "views", "vill", "villag",
912    "village", "villages", "ville", "villg", "villiage", "vis", "vist",
913    "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta",
914    "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells",
915    "wl", "wls", "wy", "xing", "xrd",
916    0,
917};
918
919bool AddressDetector::IsValidLocationName(const Word& word) {
920    using namespace WTF;
921    static HashSet<String> streetNames;
922    if (!streetNames.size()) {
923        const char** suffixes = s_rawStreetSuffixes;
924        while (const char* suffix = *suffixes) {
925            int index = suffix[0] - 'a';
926            streetNames.add(suffix);
927            suffixes++;
928        }
929    }
930    char16 first_letter = base::ToLowerASCII(*word.begin);
931    if (first_letter > 'z' || first_letter < 'a')
932        return false;
933    int index = first_letter - 'a';
934    int length = std::distance(word.begin, word.end);
935    if (*word.end == '.')
936        length--;
937    String value(word.begin, length);
938    return streetNames.contains(value.lower());
939}
940