address_detector.cpp revision bf0d5c6dc816bca8f00bde31ddda7ba41e740ccd
1/*
2 * Copyright (C) 2012 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *    * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *    * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *    * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33// Magic pretend-to-be-a-chromium-build flags
34#undef WEBKIT_IMPLEMENTATION
35#undef LOG
36
37#include "content/address_detector.h"
38
39#include <bitset>
40
41#include "base/utf_string_conversions.h"
42#include "net/base/escape.h"
43#include "WebString.h"
44
45#include <wtf/HashSet.h>
46#include <wtf/Noncopyable.h>
47#include <wtf/text/StringHash.h>
48#include <wtf/text/WTFString.h>
49
50namespace {
51
52// Prefix used for geographical address intent URIs.
53static const char kAddressSchemaPrefix[] = "geo:0,0?q=";
54
55// Maximum text length to be searched for address detection.
56static const size_t kMaxAddressLength = 500;
57
58// Minimum number of words in an address after the house number
59// before a state is expected to be found.
60// A value too high can miss short addresses.
61const size_t kMinAddressWords = 3;
62
63// Maximum number of words allowed in an address between the house number
64// and the state, both not included.
65const size_t kMaxAddressWords = 12;
66
67// Maximum number of lines allowed in an address between the house number
68// and the state, both not included.
69const size_t kMaxAddressLines = 5;
70
71// Maximum length allowed for any address word between the house number
72// and the state, both not included.
73const size_t kMaxAddressNameWordLength = 25;
74
75// Maximum number of words after the house number in which the location name
76// should be found.
77const size_t kMaxLocationNameDistance = 4;
78
79// Number of digits for a valid zip code.
80const size_t kZipDigits = 5;
81
82// Number of digits for a valid zip code in the Zip Plus 4 format.
83const size_t kZipPlus4Digits = 9;
84
85// Maximum number of digits of a house number, including possible hyphens.
86const size_t kMaxHouseDigits = 5;
87
88// Additional characters used as new line delimiters.
89const char16 kNewlineDelimiters[] = {
90  ',',
91  '*',
92  0x2022,  // Unicode bullet
93};
94
95char16 SafePreviousChar(const string16::const_iterator& it,
96    const string16::const_iterator& begin) {
97  if (it == begin)
98    return ' ';
99  return *(it - 1);
100}
101
102char16 SafeNextChar(const string16::const_iterator& it,
103    const string16::const_iterator& end) {
104  if (it == end)
105    return ' ';
106  return *(it + 1);
107}
108
109bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
110    string16::const_iterator word_end, const char* ascii_to_match) {
111  for (string16::const_iterator it = word_begin; it != word_end;
112      ++it, ++ascii_to_match) {
113    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
114      return false;
115  }
116  return *ascii_to_match == 0 || *ascii_to_match == ' ';
117}
118
119bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
120    string16::const_iterator word_end, const char* ascii_to_match,
121    bool allow_plural) {
122  for (string16::const_iterator it = word_begin; it != word_end;
123      ++it, ++ascii_to_match) {
124    if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
125      return true;
126
127    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
128      return false;
129  }
130  return *ascii_to_match == 0;
131}
132
133}  // anonymous namespace
134
135
136AddressDetector::AddressDetector() {
137}
138
139AddressDetector::~AddressDetector() {
140}
141
142std::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
143  // Get the address and replace unicode bullets with commas.
144  string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
145  std::replace(address_16.begin(), address_16.end(),
146      static_cast<char16>(0x2022), static_cast<char16>(','));
147  return UTF16ToUTF8(address_16);
148}
149
150GURL AddressDetector::GetIntentURL(const std::string& content_text) {
151  return GURL(kAddressSchemaPrefix +
152      EscapeQueryParamValue(content_text, true));
153}
154
155size_t AddressDetector::GetMaximumContentLength() {
156  return kMaxAddressLength;
157}
158
159bool AddressDetector::FindContent(const string16::const_iterator& begin,
160    const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
161  HouseNumberParser house_number_parser;
162
163  // Keep going through the input string until a potential house number is
164  // detected. Start tokenizing the following words to find a valid
165  // street name within a word range. Then, find a state name followed
166  // by a valid zip code for that state. Also keep a look for any other
167  // possible house numbers to continue from in case of no match and for
168  // state names not followed by a zip code (e.g. New York, NY 10000).
169  const string16 newline_delimiters = kNewlineDelimiters;
170  const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
171  for (string16::const_iterator it = begin; it != end; ) {
172    Word house_number;
173    if (!house_number_parser.Parse(it, end, &house_number))
174      return false;
175
176    String16Tokenizer tokenizer(house_number.end, end, delimiters);
177    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
178
179    std::vector<Word> words;
180    words.push_back(house_number);
181
182    bool found_location_name = false;
183    bool continue_on_house_number = true;
184    size_t next_house_number_word = 0;
185    size_t num_lines = 1;
186
187    // Don't include the house number in the word count.
188    size_t next_word = 1;
189    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
190
191      // Extract a new word from the tokenizer.
192      if (next_word == words.size()) {
193        do {
194          if (!tokenizer.GetNext())
195            return false;
196
197          // Check the number of address lines.
198          if (tokenizer.token_is_delim() && newline_delimiters.find(
199              *tokenizer.token_begin()) != string16::npos) {
200            ++num_lines;
201          }
202        } while (tokenizer.token_is_delim());
203
204        if (num_lines > kMaxAddressLines)
205          break;
206
207        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
208      }
209
210      // Check the word length. If too long, don't try to continue from
211      // the next house number as no address can hold this word.
212      const Word& current_word = words[next_word];
213      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
214      size_t current_word_length = std::distance(
215          current_word.begin, current_word.end);
216      if (current_word_length > kMaxAddressNameWordLength) {
217        continue_on_house_number = false;
218        break;
219      }
220
221      // Check if the new word is a valid house number.
222      // This is used to properly resume parsing in case the maximum number
223      // of words is exceeded.
224      if (next_house_number_word == 0 &&
225          house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
226        next_house_number_word = next_word;
227        continue;
228      }
229
230      // Look for location names in the words after the house number.
231      // A range limitation is introduced to avoid matching
232      // anything that starts with a number before a legitimate address.
233      if (next_word <= kMaxLocationNameDistance &&
234          IsValidLocationName(current_word)) {
235        found_location_name = true;
236        continue;
237      }
238
239      // Don't count the house number.
240      if (next_word > kMinAddressWords) {
241        // Looking for the state is likely to add new words to the list while
242        // checking for multi-word state names.
243        size_t state_first_word = next_word;
244        size_t state_last_word, state_index;
245        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
246            &tokenizer, &state_index)) {
247
248          // A location name should have been found at this point.
249          if (!found_location_name)
250            break;
251
252          // Explicitly exclude "et al", as "al" is a valid state code.
253          if (current_word_length == 2 && words.size() > 2) {
254            const Word& previous_word = words[state_first_word - 1];
255            if (previous_word.end - previous_word.begin == 2 &&
256                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
257                    "et") &&
258                LowerCaseEqualsASCII(current_word.begin, current_word.end,
259                    "al"))
260              break;
261          }
262
263          // Extract one more word from the tokenizer if not already available.
264          size_t zip_word = state_last_word + 1;
265          if (zip_word == words.size()) {
266            do {
267              if (!tokenizer.GetNext()) {
268                // Zip is optional
269                *start_pos = words[0].begin - begin;
270                *end_pos = words[state_last_word].end - begin;
271                return true;
272              }
273            } while (tokenizer.token_is_delim());
274            words.push_back(Word(tokenizer.token_begin(),
275                tokenizer.token_end()));
276          }
277
278          // Check the parsing validity and state range of the zip code.
279          next_word = state_last_word;
280          if (!IsZipValid(words[zip_word], state_index))
281            continue;
282
283          *start_pos = words[0].begin - begin;
284          *end_pos = words[zip_word].end - begin;
285          return true;
286        }
287      }
288    }
289
290    // Avoid skipping too many words because of a non-address number
291    // at the beginning of the contents to parse.
292    if (continue_on_house_number && next_house_number_word > 0) {
293      it = words[next_house_number_word].begin;
294    } else {
295      DCHECK(!words.empty());
296      next_word = std::min(next_word, words.size() - 1);
297      it = words[next_word].end;
298    }
299  }
300
301  return false;
302}
303
304bool AddressDetector::HouseNumberParser::IsPreDelimiter(
305    char16 character) {
306  return character == ':' || IsPostDelimiter(character);
307}
308
309bool AddressDetector::HouseNumberParser::IsPostDelimiter(
310    char16 character) {
311  return IsWhitespace(character) || strchr(",\"'", character);
312}
313
314void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
315  ResetState();
316  for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
317}
318
319void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
320  size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
321      num_chars);
322  it_ += offset;
323  result_chars_ += offset;
324}
325
326void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
327  it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
328}
329
330void AddressDetector::HouseNumberParser::ResetState() {
331  num_digits_ = 0;
332  result_chars_ = 0;
333}
334
335bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
336  // There should always be a number after a hyphen.
337  if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
338    return false;
339
340  if (word) {
341    word->begin = it_ - result_chars_;
342    word->end = it_;
343  }
344  return true;
345}
346
347bool AddressDetector::HouseNumberParser::Parse(
348    const string16::const_iterator& begin,
349    const string16::const_iterator& end, Word* word) {
350  it_ = begin_ = begin;
351  end_ = end;
352  ResetState();
353
354  // Iterations only used as a fail-safe against any buggy infinite loops.
355  size_t iterations = 0;
356  size_t max_iterations = end - begin + 1;
357  for (; it_ != end_ && iterations < max_iterations; ++iterations) {
358
359    // Word finished case.
360    if (IsPostDelimiter(*it_)) {
361      if (CheckFinished(word))
362        return true;
363      else if (result_chars_)
364        ResetState();
365
366      SkipChars(1);
367      continue;
368    }
369
370    // More digits. There should be no more after a letter was found.
371    if (IsAsciiDigit(*it_)) {
372      if (num_digits_ >= kMaxHouseDigits) {
373        RestartOnNextDelimiter();
374      } else {
375        AcceptChars(1);
376        ++num_digits_;
377      }
378      continue;
379    }
380
381    if (IsAsciiAlpha(*it_)) {
382      // Handle special case 'one'.
383      if (result_chars_ == 0) {
384        if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
385          AcceptChars(3);
386        else
387          RestartOnNextDelimiter();
388        continue;
389      }
390
391      // There should be more than 1 character because of result_chars.
392      DCHECK_GT(result_chars_, 0U);
393      DCHECK_NE(it_, begin_);
394      char16 previous = SafePreviousChar(it_, begin_);
395      if (IsAsciiDigit(previous)) {
396        // Check cases like '12A'.
397        char16 next = SafeNextChar(it_, end_);
398        if (IsPostDelimiter(next)) {
399          AcceptChars(1);
400          continue;
401        }
402
403        // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
404        if (IsAsciiAlpha(next)) {
405          char16 last_digit = previous;
406          char16 first_letter = base::ToLowerASCII(*it_);
407          char16 second_letter = base::ToLowerASCII(next);
408          bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
409              num_digits_ == 2;
410
411          switch (last_digit - '0') {
412          case 1:
413            if ((first_letter == 's' && second_letter == 't') ||
414                (first_letter == 't' && second_letter == 'h' && is_teen)) {
415              AcceptChars(2);
416              continue;
417            }
418            break;
419
420          case 2:
421            if ((first_letter == 'n' && second_letter == 'd') ||
422                (first_letter == 't' && second_letter == 'h' && is_teen)) {
423              AcceptChars(2);
424              continue;
425            }
426            break;
427
428          case 3:
429            if ((first_letter == 'r' && second_letter == 'd') ||
430                (first_letter == 't' && second_letter == 'h' && is_teen)) {
431              AcceptChars(2);
432              continue;
433            }
434            break;
435
436          case 0:
437            // Explicitly exclude '0th'.
438            if (num_digits_ == 1)
439              break;
440
441          case 4:
442          case 5:
443          case 6:
444          case 7:
445          case 8:
446          case 9:
447            if (first_letter == 't' && second_letter == 'h') {
448              AcceptChars(2);
449              continue;
450            }
451            break;
452
453          default:
454            NOTREACHED();
455          }
456        }
457      }
458
459      RestartOnNextDelimiter();
460      continue;
461    }
462
463    if (*it_ == '-' && num_digits_ > 0) {
464      AcceptChars(1);
465      ++num_digits_;
466      continue;
467    }
468
469    RestartOnNextDelimiter();
470    SkipChars(1);
471  }
472
473  if (iterations >= max_iterations)
474    return false;
475
476  return CheckFinished(word);
477}
478
479bool AddressDetector::FindStateStartingInWord(WordList* words,
480    size_t state_first_word, size_t* state_last_word,
481    String16Tokenizer* tokenizer, size_t* state_index) {
482
483  // Bitmasks containing the allowed suffixes for 2-letter state codes.
484  static const int state_two_letter_suffix[23] = {
485    0x02060c00,  // A followed by: [KLRSZ].
486    0x00000000,  // B.
487    0x00084001,  // C followed by: [AOT].
488    0x00000014,  // D followed by: [CE].
489    0x00000000,  // E.
490    0x00001800,  // F followed by: [LM].
491    0x00100001,  // G followed by: [AU].
492    0x00000100,  // H followed by: [I].
493    0x00002809,  // I followed by: [ADLN].
494    0x00000000,  // J.
495    0x01040000,  // K followed by: [SY].
496    0x00000001,  // L followed by: [A].
497    0x000ce199,  // M followed by: [ADEHINOPST].
498    0x0120129c,  // N followed by: [CDEHJMVY].
499    0x00020480,  // O followed by: [HKR].
500    0x00420001,  // P followed by: [ARW].
501    0x00000000,  // Q.
502    0x00000100,  // R followed by: [I].
503    0x0000000c,  // S followed by: [CD].
504    0x00802000,  // T followed by: [NX].
505    0x00080000,  // U followed by: [T].
506    0x00080101,  // V followed by: [AIT].
507    0x01200101   // W followed by: [AIVY].
508  };
509
510  // Accumulative number of states for the 2-letter code indexed by the first.
511  static const int state_two_letter_accumulative[24] = {
512     0,  5,  5,  8, 10, 10, 12, 14,
513    15, 19, 19, 21, 22, 32, 40, 43,
514    46, 46, 47, 49, 51, 52, 55, 59
515  };
516
517  // State names sorted alphabetically with their lengths.
518  // There can be more than one possible name for a same state if desired.
519  static const struct StateNameInfo {
520    const char* string;
521    char first_word_length;
522    char length;
523    char state_index; // Relative to two-character code alphabetical order.
524  } state_names[59] = {
525    { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
526    { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
527    { "arkansas", 8, 8, 2 },
528    { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
529    { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
530    { "district of columbia", 8, 20, 8 },
531    { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
532    { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
533    { "hawaii", 6, 6, 14 },
534    { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
535    { "iowa", 4, 4, 15 },
536    { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
537    { "louisiana", 9, 9, 21 },
538    { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
539    { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
540    { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
541    { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
542    { "montana", 7, 7, 31 },
543    { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
544    { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
545    { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
546    { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
547    { "northern mariana islands", 8, 24, 29 },
548    { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
549    { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
550    { "puerto rico", 6, 11, 44 },
551    { "rhode island", 5, 5, 46 },
552    { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
553    { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
554    { "utah", 4, 4, 51 },
555    { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
556    { "virginia", 8, 8, 52 },
557    { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
558    { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
559  };
560
561  // Accumulative number of states for sorted names indexed by the first letter.
562  // Required a different one since there are codes that don't share their
563  // first letter with the name of their state (MP = Northern Mariana Islands).
564  static const int state_names_accumulative[24] = {
565     0,  5,  5,  8, 10, 10, 12, 14,
566    15, 19, 19, 21, 22, 31, 40, 43,
567    46, 46, 47, 49, 51, 52, 55, 59
568  };
569
570  DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
571      static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
572
573  const Word& first_word = words->at(state_first_word);
574  int length = first_word.end - first_word.begin;
575  if (length < 2 || !IsAsciiAlpha(*first_word.begin))
576    return false;
577
578  // No state names start with x, y, z.
579  char16 first_letter = base::ToLowerASCII(*first_word.begin);
580  if (first_letter > 'w')
581    return false;
582
583  DCHECK(first_letter >= 'a');
584  int first_index = first_letter - 'a';
585
586  // Look for two-letter state names.
587  if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
588    char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
589    DCHECK(second_letter >= 'a');
590
591    int second_index = second_letter - 'a';
592    if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
593      return false;
594
595    std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
596        ((1 << second_index) - 1);
597    *state_last_word = state_first_word;
598    *state_index = state_two_letter_accumulative[first_index] +
599        previous_suffixes.count();
600    return true;
601  }
602
603  // Look for full state names by their first letter. Discard by length.
604  for (int state = state_names_accumulative[first_index];
605      state < state_names_accumulative[first_index + 1]; ++state) {
606    if (state_names[state].first_word_length != length)
607      continue;
608
609    bool state_match = false;
610    size_t state_word = state_first_word;
611    for (int pos = 0; true; ) {
612      if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
613          words->at(state_word).end, &state_names[state].string[pos]))
614        break;
615
616      pos += words->at(state_word).end - words->at(state_word).begin + 1;
617      if (pos >= state_names[state].length) {
618        state_match = true;
619        break;
620      }
621
622      // Ran out of words, extract more from the tokenizer.
623      if (++state_word == words->size()) {
624        do {
625          if (!tokenizer->GetNext())
626            break;
627        } while (tokenizer->token_is_delim());
628        words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
629      }
630    }
631
632    if (state_match) {
633      *state_last_word = state_word;
634      *state_index = state_names[state].state_index;
635      return true;
636    }
637  }
638
639  return false;
640}
641
642bool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
643  size_t length = word.end - word.begin;
644  if (length != kZipDigits && length != kZipPlus4Digits + 1)
645    return false;
646
647  for (string16::const_iterator it = word.begin; it != word.end; ++it) {
648    size_t pos = it - word.begin;
649    if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
650      continue;
651    return false;
652  }
653  return IsZipValidForState(word, state_index);
654}
655
656bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
657{
658    enum USState {
659        AP = -4, // AP (military base in the Pacific)
660        AA = -3, // AA (military base inside the US)
661        AE = -2, // AE (military base outside the US)
662        XX = -1, // (not in use)
663        AK =  0, // AK Alaska
664        AL =  1, // AL Alabama
665        AR =  2, // AR Arkansas
666        AS =  3, // AS American Samoa
667        AZ =  4, // AZ Arizona
668        CA =  5, // CA California
669        CO =  6, // CO Colorado
670        CT =  7, // CT Connecticut
671        DC =  8, // DC District of Columbia
672        DE =  9, // DE Delaware
673        FL = 10, // FL Florida
674        FM = 11, // FM Federated States of Micronesia
675        GA = 12, // GA Georgia
676        GU = 13, // GU Guam
677        HI = 14, // HI Hawaii
678        IA = 15, // IA Iowa
679        ID = 16, // ID Idaho
680        IL = 17, // IL Illinois
681        IN = 18, // IN Indiana
682        KS = 19, // KS Kansas
683        KY = 20, // KY Kentucky
684        LA = 21, // LA Louisiana
685        MA = 22, // MA Massachusetts
686        MD = 23, // MD Maryland
687        ME = 24, // ME Maine
688        MH = 25, // MH Marshall Islands
689        MI = 26, // MI Michigan
690        MN = 27, // MN Minnesota
691        MO = 28, // MO Missouri
692        MP = 29, // MP Northern Mariana Islands
693        MS = 30, // MS Mississippi
694        MT = 31, // MT Montana
695        NC = 32, // NC North Carolina
696        ND = 33, // ND North Dakota
697        NE = 34, // NE Nebraska
698        NH = 35, // NH New Hampshire
699        NJ = 36, // NJ New Jersey
700        NM = 37, // NM New Mexico
701        NV = 38, // NV Nevada
702        NY = 39, // NY New York
703        OH = 40, // OH Ohio
704        OK = 41, // OK Oklahoma
705        OR = 42, // OR Oregon
706        PA = 43, // PA Pennsylvania
707        PR = 44, // PR Puerto Rico
708        PW = 45, // PW Palau
709        RI = 46, // RI Rhode Island
710        SC = 47, // SC South Carolina
711        SD = 48, // SD South Dakota
712        TN = 49, // TN Tennessee
713        TX = 50, // TX Texas
714        UT = 51, // UT Utah
715        VA = 52, // VA Virginia
716        VI = 53, // VI Virgin Islands
717        VT = 54, // VT Vermont
718        WA = 55, // WA Washington
719        WI = 56, // WI Wisconsin
720        WV = 57, // WV West Virginia
721        WY = 58, // WY Wyoming
722    };
723
724    static const USState stateForZipPrefix[] = {
725    //   0   1   2   3   4   5   6   7   8   9
726        XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
727        MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
728        MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
729        NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
730        ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
731        VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
732        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
733        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
734        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
735        AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
736        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
737        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
738        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
739        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
740        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
741        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
742        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
743        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
744        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
745        PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
746        DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
747        MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
748        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
749        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
750        VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
751        WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
752        WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
753        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
754        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
755        SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
756        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
757        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
758        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
759        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
760        AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
761        AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
762        AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
763        TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
764        TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
765        MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
766        KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
767        KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
768        KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
769        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
770        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
771        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
772        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
773        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
774        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
775        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
776        IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
777        IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
778        IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
779        WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
780        WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
781        MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
782        MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
783        SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
784        ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
785        MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
786        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
787        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
788        IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
789        MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
790        MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
791        MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
792        KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
793        KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
794        NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
795        NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
796        LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
797        LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
798        AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
799        OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
800        OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
801        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
802        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
803        TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
804        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
805        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
806        CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
807        CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
808        WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
809        WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
810        UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
811        AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
812        AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
813        NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
814        NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
815        NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
816        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
817        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
818        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
819        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
820        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
821        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
822        CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
823        OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
824        WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
825        WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
826    };
827
828    if (!word.begin || !word.end || (word.end - word.begin) < 3)
829        return false;
830    const char16* zipPtr = word.begin;
831    if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
832        zipPtr[1] < '0' || zipPtr[1] > '9' ||
833        zipPtr[2] < '0' || zipPtr[2] > '9')
834        return false;
835
836    int zip = zipPtr[0] - '0';
837    zip *= 10;
838    zip += zipPtr[1] - '0';
839    zip *= 10;
840    zip += zipPtr[2] - '0';
841    return stateForZipPrefix[zip] == (int) state_index;
842}
843
844static const char* s_rawStreetSuffixes[] = {
845    "allee", "alley", "ally", "aly",
846    "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu",
847    "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend",
848    "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd",
849    "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch",
850    "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks",
851    "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu",
852    "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
853    "center", "centers", "centr", "centre", "cir", "circ", "circl",
854    "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff",
855    "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common",
856    "cor", "corner", "corners", "cors", "course", "court", "courts", "cove",
857    "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres",
858    "crescent", "cresent", "crest", "crk", "crossing", "crossroad",
859    "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt",
860    "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn",
861    "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive",
862    "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests",
863    "exp", "expr", "express", "expressway", "expw", "expy", "ext",
864    "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls",
865    "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls",
866    "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
867    "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy",
868    "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy",
869    "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy",
870    "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden",
871    "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove",
872    "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors",
873    "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts",
874    "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw",
875    "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts",
876    "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands",
877    "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn",
878    "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key",
879    "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la",
880    "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
881    "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks",
882    "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop",
883    "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows",
884    "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls",
885    "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount",
886    "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn",
887    "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd",
888    "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways",
889    "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine",
890    "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain",
891    "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne",
892    "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie",
893    "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial",
894    "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd",
895    "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river",
896    "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds",
897    "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals",
898    "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway",
899    "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng",
900    "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ",
901    "square", "squares", "st", "sta", "station", "statn", "stn", "str",
902    "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream",
903    "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
904    "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace",
905    "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks",
906    "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks",
907    "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls",
908    "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass",
909    "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct",
910    "via", "viadct", "viaduct", "view", "views", "vill", "villag",
911    "village", "villages", "ville", "villg", "villiage", "vis", "vist",
912    "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta",
913    "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells",
914    "wl", "wls", "wy", "xing", "xrd",
915    0,
916};
917
918bool AddressDetector::IsValidLocationName(const Word& word) {
919    using namespace WTF;
920    static HashSet<String> streetNames;
921    if (!streetNames.size()) {
922        const char** suffixes = s_rawStreetSuffixes;
923        while (const char* suffix = *suffixes) {
924            int index = suffix[0] - 'a';
925            streetNames.add(suffix);
926            suffixes++;
927        }
928    }
929    char16 first_letter = base::ToLowerASCII(*word.begin);
930    if (first_letter > 'z' || first_letter < 'a')
931        return false;
932    int index = first_letter - 'a';
933    int length = std::distance(word.begin, word.end);
934    if (*word.end == '.')
935        length--;
936    String value(word.begin, length);
937    return streetNames.contains(value.lower());
938}
939