1/*
2 * Copyright (C) 2012 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *    * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *    * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *    * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33// Magic pretend-to-be-a-chromium-build flags
34#undef WEBKIT_IMPLEMENTATION
35#undef LOG
36
37#include "content/address_detector.h"
38
39#include <bitset>
40
41#include "base/utf_string_conversions.h"
42#include "net/base/escape.h"
43#include "Settings.h"
44#include "WebString.h"
45
46#include <wtf/HashSet.h>
47#include <wtf/Noncopyable.h>
48#include <wtf/text/StringHash.h>
49#include <wtf/text/WTFString.h>
50
51namespace {
52
53// Prefix used for geographical address intent URIs.
54static const char kAddressSchemaPrefix[] = "geo:0,0?q=";
55
56// Maximum text length to be searched for address detection.
57static const size_t kMaxAddressLength = 500;
58
59// Minimum number of words in an address after the house number
60// before a state is expected to be found.
61// A value too high can miss short addresses.
62const size_t kMinAddressWords = 3;
63
64// Maximum number of words allowed in an address between the house number
65// and the state, both not included.
66const size_t kMaxAddressWords = 12;
67
68// Maximum number of lines allowed in an address between the house number
69// and the state, both not included.
70const size_t kMaxAddressLines = 5;
71
72// Maximum length allowed for any address word between the house number
73// and the state, both not included.
74const size_t kMaxAddressNameWordLength = 25;
75
76// Maximum number of words after the house number in which the location name
77// should be found.
78const size_t kMaxLocationNameDistance = 4;
79
80// Number of digits for a valid zip code.
81const size_t kZipDigits = 5;
82
83// Number of digits for a valid zip code in the Zip Plus 4 format.
84const size_t kZipPlus4Digits = 9;
85
86// Maximum number of digits of a house number, including possible hyphens.
87const size_t kMaxHouseDigits = 5;
88
89// Additional characters used as new line delimiters.
90const char16 kNewlineDelimiters[] = {
91  ',',
92  '*',
93  0x2022,  // Unicode bullet
94  0,
95};
96
97char16 SafePreviousChar(const string16::const_iterator& it,
98    const string16::const_iterator& begin) {
99  if (it == begin)
100    return ' ';
101  return *(it - 1);
102}
103
104char16 SafeNextChar(const string16::const_iterator& it,
105    const string16::const_iterator& end) {
106  if (it == end)
107    return ' ';
108  return *(it + 1);
109}
110
111bool WordLowerCaseEqualsASCII(string16::const_iterator word_begin,
112    string16::const_iterator word_end, const char* ascii_to_match) {
113  for (string16::const_iterator it = word_begin; it != word_end;
114      ++it, ++ascii_to_match) {
115    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
116      return false;
117  }
118  return *ascii_to_match == 0 || *ascii_to_match == ' ';
119}
120
121bool LowerCaseEqualsASCIIWithPlural(string16::const_iterator word_begin,
122    string16::const_iterator word_end, const char* ascii_to_match,
123    bool allow_plural) {
124  for (string16::const_iterator it = word_begin; it != word_end;
125      ++it, ++ascii_to_match) {
126    if (!*ascii_to_match && allow_plural && *it == 's' && it + 1 == word_end)
127      return true;
128
129    if (!*ascii_to_match || base::ToLowerASCII(*it) != *ascii_to_match)
130      return false;
131  }
132  return *ascii_to_match == 0;
133}
134
135}  // anonymous namespace
136
137
138AddressDetector::AddressDetector() {
139}
140
141AddressDetector::~AddressDetector() {
142}
143
144std::string AddressDetector::GetContentText(const WebKit::WebRange& range) {
145  // Get the address and replace unicode bullets with commas.
146  string16 address_16 = CollapseWhitespace(range.toPlainText(), false);
147  std::replace(address_16.begin(), address_16.end(),
148      static_cast<char16>(0x2022), static_cast<char16>(','));
149  return UTF16ToUTF8(address_16);
150}
151
152GURL AddressDetector::GetIntentURL(const std::string& content_text) {
153  return GURL(kAddressSchemaPrefix +
154      EscapeQueryParamValue(content_text, true));
155}
156
157size_t AddressDetector::GetMaximumContentLength() {
158  return kMaxAddressLength;
159}
160
161bool AddressDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test) {
162  WebCore::Settings* settings = GetSettings(hit_test);
163  return settings && settings->formatDetectionAddress();
164}
165
166bool AddressDetector::FindContent(const string16::const_iterator& begin,
167    const string16::const_iterator& end, size_t* start_pos, size_t* end_pos) {
168  HouseNumberParser house_number_parser;
169
170  // Keep going through the input string until a potential house number is
171  // detected. Start tokenizing the following words to find a valid
172  // street name within a word range. Then, find a state name followed
173  // by a valid zip code for that state. Also keep a look for any other
174  // possible house numbers to continue from in case of no match and for
175  // state names not followed by a zip code (e.g. New York, NY 10000).
176  const string16 newline_delimiters = kNewlineDelimiters;
177  const string16 delimiters = kWhitespaceUTF16 + newline_delimiters;
178  for (string16::const_iterator it = begin; it != end; ) {
179    Word house_number;
180    if (!house_number_parser.Parse(it, end, &house_number))
181      return false;
182
183    String16Tokenizer tokenizer(house_number.end, end, delimiters);
184    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
185
186    std::vector<Word> words;
187    words.push_back(house_number);
188
189    bool found_location_name = false;
190    bool continue_on_house_number = true;
191    size_t next_house_number_word = 0;
192    size_t num_lines = 1;
193
194    // Don't include the house number in the word count.
195    size_t next_word = 1;
196    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
197
198      // Extract a new word from the tokenizer.
199      if (next_word == words.size()) {
200        do {
201          if (!tokenizer.GetNext())
202            return false;
203
204          // Check the number of address lines.
205          if (tokenizer.token_is_delim() && newline_delimiters.find(
206              *tokenizer.token_begin()) != string16::npos) {
207            ++num_lines;
208          }
209        } while (tokenizer.token_is_delim());
210
211        if (num_lines > kMaxAddressLines)
212          break;
213
214        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
215      }
216
217      // Check the word length. If too long, don't try to continue from
218      // the next house number as no address can hold this word.
219      const Word& current_word = words[next_word];
220      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
221      size_t current_word_length = std::distance(
222          current_word.begin, current_word.end);
223      if (current_word_length > kMaxAddressNameWordLength) {
224        continue_on_house_number = false;
225        break;
226      }
227
228      // Check if the new word is a valid house number.
229      // This is used to properly resume parsing in case the maximum number
230      // of words is exceeded.
231      if (next_house_number_word == 0 &&
232          house_number_parser.Parse(current_word.begin, current_word.end, NULL)) {
233        next_house_number_word = next_word;
234        continue;
235      }
236
237      // Look for location names in the words after the house number.
238      // A range limitation is introduced to avoid matching
239      // anything that starts with a number before a legitimate address.
240      if (next_word <= kMaxLocationNameDistance &&
241          IsValidLocationName(current_word)) {
242        found_location_name = true;
243        continue;
244      }
245
246      // Don't count the house number.
247      if (next_word > kMinAddressWords) {
248        // Looking for the state is likely to add new words to the list while
249        // checking for multi-word state names.
250        size_t state_first_word = next_word;
251        size_t state_last_word, state_index;
252        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
253            &tokenizer, &state_index)) {
254
255          // A location name should have been found at this point.
256          if (!found_location_name)
257            break;
258
259          // Explicitly exclude "et al", as "al" is a valid state code.
260          if (current_word_length == 2 && words.size() > 2) {
261            const Word& previous_word = words[state_first_word - 1];
262            if (previous_word.end - previous_word.begin == 2 &&
263                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
264                    "et") &&
265                LowerCaseEqualsASCII(current_word.begin, current_word.end,
266                    "al"))
267              break;
268          }
269
270          // Extract one more word from the tokenizer if not already available.
271          size_t zip_word = state_last_word + 1;
272          if (zip_word == words.size()) {
273            do {
274              if (!tokenizer.GetNext()) {
275                // Zip is optional
276                *start_pos = words[0].begin - begin;
277                *end_pos = words[state_last_word].end - begin;
278                return true;
279              }
280            } while (tokenizer.token_is_delim());
281            words.push_back(Word(tokenizer.token_begin(),
282                tokenizer.token_end()));
283          }
284
285          // Check the parsing validity and state range of the zip code.
286          next_word = state_last_word;
287          if (!IsZipValid(words[zip_word], state_index))
288            continue;
289
290          *start_pos = words[0].begin - begin;
291          *end_pos = words[zip_word].end - begin;
292          return true;
293        }
294      }
295    }
296
297    // Avoid skipping too many words because of a non-address number
298    // at the beginning of the contents to parse.
299    if (continue_on_house_number && next_house_number_word > 0) {
300      it = words[next_house_number_word].begin;
301    } else {
302      DCHECK(!words.empty());
303      next_word = std::min(next_word, words.size() - 1);
304      it = words[next_word].end;
305    }
306  }
307
308  return false;
309}
310
311bool AddressDetector::HouseNumberParser::IsPreDelimiter(
312    char16 character) {
313  return character == ':' || IsPostDelimiter(character);
314}
315
316bool AddressDetector::HouseNumberParser::IsPostDelimiter(
317    char16 character) {
318  return IsWhitespace(character) || strchr(",\"'", character);
319}
320
321void AddressDetector::HouseNumberParser::RestartOnNextDelimiter() {
322  ResetState();
323  for (; it_ != end_ && !IsPreDelimiter(*it_); ++it_) {}
324}
325
326void AddressDetector::HouseNumberParser::AcceptChars(size_t num_chars) {
327  size_t offset = std::min(static_cast<size_t>(std::distance(it_, end_)),
328      num_chars);
329  it_ += offset;
330  result_chars_ += offset;
331}
332
333void AddressDetector::HouseNumberParser::SkipChars(size_t num_chars) {
334  it_ += std::min(static_cast<size_t>(std::distance(it_, end_)), num_chars);
335}
336
337void AddressDetector::HouseNumberParser::ResetState() {
338  num_digits_ = 0;
339  result_chars_ = 0;
340}
341
342bool AddressDetector::HouseNumberParser::CheckFinished(Word* word) const {
343  // There should always be a number after a hyphen.
344  if (result_chars_ == 0 || SafePreviousChar(it_, begin_) == '-')
345    return false;
346
347  if (word) {
348    word->begin = it_ - result_chars_;
349    word->end = it_;
350  }
351  return true;
352}
353
354bool AddressDetector::HouseNumberParser::Parse(
355    const string16::const_iterator& begin,
356    const string16::const_iterator& end, Word* word) {
357  it_ = begin_ = begin;
358  end_ = end;
359  ResetState();
360
361  // Iterations only used as a fail-safe against any buggy infinite loops.
362  size_t iterations = 0;
363  size_t max_iterations = end - begin + 1;
364  for (; it_ != end_ && iterations < max_iterations; ++iterations) {
365
366    // Word finished case.
367    if (IsPostDelimiter(*it_)) {
368      if (CheckFinished(word))
369        return true;
370      else if (result_chars_)
371        ResetState();
372
373      SkipChars(1);
374      continue;
375    }
376
377    // More digits. There should be no more after a letter was found.
378    if (IsAsciiDigit(*it_)) {
379      if (num_digits_ >= kMaxHouseDigits) {
380        RestartOnNextDelimiter();
381      } else {
382        AcceptChars(1);
383        ++num_digits_;
384      }
385      continue;
386    }
387
388    if (IsAsciiAlpha(*it_)) {
389      // Handle special case 'one'.
390      if (result_chars_ == 0) {
391        if (it_ + 3 <= end_ && LowerCaseEqualsASCII(it_, it_ + 3, "one"))
392          AcceptChars(3);
393        else
394          RestartOnNextDelimiter();
395        continue;
396      }
397
398      // There should be more than 1 character because of result_chars.
399      DCHECK_GT(result_chars_, 0U);
400      DCHECK_NE(it_, begin_);
401      char16 previous = SafePreviousChar(it_, begin_);
402      if (IsAsciiDigit(previous)) {
403        // Check cases like '12A'.
404        char16 next = SafeNextChar(it_, end_);
405        if (IsPostDelimiter(next)) {
406          AcceptChars(1);
407          continue;
408        }
409
410        // Handle cases like 12a, 1st, 2nd, 3rd, 7th.
411        if (IsAsciiAlpha(next)) {
412          char16 last_digit = previous;
413          char16 first_letter = base::ToLowerASCII(*it_);
414          char16 second_letter = base::ToLowerASCII(next);
415          bool is_teen = SafePreviousChar(it_ - 1, begin_) == '1' &&
416              num_digits_ == 2;
417
418          switch (last_digit - '0') {
419          case 1:
420            if ((first_letter == 's' && second_letter == 't') ||
421                (first_letter == 't' && second_letter == 'h' && is_teen)) {
422              AcceptChars(2);
423              continue;
424            }
425            break;
426
427          case 2:
428            if ((first_letter == 'n' && second_letter == 'd') ||
429                (first_letter == 't' && second_letter == 'h' && is_teen)) {
430              AcceptChars(2);
431              continue;
432            }
433            break;
434
435          case 3:
436            if ((first_letter == 'r' && second_letter == 'd') ||
437                (first_letter == 't' && second_letter == 'h' && is_teen)) {
438              AcceptChars(2);
439              continue;
440            }
441            break;
442
443          case 0:
444            // Explicitly exclude '0th'.
445            if (num_digits_ == 1)
446              break;
447
448          case 4:
449          case 5:
450          case 6:
451          case 7:
452          case 8:
453          case 9:
454            if (first_letter == 't' && second_letter == 'h') {
455              AcceptChars(2);
456              continue;
457            }
458            break;
459
460          default:
461            NOTREACHED();
462          }
463        }
464      }
465
466      RestartOnNextDelimiter();
467      continue;
468    }
469
470    if (*it_ == '-' && num_digits_ > 0) {
471      AcceptChars(1);
472      ++num_digits_;
473      continue;
474    }
475
476    RestartOnNextDelimiter();
477    SkipChars(1);
478  }
479
480  if (iterations >= max_iterations)
481    return false;
482
483  return CheckFinished(word);
484}
485
486bool AddressDetector::FindStateStartingInWord(WordList* words,
487    size_t state_first_word, size_t* state_last_word,
488    String16Tokenizer* tokenizer, size_t* state_index) {
489
490  // Bitmasks containing the allowed suffixes for 2-letter state codes.
491  static const int state_two_letter_suffix[23] = {
492    0x02060c00,  // A followed by: [KLRSZ].
493    0x00000000,  // B.
494    0x00084001,  // C followed by: [AOT].
495    0x00000014,  // D followed by: [CE].
496    0x00000000,  // E.
497    0x00001800,  // F followed by: [LM].
498    0x00100001,  // G followed by: [AU].
499    0x00000100,  // H followed by: [I].
500    0x00002809,  // I followed by: [ADLN].
501    0x00000000,  // J.
502    0x01040000,  // K followed by: [SY].
503    0x00000001,  // L followed by: [A].
504    0x000ce199,  // M followed by: [ADEHINOPST].
505    0x0120129c,  // N followed by: [CDEHJMVY].
506    0x00020480,  // O followed by: [HKR].
507    0x00420001,  // P followed by: [ARW].
508    0x00000000,  // Q.
509    0x00000100,  // R followed by: [I].
510    0x0000000c,  // S followed by: [CD].
511    0x00802000,  // T followed by: [NX].
512    0x00080000,  // U followed by: [T].
513    0x00080101,  // V followed by: [AIT].
514    0x01200101   // W followed by: [AIVY].
515  };
516
517  // Accumulative number of states for the 2-letter code indexed by the first.
518  static const int state_two_letter_accumulative[24] = {
519     0,  5,  5,  8, 10, 10, 12, 14,
520    15, 19, 19, 21, 22, 32, 40, 43,
521    46, 46, 47, 49, 51, 52, 55, 59
522  };
523
524  // State names sorted alphabetically with their lengths.
525  // There can be more than one possible name for a same state if desired.
526  static const struct StateNameInfo {
527    const char* string;
528    char first_word_length;
529    char length;
530    char state_index; // Relative to two-character code alphabetical order.
531  } state_names[59] = {
532    { "alabama", 7, 7, 1 }, { "alaska", 6, 6, 0 },
533    { "american samoa", 8, 14, 3 }, { "arizona", 7, 7, 4 },
534    { "arkansas", 8, 8, 2 },
535    { "california", 10, 10, 5 }, { "colorado", 8, 8, 6 },
536    { "connecticut", 11, 11, 7 }, { "delaware", 8, 8, 9 },
537    { "district of columbia", 8, 20, 8 },
538    { "federated states of micronesia", 9, 30, 11 }, { "florida", 7, 7, 10 },
539    { "guam", 4, 4, 13 }, { "georgia", 7, 7, 12 },
540    { "hawaii", 6, 6, 14 },
541    { "idaho", 5, 5, 16 }, { "illinois", 8, 8, 17 }, { "indiana", 7, 7, 18 },
542    { "iowa", 4, 4, 15 },
543    { "kansas", 6, 6, 19 }, { "kentucky", 8, 8, 20 },
544    { "louisiana", 9, 9, 21 },
545    { "maine", 5, 5, 24 }, { "marshall islands", 8, 16, 25 },
546    { "maryland", 8, 8, 23 }, { "massachusetts", 13, 13, 22 },
547    { "michigan", 8, 8, 26 }, { "minnesota", 9, 9, 27 },
548    { "mississippi", 11, 11, 30 }, { "missouri", 8, 8, 28 },
549    { "montana", 7, 7, 31 },
550    { "nebraska", 8, 8, 34 }, { "nevada", 6, 6, 38 },
551    { "new hampshire", 3, 13, 35 }, { "new jersey", 3, 10, 36 },
552    { "new mexico", 3, 10, 37 }, { "new york", 3, 8, 39 },
553    { "north carolina", 5, 14, 32 }, { "north dakota", 5, 12, 33 },
554    { "northern mariana islands", 8, 24, 29 },
555    { "ohio", 4, 4, 40 }, { "oklahoma", 8, 8, 41 }, { "oregon", 6, 6, 42 },
556    { "palau", 5, 5, 45 }, { "pennsylvania", 12, 12, 43 },
557    { "puerto rico", 6, 11, 44 },
558    { "rhode island", 5, 5, 46 },
559    { "south carolina", 5, 14, 47 }, { "south dakota", 5, 12, 48 },
560    { "tennessee", 9, 9, 49 }, { "texas", 5, 5, 50 },
561    { "utah", 4, 4, 51 },
562    { "vermont", 7, 7, 54 }, { "virgin islands", 6, 14, 53 },
563    { "virginia", 8, 8, 52 },
564    { "washington", 10, 10, 55 }, { "west virginia", 4, 13, 57 },
565    { "wisconsin", 9, 9, 56 }, { "wyoming", 7, 7, 58 }
566  };
567
568  // Accumulative number of states for sorted names indexed by the first letter.
569  // Required a different one since there are codes that don't share their
570  // first letter with the name of their state (MP = Northern Mariana Islands).
571  static const int state_names_accumulative[24] = {
572     0,  5,  5,  8, 10, 10, 12, 14,
573    15, 19, 19, 21, 22, 31, 40, 43,
574    46, 46, 47, 49, 51, 52, 55, 59
575  };
576
577  DCHECK_EQ(state_names_accumulative[arraysize(state_names_accumulative) - 1],
578      static_cast<int>(ARRAYSIZE_UNSAFE(state_names)));
579
580  const Word& first_word = words->at(state_first_word);
581  int length = first_word.end - first_word.begin;
582  if (length < 2 || !IsAsciiAlpha(*first_word.begin))
583    return false;
584
585  // No state names start with x, y, z.
586  char16 first_letter = base::ToLowerASCII(*first_word.begin);
587  if (first_letter > 'w')
588    return false;
589
590  DCHECK(first_letter >= 'a');
591  int first_index = first_letter - 'a';
592
593  // Look for two-letter state names.
594  if (length == 2 && IsAsciiAlpha(*(first_word.begin + 1))) {
595    char16 second_letter = base::ToLowerASCII(*(first_word.begin + 1));
596    DCHECK(second_letter >= 'a');
597
598    int second_index = second_letter - 'a';
599    if (!(state_two_letter_suffix[first_index] & (1 << second_index)))
600      return false;
601
602    std::bitset<32> previous_suffixes = state_two_letter_suffix[first_index] &
603        ((1 << second_index) - 1);
604    *state_last_word = state_first_word;
605    *state_index = state_two_letter_accumulative[first_index] +
606        previous_suffixes.count();
607    return true;
608  }
609
610  // Look for full state names by their first letter. Discard by length.
611  for (int state = state_names_accumulative[first_index];
612      state < state_names_accumulative[first_index + 1]; ++state) {
613    if (state_names[state].first_word_length != length)
614      continue;
615
616    bool state_match = false;
617    size_t state_word = state_first_word;
618    for (int pos = 0; true; ) {
619      if (!WordLowerCaseEqualsASCII(words->at(state_word).begin,
620          words->at(state_word).end, &state_names[state].string[pos]))
621        break;
622
623      pos += words->at(state_word).end - words->at(state_word).begin + 1;
624      if (pos >= state_names[state].length) {
625        state_match = true;
626        break;
627      }
628
629      // Ran out of words, extract more from the tokenizer.
630      if (++state_word == words->size()) {
631        do {
632          if (!tokenizer->GetNext())
633            break;
634        } while (tokenizer->token_is_delim());
635        words->push_back(Word(tokenizer->token_begin(), tokenizer->token_end()));
636      }
637    }
638
639    if (state_match) {
640      *state_last_word = state_word;
641      *state_index = state_names[state].state_index;
642      return true;
643    }
644  }
645
646  return false;
647}
648
649bool AddressDetector::IsZipValid(const Word& word, size_t state_index) {
650  size_t length = word.end - word.begin;
651  if (length != kZipDigits && length != kZipPlus4Digits + 1)
652    return false;
653
654  for (string16::const_iterator it = word.begin; it != word.end; ++it) {
655    size_t pos = it - word.begin;
656    if (IsAsciiDigit(*it) || (*it == '-' && pos == kZipDigits))
657      continue;
658    return false;
659  }
660  return IsZipValidForState(word, state_index);
661}
662
663bool AddressDetector::IsZipValidForState(const Word& word, size_t state_index)
664{
665    enum USState {
666        AP = -4, // AP (military base in the Pacific)
667        AA = -3, // AA (military base inside the US)
668        AE = -2, // AE (military base outside the US)
669        XX = -1, // (not in use)
670        AK =  0, // AK Alaska
671        AL =  1, // AL Alabama
672        AR =  2, // AR Arkansas
673        AS =  3, // AS American Samoa
674        AZ =  4, // AZ Arizona
675        CA =  5, // CA California
676        CO =  6, // CO Colorado
677        CT =  7, // CT Connecticut
678        DC =  8, // DC District of Columbia
679        DE =  9, // DE Delaware
680        FL = 10, // FL Florida
681        FM = 11, // FM Federated States of Micronesia
682        GA = 12, // GA Georgia
683        GU = 13, // GU Guam
684        HI = 14, // HI Hawaii
685        IA = 15, // IA Iowa
686        ID = 16, // ID Idaho
687        IL = 17, // IL Illinois
688        IN = 18, // IN Indiana
689        KS = 19, // KS Kansas
690        KY = 20, // KY Kentucky
691        LA = 21, // LA Louisiana
692        MA = 22, // MA Massachusetts
693        MD = 23, // MD Maryland
694        ME = 24, // ME Maine
695        MH = 25, // MH Marshall Islands
696        MI = 26, // MI Michigan
697        MN = 27, // MN Minnesota
698        MO = 28, // MO Missouri
699        MP = 29, // MP Northern Mariana Islands
700        MS = 30, // MS Mississippi
701        MT = 31, // MT Montana
702        NC = 32, // NC North Carolina
703        ND = 33, // ND North Dakota
704        NE = 34, // NE Nebraska
705        NH = 35, // NH New Hampshire
706        NJ = 36, // NJ New Jersey
707        NM = 37, // NM New Mexico
708        NV = 38, // NV Nevada
709        NY = 39, // NY New York
710        OH = 40, // OH Ohio
711        OK = 41, // OK Oklahoma
712        OR = 42, // OR Oregon
713        PA = 43, // PA Pennsylvania
714        PR = 44, // PR Puerto Rico
715        PW = 45, // PW Palau
716        RI = 46, // RI Rhode Island
717        SC = 47, // SC South Carolina
718        SD = 48, // SD South Dakota
719        TN = 49, // TN Tennessee
720        TX = 50, // TX Texas
721        UT = 51, // UT Utah
722        VA = 52, // VA Virginia
723        VI = 53, // VI Virgin Islands
724        VT = 54, // VT Vermont
725        WA = 55, // WA Washington
726        WI = 56, // WI Wisconsin
727        WV = 57, // WV West Virginia
728        WY = 58, // WY Wyoming
729    };
730
731    static const USState stateForZipPrefix[] = {
732    //   0   1   2   3   4   5   6   7   8   9
733        XX, XX, XX, XX, XX, NY, PR, PR, VI, PR, // 000-009
734        MA, MA, MA, MA, MA, MA, MA, MA, MA, MA, // 010-019
735        MA, MA, MA, MA, MA, MA, MA, MA, RI, RI, // 020-029
736        NH, NH, NH, NH, NH, NH, NH, NH, NH, ME, // 030-039
737        ME, ME, ME, ME, ME, ME, ME, ME, ME, ME, // 040-049
738        VT, VT, VT, VT, VT, MA, VT, VT, VT, VT, // 050-059
739        CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 060-069
740        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 070-079
741        NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, NJ, // 080-089
742        AE, AE, AE, AE, AE, AE, AE, AE, AE, XX, // 090-099
743        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 100-109
744        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 110-119
745        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 120-129
746        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 130-139
747        NY, NY, NY, NY, NY, NY, NY, NY, NY, NY, // 140-149
748        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 150-159
749        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 160-169
750        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 170-179
751        PA, PA, PA, PA, PA, PA, PA, PA, PA, PA, // 180-189
752        PA, PA, PA, PA, PA, PA, PA, DE, DE, DE, // 190-199
753        DC, VA, DC, DC, DC, DC, MD, MD, MD, MD, // 200-209
754        MD, MD, MD, XX, MD, MD, MD, MD, MD, MD, // 210-219
755        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 220-229
756        VA, VA, VA, VA, VA, VA, VA, VA, VA, VA, // 230-239
757        VA, VA, VA, VA, VA, VA, VA, WV, WV, WV, // 240-249
758        WV, WV, WV, WV, WV, WV, WV, WV, WV, WV, // 250-259
759        WV, WV, WV, WV, WV, WV, WV, WV, WV, XX, // 260-269
760        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 270-279
761        NC, NC, NC, NC, NC, NC, NC, NC, NC, NC, // 280-289
762        SC, SC, SC, SC, SC, SC, SC, SC, SC, SC, // 290-299
763        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 300-309
764        GA, GA, GA, GA, GA, GA, GA, GA, GA, GA, // 310-319
765        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 320-329
766        FL, FL, FL, FL, FL, FL, FL, FL, FL, FL, // 330-339
767        AA, FL, FL, XX, FL, XX, FL, FL, XX, FL, // 340-349
768        AL, AL, AL, XX, AL, AL, AL, AL, AL, AL, // 350-359
769        AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 360-369
770        TN, TN, TN, TN, TN, TN, TN, TN, TN, TN, // 370-379
771        TN, TN, TN, TN, TN, TN, MS, MS, MS, MS, // 380-389
772        MS, MS, MS, MS, MS, MS, MS, MS, GA, GA, // 390-399
773        KY, KY, KY, KY, KY, KY, KY, KY, KY, KY, // 400-409
774        KY, KY, KY, KY, KY, KY, KY, KY, KY, XX, // 410-419
775        KY, KY, KY, KY, KY, KY, KY, KY, XX, XX, // 420-429
776        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 430-439
777        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 440-449
778        OH, OH, OH, OH, OH, OH, OH, OH, OH, OH, // 450-459
779        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 460-469
780        IN, IN, IN, IN, IN, IN, IN, IN, IN, IN, // 470-479
781        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 480-489
782        MI, MI, MI, MI, MI, MI, MI, MI, MI, MI, // 490-499
783        IA, IA, IA, IA, IA, IA, IA, IA, IA, IA, // 500-509
784        IA, IA, IA, IA, IA, IA, IA, XX, XX, XX, // 510-519
785        IA, IA, IA, IA, IA, IA, IA, IA, IA, XX, // 520-529
786        WI, WI, WI, XX, WI, WI, XX, WI, WI, WI, // 530-539
787        WI, WI, WI, WI, WI, WI, WI, WI, WI, WI, // 540-549
788        MN, MN, XX, MN, MN, MN, MN, MN, MN, MN, // 550-559
789        MN, MN, MN, MN, MN, MN, MN, MN, XX, DC, // 560-569
790        SD, SD, SD, SD, SD, SD, SD, SD, XX, XX, // 570-579
791        ND, ND, ND, ND, ND, ND, ND, ND, ND, XX, // 580-589
792        MT, MT, MT, MT, MT, MT, MT, MT, MT, MT, // 590-599
793        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 600-609
794        IL, IL, IL, IL, IL, IL, IL, IL, IL, IL, // 610-619
795        IL, XX, IL, IL, IL, IL, IL, IL, IL, IL, // 620-629
796        MO, MO, XX, MO, MO, MO, MO, MO, MO, MO, // 630-639
797        MO, MO, XX, XX, MO, MO, MO, MO, MO, MO, // 640-649
798        MO, MO, MO, MO, MO, MO, MO, MO, MO, XX, // 650-659
799        KS, KS, KS, XX, KS, KS, KS, KS, KS, KS, // 660-669
800        KS, KS, KS, KS, KS, KS, KS, KS, KS, KS, // 670-679
801        NE, NE, XX, NE, NE, NE, NE, NE, NE, NE, // 680-689
802        NE, NE, NE, NE, XX, XX, XX, XX, XX, XX, // 690-699
803        LA, LA, XX, LA, LA, LA, LA, LA, LA, XX, // 700-709
804        LA, LA, LA, LA, LA, XX, AR, AR, AR, AR, // 710-719
805        AR, AR, AR, AR, AR, AR, AR, AR, AR, AR, // 720-729
806        OK, OK, XX, TX, OK, OK, OK, OK, OK, OK, // 730-739
807        OK, OK, XX, OK, OK, OK, OK, OK, OK, OK, // 740-749
808        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 750-759
809        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 760-769
810        TX, XX, TX, TX, TX, TX, TX, TX, TX, TX, // 770-779
811        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 780-789
812        TX, TX, TX, TX, TX, TX, TX, TX, TX, TX, // 790-799
813        CO, CO, CO, CO, CO, CO, CO, CO, CO, CO, // 800-809
814        CO, CO, CO, CO, CO, CO, CO, XX, XX, XX, // 810-819
815        WY, WY, WY, WY, WY, WY, WY, WY, WY, WY, // 820-829
816        WY, WY, ID, ID, ID, ID, ID, ID, ID, XX, // 830-839
817        UT, UT, UT, UT, UT, UT, UT, UT, XX, XX, // 840-849
818        AZ, AZ, AZ, AZ, XX, AZ, AZ, AZ, XX, AZ, // 850-859
819        AZ, XX, XX, AZ, AZ, AZ, XX, XX, XX, XX, // 860-869
820        NM, NM, NM, NM, NM, NM, XX, NM, NM, NM, // 870-879
821        NM, NM, NM, NM, NM, TX, XX, XX, XX, NV, // 880-889
822        NV, NV, XX, NV, NV, NV, XX, NV, NV, XX, // 890-899
823        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 900-909
824        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 910-919
825        CA, CA, CA, CA, CA, CA, CA, CA, CA, XX, // 920-929
826        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 930-939
827        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 940-949
828        CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, // 950-959
829        CA, CA, AP, AP, AP, AP, AP, HI, HI, GU, // 960-969
830        OR, OR, OR, OR, OR, OR, OR, OR, OR, OR, // 970-979
831        WA, WA, WA, WA, WA, WA, WA, XX, WA, WA, // 980-989
832        WA, WA, WA, WA, WA, AK, AK, AK, AK, AK, // 990-999
833    };
834
835    if (!word.begin || !word.end || (word.end - word.begin) < 3)
836        return false;
837    const char16* zipPtr = word.begin;
838    if (zipPtr[0] < '0' || zipPtr[0] > '9' ||
839        zipPtr[1] < '0' || zipPtr[1] > '9' ||
840        zipPtr[2] < '0' || zipPtr[2] > '9')
841        return false;
842
843    int zip = zipPtr[0] - '0';
844    zip *= 10;
845    zip += zipPtr[1] - '0';
846    zip *= 10;
847    zip += zipPtr[2] - '0';
848    return stateForZipPrefix[zip] == (int) state_index;
849}
850
851static const char* s_rawStreetSuffixes[] = {
852    "allee", "alley", "ally", "aly",
853    "anex", "annex", "anx", "arc", "arcade", "av", "ave", "aven", "avenu",
854    "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend",
855    "bg", "bgs", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd",
856    "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch",
857    "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks",
858    "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu",
859    "camp", "canyn", "canyon", "cape", "causeway", "causway", "cen", "cent",
860    "center", "centers", "centr", "centre", "cir", "circ", "circl",
861    "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff",
862    "cliffs", "club", "cmn", "cmp", "cnter", "cntr", "cnyn", "common",
863    "cor", "corner", "corners", "cors", "course", "court", "courts", "cove",
864    "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres",
865    "crescent", "cresent", "crest", "crk", "crossing", "crossroad",
866    "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt",
867    "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn",
868    "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive",
869    "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests",
870    "exp", "expr", "express", "expressway", "expw", "expy", "ext",
871    "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls",
872    "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls",
873    "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge",
874    "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy",
875    "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy",
876    "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy",
877    "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden",
878    "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove",
879    "groves", "grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors",
880    "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts",
881    "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw",
882    "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts",
883    "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands",
884    "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn",
885    "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key",
886    "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la",
887    "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks",
888    "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks",
889    "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop",
890    "loops", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows",
891    "medows", "mews", "mill", "mills", "mission", "missn", "ml", "mls",
892    "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount",
893    "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn",
894    "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd",
895    "oval", "overpass", "ovl", "park", "parks", "parkway", "parkways",
896    "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine",
897    "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain",
898    "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne",
899    "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie",
900    "prk", "prr", "prt", "prts", "psge", "pt", "pts", "rad", "radial",
901    "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd",
902    "rdg", "rdge", "rdgs", "rds", "real", "rest", "ridge", "ridges", "riv", "river",
903    "rivr", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds",
904    "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals",
905    "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway",
906    "smt", "spg", "spgs", "spng", "spngs", "spring", "springs", "sprng",
907    "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ",
908    "square", "squares", "st", "sta", "station", "statn", "stn", "str",
909    "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream",
910    "street", "streets", "streme", "strm", "strt", "strvn", "strvnue",
911    "sts", "sumit", "sumitt", "summit", "ter", "terr", "terrace",
912    "throughway", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks",
913    "trafficway", "trail", "trails", "trak", "trce", "trfy", "trk", "trks",
914    "trl", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls",
915    "tunnel", "tunnels", "tunnl", "turnpike", "turnpk", "un", "underpass",
916    "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct",
917    "via", "viadct", "viaduct", "view", "views", "vill", "villag",
918    "village", "villages", "ville", "villg", "villiage", "vis", "vist",
919    "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta",
920    "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells",
921    "wl", "wls", "wy", "xing", "xrd",
922    0,
923};
924
925bool AddressDetector::IsValidLocationName(const Word& word) {
926    using namespace WTF;
927    static HashSet<String> streetNames;
928    if (!streetNames.size()) {
929        const char** suffixes = s_rawStreetSuffixes;
930        while (const char* suffix = *suffixes) {
931            int index = suffix[0] - 'a';
932            streetNames.add(suffix);
933            suffixes++;
934        }
935    }
936    char16 first_letter = base::ToLowerASCII(*word.begin);
937    if (first_letter > 'z' || first_letter < 'a')
938        return false;
939    int index = first_letter - 'a';
940    int length = std::distance(word.begin, word.end);
941    if (*word.end == '.')
942        length--;
943    String value(word.begin, length);
944    return streetNames.contains(value.lower());
945}
946