address_field.cc revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/autofill/core/browser/address_field.h"
6
7#include <stddef.h>
8
9#include "base/logging.h"
10#include "base/memory/scoped_ptr.h"
11#include "base/strings/string16.h"
12#include "base/strings/string_util.h"
13#include "base/strings/utf_string_conversions.h"
14#include "components/autofill/core/browser/autofill_field.h"
15#include "components/autofill/core/browser/autofill_regex_constants.h"
16#include "components/autofill/core/browser/autofill_scanner.h"
17#include "components/autofill/core/browser/field_types.h"
18#include "ui/base/l10n/l10n_util.h"
19
20using base::UTF8ToUTF16;
21
22namespace autofill {
23
24FormField* AddressField::Parse(AutofillScanner* scanner) {
25  if (scanner->IsEnd())
26    return NULL;
27
28  scoped_ptr<AddressField> address_field(new AddressField);
29  const AutofillField* const initial_field = scanner->Cursor();
30  size_t saved_cursor = scanner->SaveCursor();
31
32  base::string16 attention_ignored = UTF8ToUTF16(autofill::kAttentionIgnoredRe);
33  base::string16 region_ignored = UTF8ToUTF16(autofill::kRegionIgnoredRe);
34
35  // Allow address fields to appear in any order.
36  size_t begin_trailing_non_labeled_fields = 0;
37  bool has_trailing_non_labeled_fields = false;
38  while (!scanner->IsEnd()) {
39    const size_t cursor = scanner->SaveCursor();
40    if (address_field->ParseAddressLines(scanner) ||
41        address_field->ParseCity(scanner) ||
42        address_field->ParseState(scanner) ||
43        address_field->ParseZipCode(scanner) ||
44        address_field->ParseCountry(scanner) ||
45        address_field->ParseCompany(scanner)) {
46      has_trailing_non_labeled_fields = false;
47      continue;
48    } else if (ParseField(scanner, attention_ignored, NULL) ||
49               ParseField(scanner, region_ignored, NULL)) {
50      // We ignore the following:
51      // * Attention.
52      // * Province/Region/Other.
53      continue;
54    } else if (scanner->Cursor() != initial_field &&
55               ParseEmptyLabel(scanner, NULL)) {
56      // Ignore non-labeled fields within an address; the page
57      // MapQuest Driving Directions North America.html contains such a field.
58      // We only ignore such fields after we've parsed at least one other field;
59      // otherwise we'd effectively parse address fields before other field
60      // types after any non-labeled fields, and we want email address fields to
61      // have precedence since some pages contain fields labeled
62      // "Email address".
63      if (!has_trailing_non_labeled_fields) {
64        has_trailing_non_labeled_fields = true;
65        begin_trailing_non_labeled_fields = cursor;
66      }
67
68      continue;
69    } else {
70      // No field found.
71      break;
72    }
73  }
74
75  // If we have identified any address fields in this field then it should be
76  // added to the list of fields.
77  if (address_field->company_ ||
78      address_field->address1_ ||
79      address_field->address2_ ||
80      address_field->street_address_ ||
81      address_field->city_ ||
82      address_field->state_ ||
83      address_field->zip_ ||
84      address_field->zip4_ ||
85      address_field->country_) {
86    // Don't slurp non-labeled fields at the end into the address.
87    if (has_trailing_non_labeled_fields)
88      scanner->RewindTo(begin_trailing_non_labeled_fields);
89
90    return address_field.release();
91  }
92
93  scanner->RewindTo(saved_cursor);
94  return NULL;
95}
96
97AddressField::AddressField()
98    : company_(NULL),
99      address1_(NULL),
100      address2_(NULL),
101      street_address_(NULL),
102      city_(NULL),
103      state_(NULL),
104      zip_(NULL),
105      zip4_(NULL),
106      country_(NULL) {
107}
108
109bool AddressField::ClassifyField(ServerFieldTypeMap* map) const {
110  // The page can request the address lines as a single textarea input or as
111  // multiple text fields (or not at all), but it shouldn't be possible to
112  // request both.
113  DCHECK(!(address1_ && street_address_));
114  DCHECK(!(address2_ && street_address_));
115
116  return AddClassification(company_, COMPANY_NAME, map) &&
117         AddClassification(address1_, ADDRESS_HOME_LINE1, map) &&
118         AddClassification(address2_, ADDRESS_HOME_LINE2, map) &&
119         AddClassification(street_address_, ADDRESS_HOME_STREET_ADDRESS, map) &&
120         AddClassification(city_, ADDRESS_HOME_CITY, map) &&
121         AddClassification(state_, ADDRESS_HOME_STATE, map) &&
122         AddClassification(zip_, ADDRESS_HOME_ZIP, map) &&
123         AddClassification(country_, ADDRESS_HOME_COUNTRY, map);
124}
125
126bool AddressField::ParseCompany(AutofillScanner* scanner) {
127  if (company_ && !company_->IsEmpty())
128    return false;
129
130  return ParseField(scanner, UTF8ToUTF16(autofill::kCompanyRe), &company_);
131}
132
133bool AddressField::ParseAddressLines(AutofillScanner* scanner) {
134  // We only match the string "address" in page text, not in element names,
135  // because sometimes every element in a group of address fields will have
136  // a name containing the string "address"; for example, on the page
137  // Kohl's - Register Billing Address.html the text element labeled "city"
138  // has the name "BILL_TO_ADDRESS<>city".  We do match address labels
139  // such as "address1", which appear as element names on various pages (eg
140  // AmericanGirl-Registration.html, BloomingdalesBilling.html,
141  // EBay Registration Enter Information.html).
142  if (address1_ || street_address_)
143    return false;
144
145  base::string16 pattern = UTF8ToUTF16(autofill::kAddressLine1Re);
146  base::string16 label_pattern = UTF8ToUTF16(autofill::kAddressLine1LabelRe);
147  if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, &address1_) &&
148      !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
149                           &address1_) &&
150      !ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_TEXT_AREA,
151                           &street_address_) &&
152      !ParseFieldSpecifics(scanner, label_pattern,
153                           MATCH_LABEL | MATCH_TEXT_AREA,
154                           &street_address_)) {
155    return false;
156  }
157
158  // Optionally parse more address lines, which may have empty labels.
159  pattern = UTF8ToUTF16(autofill::kAddressLine2Re);
160  label_pattern = UTF8ToUTF16(autofill::kAddressLine2LabelRe);
161  if (!street_address_ &&
162      !ParseEmptyLabel(scanner, &address2_) &&
163      !ParseField(scanner, pattern, &address2_)) {
164    ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT,
165                        &address2_);
166  }
167
168  // Try for surplus lines, which we will promptly discard.
169  // Some pages have 3 address lines (eg SharperImageModifyAccount.html)
170  // Some pages even have 4 address lines (e.g. uk/ShoesDirect2.html)!
171  if (address2_) {
172    pattern = UTF8ToUTF16(autofill::kAddressLinesExtraRe);
173    while (ParseField(scanner, pattern, NULL)) {
174      // Consumed a surplus line, try for another.
175    }
176  }
177
178  return true;
179}
180
181bool AddressField::ParseCountry(AutofillScanner* scanner) {
182  // Parse a country.  The occasional page (e.g.
183  // Travelocity_New Member Information1.html) calls this a "location".
184  if (country_ && !country_->IsEmpty())
185    return false;
186
187  return ParseFieldSpecifics(scanner,
188                             UTF8ToUTF16(autofill::kCountryRe),
189                             MATCH_DEFAULT | MATCH_SELECT,
190                             &country_);
191}
192
193bool AddressField::ParseZipCode(AutofillScanner* scanner) {
194  // Parse a zip code.  On some UK pages (e.g. The China Shop2.html) this
195  // is called a "post code".
196  if (zip_)
197    return false;
198
199  base::string16 pattern = UTF8ToUTF16(autofill::kZipCodeRe);
200  if (!ParseField(scanner, pattern, &zip_))
201    return false;
202
203  // Look for a zip+4, whose field name will also often contain
204  // the substring "zip".
205  ParseField(scanner, UTF8ToUTF16(autofill::kZip4Re), &zip4_);
206  return true;
207}
208
209bool AddressField::ParseCity(AutofillScanner* scanner) {
210  // Parse a city name.  Some UK pages (e.g. The China Shop2.html) use
211  // the term "town".
212  if (city_)
213    return false;
214
215  // Select fields are allowed here.  This occurs on top-100 site rediff.com.
216  return ParseFieldSpecifics(scanner,
217                             UTF8ToUTF16(autofill::kCityRe),
218                             MATCH_DEFAULT | MATCH_SELECT,
219                             &city_);
220}
221
222bool AddressField::ParseState(AutofillScanner* scanner) {
223  if (state_)
224    return false;
225
226  return ParseFieldSpecifics(scanner,
227                             UTF8ToUTF16(autofill::kStateRe),
228                             MATCH_DEFAULT | MATCH_SELECT,
229                             &state_);
230}
231
232}  // namespace autofill
233