1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "content/renderer/android/email_detector.h"
6
7#include "base/logging.h"
8#include "base/memory/scoped_ptr.h"
9#include "base/strings/utf_string_conversions.h"
10#include "content/public/renderer/android_content_detection_prefixes.h"
11#include "net/base/escape.h"
12#include "third_party/icu/source/i18n/unicode/regex.h"
13
namespace {

// Pattern used to recognize email addresses.
// It is deliberately narrower than RFC 2822: uncommon special characters are
// not accepted, which keeps false positives down. The pattern is delimited by
// word boundaries (\b) so that an address surrounded by punctuation, quote
// marks etc. still matches. Only upper-case letters are listed; the matcher
// in FindContent() is created with UREGEX_CASE_INSENSITIVE.
const char kEmailRegex[] = "\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,6}\\b";

// Maximum length of an email address; reported to callers through
// EmailDetector::GetMaximumContentLength().
const size_t kMaximumEmailLength = 254;

}  // anonymous namespace
27
28namespace content {
29
30EmailDetector::EmailDetector() {
31}
32
33size_t EmailDetector::GetMaximumContentLength() {
34  return kMaximumEmailLength;
35}
36
37GURL EmailDetector::GetIntentURL(const std::string& content_text) {
38  if (content_text.empty())
39    return GURL();
40
41  return GURL(kEmailPrefix +
42      net::EscapeQueryParamValue(content_text, true));
43}
44
45bool EmailDetector::FindContent(const base::string16::const_iterator& begin,
46                                const base::string16::const_iterator& end,
47                                size_t* start_pos,
48                                size_t* end_pos,
49                                std::string* content_text) {
50  base::string16 utf16_input = base::string16(begin, end);
51  icu::UnicodeString pattern(kEmailRegex);
52  icu::UnicodeString input(utf16_input.data(), utf16_input.length());
53  UErrorCode status = U_ZERO_ERROR;
54  scoped_ptr<icu::RegexMatcher> matcher(
55      new icu::RegexMatcher(pattern,
56                            input,
57                            UREGEX_CASE_INSENSITIVE,
58                            status));
59  if (matcher->find()) {
60    *start_pos = matcher->start(status);
61    DCHECK(U_SUCCESS(status));
62    *end_pos = matcher->end(status);
63    DCHECK(U_SUCCESS(status));
64    icu::UnicodeString content_ustr(matcher->group(status));
65    DCHECK(U_SUCCESS(status));
66    base::UTF16ToUTF8(content_ustr.getBuffer(), content_ustr.length(),
67        content_text);
68    return true;
69  }
70
71  return false;
72}
73
74}  // namespace content
75