1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/snippet.h"
6
7#include <algorithm>
8
9#include "base/string_split.h"
10#include "base/string_util.h"
11#include "base/utf_string_conversions.h"
12#include "testing/gtest/include/gtest/gtest.h"
13
14namespace {
15
16// A sample document to compute snippets of.
17// The \x bits after the first "Google" are UTF-8 of U+2122 TRADE MARK SIGN,
18// and are useful for verifying we don't screw up in UTF-8/UTF-16 conversion.
19const char* kSampleDocument = "Google\xe2\x84\xa2 Terms of Service "
20"Welcome to Google! "
21"1. Your relationship with Google "
22"1.1 Your use of Google's products, software, services and web sites "
23"(referred to collectively as the \"Services\" in this document and excluding "
24"any services provided to you by Google under a separate written agreement) "
25"is subject to the terms of a legal agreement between you and Google. "
26"\"Google\" means Google Inc., whose principal place of business is at 1600 "
27"Amphitheatre Parkway, Mountain View, CA 94043, United States. This document "
28"explains how the agreement is made up, and sets out some of the terms of "
29"that agreement.";
30};
31
32// Thai sample taken from http://www.google.co.th/intl/th/privacy.html
33// TODO(jungshik) : Add more samples (e.g. Hindi) after porting
34// ICU 4.0's character iterator changes to our copy of ICU 3.8 to get
35// grapheme clusters in Indic scripts handled more reasonably.
36const char* kThaiSample = "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87"
37"\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
38"\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
39"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
40"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1"
41"\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
42"\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A"
43"\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
44"\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A"
45"\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82"
46"\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7"
47"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89"
48"\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
49"\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7"
50"\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1"
51"\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
52"\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7"
53"\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
54"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
55"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5"
56"\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
57"\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88"
58"\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80"
59"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A"
60"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
61"\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4"
62"\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
63"\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB"
64"\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
65"\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA"
66"\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88"
67"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9"
68"\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94"
69"\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3"
70"\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3"
71"\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94"
72"\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
73"\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87"
74"\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
75"\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89"
76"\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"
77"\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA"
78"\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84"
79"\xE0\xB8\xB8\xE0\xB8\x93";
80
81// Comparator for sorting by the first element in a pair.
82bool ComparePair1st(const Snippet::MatchPosition& a,
83                    const Snippet::MatchPosition& b) {
84  return a.first < b.first;
85}
86
87// For testing, we'll compute the match positions manually instead of using
88// sqlite's FTS matching.  BuildSnippet returns the snippet for matching
89// |query| against |document|.  Matches are surrounded by "**".
90string16 BuildSnippet(const std::string& document,
91                      const std::string& query) {
92  // This function assumes that |document| does not contain
93  // any character for which lowercasing changes its length. Further,
94  // it's assumed that lowercasing only the ASCII-portion works for
95  // |document|. We need to add more test cases and change this function
96  // to be more generic depending on how we deal with 'folding for match'
97  // in history.
98  const std::string document_folded = StringToLowerASCII(std::string(document));
99
100  std::vector<std::string> query_words;
101  base::SplitString(query, ' ', &query_words);
102
103  // Manually construct match_positions of the document.
104  Snippet::MatchPositions match_positions;
105  match_positions.clear();
106  for (std::vector<std::string>::iterator qw = query_words.begin();
107       qw != query_words.end(); ++qw) {
108    // Insert all instances of this word into match_pairs.
109    size_t ofs = 0;
110    while ((ofs = document_folded.find(*qw, ofs)) != std::string::npos) {
111      match_positions.push_back(std::make_pair(ofs, ofs + qw->size()));
112      ofs += qw->size();
113    }
114  }
115  // Sort match_positions in order of increasing offset.
116  std::sort(match_positions.begin(), match_positions.end(), ComparePair1st);
117
118  // Compute the snippet.
119  Snippet snippet;
120  snippet.ComputeSnippet(match_positions, document);
121
122  // Now "highlight" all matches in the snippet with **.
123  string16 star_snippet;
124  Snippet::MatchPositions::const_iterator match;
125  size_t pos = 0;
126  for (match = snippet.matches().begin();
127       match != snippet.matches().end(); ++match) {
128    star_snippet += snippet.text().substr(pos, match->first - pos);
129    star_snippet += UTF8ToUTF16("**");
130    star_snippet += snippet.text().substr(match->first,
131                                          match->second - match->first);
132    star_snippet += UTF8ToUTF16("**");
133    pos = match->second;
134  }
135  star_snippet += snippet.text().substr(pos);
136
137  return star_snippet;
138}
139
140TEST(Snippets, SimpleQuery) {
141  ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this "
142            "**document** and excluding any services provided to you by "
143            "Goo ...  ... way, Mountain View, CA 94043, United States. This "
144            "**document** explains how the agreement is made up, and sets "
145            "o ... ",
146            UTF16ToUTF8(BuildSnippet(kSampleDocument, "document")));
147}
148
149// Test that two words that are near each other don't produce two elided bits.
150TEST(Snippets, NearbyWords) {
151  ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, "
152            "**Mountain** **View**, CA 94043, United States. This "
153            "document explains  ... ",
154            UTF16ToUTF8(BuildSnippet(kSampleDocument, "mountain view")));
155}
156
157// The above tests already test that we get byte offsets correct, but here's
158// one that gets the "TM" in its snippet.
159TEST(Snippets, UTF8) {
160  ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! "
161            "1. Your **relationship** with Google 1.1 Your use of Google's "
162            "products, so ... ",
163            UTF16ToUTF8(BuildSnippet(kSampleDocument, "relationship")));
164}
165
166TEST(Snippets, ThaiUTF8) {
167  // There are 3 instances of '\u0E43\u0E2B\u0E49'
168  // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample.
169  // The 1st is more than |kSniipetContext| graphemes away from the
170  // 2nd while the 2nd and 3rd are within that window. However, with
171  // the 2nd match added, the snippet goes over the size limit so that
172  // the snippet ends right before the 3rd match.
173  ASSERT_EQ(" ... "
174            "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
175            "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8"
176            "\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8"
177            "\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8"
178            "\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0"
179            "\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0"
180            "\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0"
181            "\xB8\x9A\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0"
182            "\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0"
183            "\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0"
184            "\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0"
185            "\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8"
186            "\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0"
187            "\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0"
188            "\xB8\x94\xE0\xB8\xB1\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0"
189            "\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0"
190            "\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0"
191            "\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
192            "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3"
193            "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
194            "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88"
195            "\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
196            "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88"
197            "\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
198            "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
199            "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8"
200            "\xE0\xB8\x93\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2"
201            "\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A ...  ... \xE0\xB8\x82"
202            "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
203            "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3"
204            "\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD"
205            "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD"
206            "\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0"
207            "\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0"
208            "\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0"
209            "\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
210            "\xE0\xB9\x88\xE0\xB8\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9"
211            "\x89**\xE0\xB8\x9C\xE0\xB8\xB9\xE0\xB9\x89\xE0\xB9\x83\xE0"
212            "\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94\xE0\xB9\x89\xE0"
213            "\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3\xE0"
214            "\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0"
215            "\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0"
216            "\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0"
217            "\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
218            "\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87\xE0\xB8\x9B"
219            "\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
220            "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7"
221            "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2",
222            UTF16ToUTF8(BuildSnippet(kThaiSample,
223                                     "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89")));
224}
225
226TEST(Snippets, ExtractMatchPositions) {
227  struct TestData {
228    const std::string offsets_string;
229    const size_t expected_match_count;
230    const size_t expected_matches[10];
231  } data[] = {
232    { "0 0 1 2 0 0 4 1 0 0 1 5",            1,     { 1, 6 } },
233    { "0 0 1 4 0 0 2 1",                    1,     { 1, 5 } },
234    { "0 0 4 1 0 0 2 1",                    2,     { 2, 3, 4, 5 } },
235    { "0 0 0 1",                            1,     { 0, 1 } },
236    { "0 0 0 1 0 0 0 2",                    1,     { 0, 2 } },
237    { "0 0 1 1 0 0 1 2",                    1,     { 1, 3 } },
238    { "0 0 1 2 0 0 4 3 0 0 3 1",            1,     { 1, 7 } },
239    { "0 0 1 4 0 0 2 5",                    1,     { 1, 7 } },
240    { "0 0 1 2 0 0 1 1",                    1,     { 1, 3 } },
241    { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10",  2,     { 1, 2, 3, 13 } },
242  };
243  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(data); ++i) {
244    Snippet::MatchPositions matches;
245    Snippet::ExtractMatchPositions(data[i].offsets_string, "0", &matches);
246    EXPECT_EQ(data[i].expected_match_count, matches.size());
247    for (size_t j = 0; j < data[i].expected_match_count; ++j) {
248      EXPECT_EQ(data[i].expected_matches[2 * j], matches[j].first);
249      EXPECT_EQ(data[i].expected_matches[2 * j + 1], matches[j].second);
250    }
251  }
252}
253