1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/history/snippet.h" 6 7#include <algorithm> 8 9#include "base/string_split.h" 10#include "base/string_util.h" 11#include "base/utf_string_conversions.h" 12#include "testing/gtest/include/gtest/gtest.h" 13 14namespace { 15 16// A sample document to compute snippets of. 17// The \x bits after the first "Google" are UTF-8 of U+2122 TRADE MARK SIGN, 18// and are useful for verifying we don't screw up in UTF-8/UTF-16 conversion. 19const char* kSampleDocument = "Google\xe2\x84\xa2 Terms of Service " 20"Welcome to Google! " 21"1. Your relationship with Google " 22"1.1 Your use of Google's products, software, services and web sites " 23"(referred to collectively as the \"Services\" in this document and excluding " 24"any services provided to you by Google under a separate written agreement) " 25"is subject to the terms of a legal agreement between you and Google. " 26"\"Google\" means Google Inc., whose principal place of business is at 1600 " 27"Amphitheatre Parkway, Mountain View, CA 94043, United States. This document " 28"explains how the agreement is made up, and sets out some of the terms of " 29"that agreement."; 30}; 31 32// Thai sample taken from http://www.google.co.th/intl/th/privacy.html 33// TODO(jungshik) : Add more samples (e.g. Hindi) after porting 34// ICU 4.0's character iterator changes to our copy of ICU 3.8 to get 35// grapheme clusters in Indic scripts handled more reasonably. 36const char* kThaiSample = "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87" 37"\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7" 38"\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9" 39"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A" 40"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1" 41"\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93" 42"\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A" 43"\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7" 44"\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A" 45"\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82" 46"\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7" 47"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89" 48"\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1" 49"\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7" 50"\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1" 51"\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3" 52"\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7" 53"\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9" 54"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A" 55"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5" 56"\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3" 57"\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88" 58"\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80" 59"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A" 60"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5" 61"\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4" 62"\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88" 63"\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB" 64"\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84" 65"\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA" 66"\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88" 67"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9" 68"\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94" 69"\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3" 70"\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3" 71"\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94" 72"\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3" 73"\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87" 74"\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95" 75"\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89" 76"\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89" 77"\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA" 78"\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84" 79"\xE0\xB8\xB8\xE0\xB8\x93"; 80 81// Comparator for sorting by the first element in a pair. 82bool ComparePair1st(const Snippet::MatchPosition& a, 83 const Snippet::MatchPosition& b) { 84 return a.first < b.first; 85} 86 87// For testing, we'll compute the match positions manually instead of using 88// sqlite's FTS matching. BuildSnippet returns the snippet for matching 89// |query| against |document|. Matches are surrounded by "**". 90string16 BuildSnippet(const std::string& document, 91 const std::string& query) { 92 // This function assumes that |document| does not contain 93 // any character for which lowercasing changes its length. Further, 94 // it's assumed that lowercasing only the ASCII-portion works for 95 // |document|. We need to add more test cases and change this function 96 // to be more generic depending on how we deal with 'folding for match' 97 // in history. 98 const std::string document_folded = StringToLowerASCII(std::string(document)); 99 100 std::vector<std::string> query_words; 101 base::SplitString(query, ' ', &query_words); 102 103 // Manually construct match_positions of the document. 104 Snippet::MatchPositions match_positions; 105 match_positions.clear(); 106 for (std::vector<std::string>::iterator qw = query_words.begin(); 107 qw != query_words.end(); ++qw) { 108 // Insert all instances of this word into match_pairs. 109 size_t ofs = 0; 110 while ((ofs = document_folded.find(*qw, ofs)) != std::string::npos) { 111 match_positions.push_back(std::make_pair(ofs, ofs + qw->size())); 112 ofs += qw->size(); 113 } 114 } 115 // Sort match_positions in order of increasing offset. 116 std::sort(match_positions.begin(), match_positions.end(), ComparePair1st); 117 118 // Compute the snippet. 119 Snippet snippet; 120 snippet.ComputeSnippet(match_positions, document); 121 122 // Now "highlight" all matches in the snippet with **. 123 string16 star_snippet; 124 Snippet::MatchPositions::const_iterator match; 125 size_t pos = 0; 126 for (match = snippet.matches().begin(); 127 match != snippet.matches().end(); ++match) { 128 star_snippet += snippet.text().substr(pos, match->first - pos); 129 star_snippet += UTF8ToUTF16("**"); 130 star_snippet += snippet.text().substr(match->first, 131 match->second - match->first); 132 star_snippet += UTF8ToUTF16("**"); 133 pos = match->second; 134 } 135 star_snippet += snippet.text().substr(pos); 136 137 return star_snippet; 138} 139 140TEST(Snippets, SimpleQuery) { 141 ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this " 142 "**document** and excluding any services provided to you by " 143 "Goo ... ... way, Mountain View, CA 94043, United States. This " 144 "**document** explains how the agreement is made up, and sets " 145 "o ... ", 146 UTF16ToUTF8(BuildSnippet(kSampleDocument, "document"))); 147} 148 149// Test that two words that are near each other don't produce two elided bits. 150TEST(Snippets, NearbyWords) { 151 ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, " 152 "**Mountain** **View**, CA 94043, United States. This " 153 "document explains ... ", 154 UTF16ToUTF8(BuildSnippet(kSampleDocument, "mountain view"))); 155} 156 157// The above tests already test that we get byte offsets correct, but here's 158// one that gets the "TM" in its snippet. 159TEST(Snippets, UTF8) { 160 ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! " 161 "1. Your **relationship** with Google 1.1 Your use of Google's " 162 "products, so ... ", 163 UTF16ToUTF8(BuildSnippet(kSampleDocument, "relationship"))); 164} 165 166TEST(Snippets, ThaiUTF8) { 167 // There are 3 instances of '\u0E43\u0E2B\u0E49' 168 // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample. 169 // The 1st is more than |kSniipetContext| graphemes away from the 170 // 2nd while the 2nd and 3rd are within that window. However, with 171 // the 2nd match added, the snippet goes over the size limit so that 172 // the snippet ends right before the 3rd match. 173 ASSERT_EQ(" ... " 174 "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7" 175 "\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8" 176 "\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8" 177 "\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8" 178 "\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0" 179 "\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0" 180 "\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0" 181 "\xB8\x9A\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0" 182 "\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0" 183 "\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0" 184 "\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0" 185 "\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8" 186 "\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0" 187 "\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0" 188 "\xB8\x94\xE0\xB8\xB1\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0" 189 "\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0" 190 "\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0" 191 "\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3" 192 "\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3" 193 "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD" 194 "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88" 195 "\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84" 196 "\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88" 197 "\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3" 198 "\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1" 199 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8" 200 "\xE0\xB8\x93\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2" 201 "\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A ... ... \xE0\xB8\x82" 202 "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5" 203 "\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3" 204 "\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD" 205 "\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD" 206 "\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0" 207 "\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0" 208 "\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0" 209 "\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7" 210 "\xE0\xB9\x88\xE0\xB8\xAD**\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9" 211 "\x89**\xE0\xB8\x9C\xE0\xB8\xB9\xE0\xB9\x89\xE0\xB9\x83\xE0" 212 "\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94\xE0\xB9\x89\xE0" 213 "\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3\xE0" 214 "\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0" 215 "\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0" 216 "\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0" 217 "\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1" 218 "\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87\xE0\xB8\x9B" 219 "\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95" 220 "\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7" 221 "\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2", 222 UTF16ToUTF8(BuildSnippet(kThaiSample, 223 "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"))); 224} 225 226TEST(Snippets, ExtractMatchPositions) { 227 struct TestData { 228 const std::string offsets_string; 229 const size_t expected_match_count; 230 const size_t expected_matches[10]; 231 } data[] = { 232 { "0 0 1 2 0 0 4 1 0 0 1 5", 1, { 1, 6 } }, 233 { "0 0 1 4 0 0 2 1", 1, { 1, 5 } }, 234 { "0 0 4 1 0 0 2 1", 2, { 2, 3, 4, 5 } }, 235 { "0 0 0 1", 1, { 0, 1 } }, 236 { "0 0 0 1 0 0 0 2", 1, { 0, 2 } }, 237 { "0 0 1 1 0 0 1 2", 1, { 1, 3 } }, 238 { "0 0 1 2 0 0 4 3 0 0 3 1", 1, { 1, 7 } }, 239 { "0 0 1 4 0 0 2 5", 1, { 1, 7 } }, 240 { "0 0 1 2 0 0 1 1", 1, { 1, 3 } }, 241 { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10", 2, { 1, 2, 3, 13 } }, 242 }; 243 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(data); ++i) { 244 Snippet::MatchPositions matches; 245 Snippet::ExtractMatchPositions(data[i].offsets_string, "0", &matches); 246 EXPECT_EQ(data[i].expected_match_count, matches.size()); 247 for (size_t j = 0; j < data[i].expected_match_count; ++j) { 248 EXPECT_EQ(data[i].expected_matches[2 * j], matches[j].first); 249 EXPECT_EQ(data[i].expected_matches[2 * j + 1], matches[j].second); 250 } 251 } 252} 253