10529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// Copyright 2014 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 50529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch#include "components/query_parser/snippet.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/strings/string_split.h" 10868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h" 11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 140529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochnamespace query_parser { 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A sample document to compute snippets of. 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The \x bits after the first "Google" are UTF-8 of U+2122 TRADE MARK SIGN, 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and are useful for verifying we don't screw up in UTF-8/UTF-16 conversion. 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const char* kSampleDocument = "Google\xe2\x84\xa2 Terms of Service " 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"Welcome to Google! " 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"1. Your relationship with Google " 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"1.1 Your use of Google's products, software, services and web sites " 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"(referred to collectively as the \"Services\" in this document and excluding " 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"any services provided to you by Google under a separate written agreement) " 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"is subject to the terms of a legal agreement between you and Google. " 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\"Google\" means Google Inc., whose principal place of business is at 1600 " 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"Amphitheatre Parkway, Mountain View, CA 94043, United States. This document " 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"explains how the agreement is made up, and sets out some of the terms of " 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"that agreement."; 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Thai sample taken from http://www.google.co.th/intl/th/privacy.html 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// TODO(jungshik) : Add more samples (e.g. Hindi) after porting 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ICU 4.0's character iterator changes to our copy of ICU 3.8 to get 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// grapheme clusters in Indic scripts handled more reasonably. 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const char* kThaiSample = "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87" 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7" 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9" 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A" 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1" 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93" 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A" 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7" 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A" 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82" 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7" 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89" 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1" 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7" 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1" 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3" 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7" 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9" 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A" 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5" 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3" 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88" 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80" 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A" 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5" 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4" 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88" 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB" 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84" 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA" 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88" 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9" 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94" 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3" 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3" 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94" 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3" 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87" 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95" 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89" 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89" 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA" 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84" 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)"\xE0\xB8\xB8\xE0\xB8\x93"; 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Comparator for sorting by the first element in a pair. 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ComparePair1st(const Snippet::MatchPosition& a, 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Snippet::MatchPosition& b) { 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a.first < b.first; 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For testing, we'll compute the match positions manually instead of using 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sqlite's FTS matching. BuildSnippet returns the snippet for matching 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// |query| against |document|. Matches are surrounded by "**". 915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)base::string16 BuildSnippet(const std::string& document, 920529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch const std::string& query) { 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This function assumes that |document| does not contain 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // any character for which lowercasing changes its length. Further, 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // it's assumed that lowercasing only the ASCII-portion works for 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |document|. We need to add more test cases and change this function 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to be more generic depending on how we deal with 'folding for match' 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in history. 996e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) const std::string document_folded = 1006e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) base::StringToLowerASCII(std::string(document)); 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<std::string> query_words; 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::SplitString(query, ' ', &query_words); 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Manually construct match_positions of the document. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions match_positions; 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions.clear(); 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (std::vector<std::string>::iterator qw = query_words.begin(); 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) qw != query_words.end(); ++qw) { 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Insert all instances of this word into match_pairs. 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t ofs = 0; 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while ((ofs = document_folded.find(*qw, ofs)) != std::string::npos) { 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions.push_back(std::make_pair(ofs, ofs + qw->size())); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ofs += qw->size(); 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Sort match_positions in order of increasing offset. 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::sort(match_positions.begin(), match_positions.end(), ComparePair1st); 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compute the snippet. 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet snippet; 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) snippet.ComputeSnippet(match_positions, document); 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now "highlight" all matches in the snippet with **. 125a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::string16 star_snippet; 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions::const_iterator match; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t pos = 0; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (match = snippet.matches().begin(); 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match != snippet.matches().end(); ++match) { 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) star_snippet += snippet.text().substr(pos, match->first - pos); 1315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) star_snippet += base::UTF8ToUTF16("**"); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) star_snippet += snippet.text().substr(match->first, 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match->second - match->first); 1345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) star_snippet += base::UTF8ToUTF16("**"); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pos = match->second; 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) star_snippet += snippet.text().substr(pos); 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return star_snippet; 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, SimpleQuery) { 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this " 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "**document** and excluding any services provided to you by " 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "Goo ... ... way, Mountain View, CA 94043, United States. This " 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "**document** explains how the agreement is made up, and sets " 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "o ... ", 1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "document"))); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test that two words that are near each other don't produce two elided bits. 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, NearbyWords) { 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, " 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "**Mountain** **View**, CA 94043, United States. This " 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "document explains ... ", 1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "mountain view"))); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The above tests already test that we get byte offsets correct, but here's 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// one that gets the "TM" in its snippet. 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, UTF8) { 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! " 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "1. Your **relationship** with Google 1.1 Your use of Google's " 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "products, so ... ", 1655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "relationship"))); 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, ThaiUTF8) { 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // There are 3 instances of '\u0E43\u0E2B\u0E49' 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample. 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The 1st is more than |kSniipetContext| graphemes away from the 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 2nd while the 2nd and 3rd are within that window. However, with 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the 2nd match added, the snippet goes over the size limit so that 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the snippet ends right before the 3rd match. 1755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) ASSERT_EQ(" ... " 1765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9" 1775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99" 1785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 " 1795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD" 1805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB8\xA5\xE0\xB8\x87" 1815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A\xE0\xB8\xB5" 1825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7" 1835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89" 1845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2" 1855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google " 1865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD**\xE0\xB9\x83" 1875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD" 1885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1" 1895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2" 1905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA" 1915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83" 1925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3\xE0\xB8\xB2\xE0\xB8\xAD" 1935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1" 1945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9" 1955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99" 1965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5" 1975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81" 1985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A" 1995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1 ... ... \xE0\xB8\x88" 2005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93" 2015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81" 2025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD" 2035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x88\xE0\xB8\xB2" 2045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81" 2055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88" 2065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google " 2075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A" 2085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97" 2095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0\xB8\xB2\xE0\xB8\xA1 " 2105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD**" 2115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x9C\xE0\xB8\xB9" 2125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84" 2135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x94\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A" 2145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A" 2155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C" 2165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5" 2175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3" 2185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89" 2195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x87\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A" 2205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB9\x81\xE0\xB8\x95\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80" 2215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB" 2225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) "\xE0\xB8\xB2", 2235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) base::UTF16ToUTF8(BuildSnippet(kThaiSample, 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"))); 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, ExtractMatchPositions) { 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct TestData { 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const std::string offsets_string; 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t expected_match_count; 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t expected_matches[10]; 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } data[] = { 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 2 0 0 4 1 0 0 1 5", 1, { 1, 6 } }, 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 4 0 0 2 1", 1, { 1, 5 } }, 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 4 1 0 0 2 1", 2, { 2, 3, 4, 5 } }, 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 0 1", 1, { 0, 1 } }, 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 0 1 0 0 0 2", 1, { 0, 2 } }, 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 1 0 0 1 2", 1, { 1, 3 } }, 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 2 0 0 4 3 0 0 3 1", 1, { 1, 7 } }, 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 4 0 0 2 5", 1, { 1, 7 } }, 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 2 0 0 1 1", 1, { 1, 3 } }, 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10", 2, { 1, 2, 3, 13 } }, 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < ARRAYSIZE_UNSAFE(data); ++i) { 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions matches; 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::ExtractMatchPositions(data[i].offsets_string, "0", &matches); 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(data[i].expected_match_count, matches.size()); 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t j = 0; j < data[i].expected_match_count; ++j) { 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(data[i].expected_matches[2 * j], matches[j].first); 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) EXPECT_EQ(data[i].expected_matches[2 * j + 1], matches[j].second); 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2540529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch 2550529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch} // namespace query_parser 256