10529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// Copyright 2014 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
50529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch#include "components/query_parser/snippet.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/strings/string_split.h"
10868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "testing/gtest/include/gtest/gtest.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
140529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochnamespace query_parser {
namespace {

// English sample document that the tests below compute snippets of.
// The "\xe2\x84\xa2" bytes right after the first "Google" are the UTF-8
// encoding of U+2122 TRADE MARK SIGN; they are there to verify that nothing
// gets screwed up in the UTF-8/UTF-16 conversion.
// The pointer itself is const as well, so a test cannot accidentally re-point
// the shared fixture at a different string.
const char* const kSampleDocument = "Google\xe2\x84\xa2 Terms of Service "
"Welcome to Google! "
"1. Your relationship with Google "
"1.1 Your use of Google's products, software, services and web sites "
"(referred to collectively as the \"Services\" in this document and excluding "
"any services provided to you by Google under a separate written agreement) "
"is subject to the terms of a legal agreement between you and Google. "
"\"Google\" means Google Inc., whose principal place of business is at 1600 "
"Amphitheatre Parkway, Mountain View, CA 94043, United States. This document "
"explains how the agreement is made up, and sets out some of the terms of "
"that agreement.";

}  // namespace
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Thai sample document (UTF-8 encoded), taken from
// http://www.google.co.th/intl/th/privacy.html
// TODO(jungshik) : Add more samples (e.g. Hindi) after porting
// ICU 4.0's character iterator changes to our copy of ICU 3.8 to get
// grapheme clusters in Indic scripts handled more reasonably.
// The pointer itself is const so the fixture cannot be re-pointed; at
// namespace scope this also gives the constant internal linkage.
const char* const kThaiSample = "Google \xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87"
"\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7"
"\xE0\xB8\xA1 \xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 \xE0\xB9\x80\xE0\xB8\xA1"
"\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
"\xE0\xB8\xA5\xE0\xB8\x87\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A"
"\xE0\xB8\xB5\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
"\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB8\x9A"
"\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x82"
"\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7"
"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x82\xE0\xB9\x89"
"\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
"\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2\xE0\xB8\xA7"
"\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA\xE0\xB8\xA1\xE0\xB8\xB1"
"\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3"
"\xE0\xB8\xB2\xE0\xB8\xAD\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7"
"\xE0\xB8\xA1\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
"\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99\xE0\xB8\x9A"
"\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5"
"\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3"
"\xE0\xB8\xA7\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x88"
"\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB9\x80"
"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\xB1\xE0\xB8\x9A"
"\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5"
"\xE0\xB8\x88\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4"
"\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
"\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google \xE0\xB8\xAB"
"\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84"
"\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA"
"\xE0\xB8\xB2\xE0\xB8\xA1 \xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88"
"\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89\xE0\xB8\x9C\xE0\xB8\xB9"
"\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84\xE0\xB8\x94"
"\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x9B\xE0\xB8\xA3"
"\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3"
"\xE0\xB8\x93\xE0\xB9\x8C\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94"
"\xE0\xB8\xB5\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
"\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89\xE0\xB8\x87"
"\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB9\x81\xE0\xB8\x95"
"\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89"
"\xE0\xB8\xAD\xE0\xB8\xAB\xE0\xB8\xB2\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89"
"\xE0\xB9\x80\xE0\xB8\xAB\xE0\xB8\xA1\xE0\xB8\xB2\xE0\xB8\xB0\xE0\xB8\xAA"
"\xE0\xB8\xB3\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x84"
"\xE0\xB8\xB8\xE0\xB8\x93";
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Comparator for sorting by the first element in a pair.
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ComparePair1st(const Snippet::MatchPosition& a,
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    const Snippet::MatchPosition& b) {
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return a.first < b.first;
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For testing, we'll compute the match positions manually instead of using
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sqlite's FTS matching.  BuildSnippet returns the snippet for matching
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// |query| against |document|.  Matches are surrounded by "**".
915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)base::string16 BuildSnippet(const std::string& document,
920529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch                            const std::string& query) {
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This function assumes that |document| does not contain
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // any character for which lowercasing changes its length. Further,
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // it's assumed that lowercasing only the ASCII-portion works for
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |document|. We need to add more test cases and change this function
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to be more generic depending on how we deal with 'folding for match'
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in history.
996e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)  const std::string document_folded =
1006e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)      base::StringToLowerASCII(std::string(document));
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<std::string> query_words;
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::SplitString(query, ' ', &query_words);
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Manually construct match_positions of the document.
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Snippet::MatchPositions match_positions;
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  match_positions.clear();
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (std::vector<std::string>::iterator qw = query_words.begin();
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       qw != query_words.end(); ++qw) {
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Insert all instances of this word into match_pairs.
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    size_t ofs = 0;
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while ((ofs = document_folded.find(*qw, ofs)) != std::string::npos) {
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      match_positions.push_back(std::make_pair(ofs, ofs + qw->size()));
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ofs += qw->size();
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Sort match_positions in order of increasing offset.
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::sort(match_positions.begin(), match_positions.end(), ComparePair1st);
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compute the snippet.
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Snippet snippet;
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  snippet.ComputeSnippet(match_positions, document);
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Now "highlight" all matches in the snippet with **.
125a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  base::string16 star_snippet;
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Snippet::MatchPositions::const_iterator match;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t pos = 0;
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (match = snippet.matches().begin();
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       match != snippet.matches().end(); ++match) {
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    star_snippet += snippet.text().substr(pos, match->first - pos);
1315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    star_snippet += base::UTF8ToUTF16("**");
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    star_snippet += snippet.text().substr(match->first,
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          match->second - match->first);
1345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    star_snippet += base::UTF8ToUTF16("**");
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pos = match->second;
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  star_snippet += snippet.text().substr(pos);
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return star_snippet;
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, SimpleQuery) {
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_EQ(" ... eferred to collectively as the \"Services\" in this "
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "**document** and excluding any services provided to you by "
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "Goo ...  ... way, Mountain View, CA 94043, United States. This "
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "**document** explains how the agreement is made up, and sets "
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "o ... ",
1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "document")));
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test that two words that are near each other don't produce two elided bits.
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, NearbyWords) {
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_EQ(" ... lace of business is at 1600 Amphitheatre Parkway, "
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "**Mountain** **View**, CA 94043, United States. This "
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "document explains  ... ",
1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "mountain view")));
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The above tests already test that we get byte offsets correct, but here's
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// one that gets the "TM" in its snippet.
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, UTF8) {
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ASSERT_EQ(" ... ogle\xe2\x84\xa2 Terms of Service Welcome to Google! "
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "1. Your **relationship** with Google 1.1 Your use of Google's "
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            "products, so ... ",
1655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            base::UTF16ToUTF8(BuildSnippet(kSampleDocument, "relationship")));
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, ThaiUTF8) {
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // There are 3 instances of '\u0E43\u0E2B\u0E49'
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89) in kThaiSample.
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The 1st is more than |kSniipetContext| graphemes away from the
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // 2nd while the 2nd and 3rd are within that window. However, with
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the 2nd match added, the snippet goes over the size limit so that
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the snippet ends right before the 3rd match.
1755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)  ASSERT_EQ(" ...  "
1765f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
1775f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99"
1785f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5 "
1795f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x80\xE0\xB8\xA1\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD"
1805f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93\xE0\xB8\xA5\xE0\xB8\x87"
1815f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x97\xE0\xB8\xB0\xE0\xB9\x80\xE0\xB8\x9A\xE0\xB8\xB5"
1825f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA2\xE0\xB8\x99\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7"
1835f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x88\xE0\xB8\xAD\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89"
1845f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81\xE0\xB8\xB2"
1855f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA3\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google "
1865f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD**\xE0\xB9\x83"
1875f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
1885f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x94\xE0\xB8\xB1"
1895f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x87\xE0\xB8\x81\xE0\xB8\xA5\xE0\xB9\x88\xE0\xB8\xB2"
1905f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA7\xE0\xB9\x82\xE0\xB8\x94\xE0\xB8\xA2\xE0\xB8\xAA"
1915f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA1\xE0\xB8\xB1\xE0\xB8\x84\xE0\xB8\xA3\xE0\xB9\x83"
1925f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x88 \xE0\xB9\x80\xE0\xB8\xA3\xE0\xB8\xB2\xE0\xB8\xAD"
1935f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB2\xE0\xB8\x88\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1"
1945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xA1\xE0\xB8\xB9"
1955f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA5\xE0\xB8\xAA\xE0\xB9\x88\xE0\xB8\xA7\xE0\xB8\x99"
1965f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x9A\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5"
1975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB9\x80\xE0\xB8\x81"
1985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x87\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\x9A"
1995f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA3\xE0\xB8\xA7\xE0\xB8\xA1 ...  ... \xE0\xB8\x88"
2005f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB2\xE0\xB8\x81\xE0\xB8\x84\xE0\xB8\xB8\xE0\xB8\x93"
2015f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x80\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xB2\xE0\xB8\x81"
2025f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB1\xE0\xB8\x9A\xE0\xB8\x82\xE0\xB9\x89\xE0\xB8\xAD"
2035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA1\xE0\xB8\xB9\xE0\xB8\xA5\xE0\xB8\x88\xE0\xB8\xB2"
2045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x81\xE0\xB8\x9A\xE0\xB8\xA3\xE0\xB8\xB4\xE0\xB8\x81"
2055f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\xAD\xE0\xB8\xB7\xE0\xB9\x88"
2065f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x99\xE0\xB8\x82\xE0\xB8\xAD\xE0\xB8\x87 Google "
2075f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xAB\xE0\xB8\xA3\xE0\xB8\xB7\xE0\xB8\xAD\xE0\xB8\x9A"
2085f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB8\xE0\xB8\x84\xE0\xB8\x84\xE0\xB8\xA5\xE0\xB8\x97"
2095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\xAA\xE0\xB8\xB2\xE0\xB8\xA1 "
2105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x80\xE0\xB8\x9E\xE0\xB8\xB7\xE0\xB9\x88\xE0\xB8\xAD**"
2115f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89**\xE0\xB8\x9C\xE0\xB8\xB9"
2125f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x89\xE0\xB9\x83\xE0\xB8\x8A\xE0\xB9\x89\xE0\xB9\x84"
2135f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x94\xE0\xB9\x89\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A"
2145f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB0\xE0\xB8\xAA\xE0\xB8\x9A"
2155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x81\xE0\xB8\xB2\xE0\xB8\xA3\xE0\xB8\x93\xE0\xB9\x8C"
2165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x97\xE0\xB8\xB5\xE0\xB9\x88\xE0\xB8\x94\xE0\xB8\xB5"
2175f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x82\xE0\xB8\xB6\xE0\xB9\x89\xE0\xB8\x99 \xE0\xB8\xA3"
2185f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xA7\xE0\xB8\xA1\xE0\xB8\x97\xE0\xB8\xB1\xE0\xB9\x89"
2195f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x87\xE0\xB8\x9B\xE0\xB8\xA3\xE0\xB8\xB1\xE0\xB8\x9A"
2205f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB9\x81\xE0\xB8\x95\xE0\xB9\x88\xE0\xB8\x87\xE0\xB9\x80"
2215f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\x99\xE0\xB8\xB7\xE0\xB9\x89\xE0\xB8\xAD\xE0\xB8\xAB"
2225f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)            "\xE0\xB8\xB2",
2235d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            base::UTF16ToUTF8(BuildSnippet(kThaiSample,
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                     "\xE0\xB9\x83\xE0\xB8\xAB\xE0\xB9\x89")));
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Snippets, ExtractMatchPositions) {
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct TestData {
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const std::string offsets_string;
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const size_t expected_match_count;
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const size_t expected_matches[10];
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } data[] = {
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 2 0 0 4 1 0 0 1 5",            1,     { 1, 6 } },
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 4 0 0 2 1",                    1,     { 1, 5 } },
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 4 1 0 0 2 1",                    2,     { 2, 3, 4, 5 } },
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 0 1",                            1,     { 0, 1 } },
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 0 1 0 0 0 2",                    1,     { 0, 2 } },
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 1 0 0 1 2",                    1,     { 1, 3 } },
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 2 0 0 4 3 0 0 3 1",            1,     { 1, 7 } },
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 4 0 0 2 5",                    1,     { 1, 7 } },
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 2 0 0 1 1",                    1,     { 1, 3 } },
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    { "0 0 1 1 0 0 5 2 0 0 10 1 0 0 3 10",  2,     { 1, 2, 3, 13 } },
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(data); ++i) {
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Snippet::MatchPositions matches;
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Snippet::ExtractMatchPositions(data[i].offsets_string, "0", &matches);
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EXPECT_EQ(data[i].expected_match_count, matches.size());
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (size_t j = 0; j < data[i].expected_match_count; ++j) {
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      EXPECT_EQ(data[i].expected_matches[2 * j], matches[j].first);
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      EXPECT_EQ(data[i].expected_matches[2 * j + 1], matches[j].second);
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2540529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch
2550529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch}  // namespace query_parser
256