1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file. 4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/history/query_parser.h" 6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <algorithm> 8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 921d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen#include "base/i18n/break_iterator.h" 10c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h" 11ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_vector.h" 12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h" 13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h" 1472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include "ui/base/l10n/l10n_util.h" 15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/uscript.h" 16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace { 18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if |mp1.first| is less than |mp2.first|. This is used to 20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// sort match positions. 21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint CompareMatchPosition(const Snippet::MatchPosition& mp1, 22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const Snippet::MatchPosition& mp2) { 23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return mp1.first < mp2.first; 24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if |mp2| intersects |mp1|. This is intended for use by 27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// CoalesceMatchesFrom and isn't meant as a general intersectpion comparison 28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// function. 29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool SnippetIntersects(const Snippet::MatchPosition& mp1, 30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const Snippet::MatchPosition& mp2) { 31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return mp2.first >= mp1.first && mp2.first <= mp1.second; 32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 33c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 34c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Coalesces match positions in |matches| after index that intersect the match 35c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// position at |index|. 36c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CoalesceMatchesFrom(size_t index, 37c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* matches) { 38c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPosition& mp = (*matches)[index]; 39c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1; 40c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i != matches->end(); ) { 41c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (SnippetIntersects(mp, *i)) { 42c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch mp.second = i->second; 43c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i = matches->erase(i); 44c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 45c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 46c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 47c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 48c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 49c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 50c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Sorts the match positions in |matches| by their first index, then coalesces 51c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// any match positions that intersect each other. 52c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) { 53c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::sort(matches->begin(), matches->end(), &CompareMatchPosition); 54c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove 55c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // from matches. 56c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < matches->size(); ++i) 57c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CoalesceMatchesFrom(i, matches); 58c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 59c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 60c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} // namespace 61c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 62c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Inheritance structure: 63c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Queries are represented as trees of QueryNodes. 64c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// QueryNodes are either a collection of subnodes (a QueryNodeList) 65c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// or a single word (a QueryNodeWord). 66c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 67c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// A QueryNodeWord is a single word in the query. 68c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochclass QueryNodeWord : public QueryNode { 69c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch public: 70c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch explicit QueryNodeWord(const string16& word) 71c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch : word_(word), literal_(false) {} 72c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual ~QueryNodeWord() {} 73c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual int AppendToSQLiteQuery(string16* query) const; 74c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool IsWord() const { return true; } 75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 76c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const string16& word() const { return word_; } 77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch void set_literal(bool literal) { literal_ = literal; } 78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool HasMatchIn(const std::vector<QueryWord>& words, 80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) const; 81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool Matches(const string16& word, bool exact) const; 83c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual void AppendWords(std::vector<string16>* words) const; 84c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 85c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch private: 86c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch string16 word_; 87c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool literal_; 88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}; 89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 90c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words, 91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) const { 92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < words.size(); ++i) { 93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (Matches(words[i].word, false)) { 94c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t match_start = words[i].position; 95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->push_back( 96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPosition(match_start, 97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_start + static_cast<int>(word_.size()))); 98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodeWord::Matches(const string16& word, bool exact) const { 105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return word == word_; 107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return word.size() >= word_.size() && 108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch (word_.compare(0, word_.size(), word, 0, word_.size()) == 0); 109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryNodeWord::AppendWords(std::vector<string16>* words) const { 112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch words->push_back(word_); 113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint QueryNodeWord::AppendToSQLiteQuery(string16* query) const { 116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query->append(word_); 117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Use prefix search if we're not literal and long enough. 119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *query += L'*'; 121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return 1; 122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// A QueryNodeList has a collection of child QueryNodes 125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// which it cleans up after. 126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochclass QueryNodeList : public QueryNode { 127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch public: 128c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual ~QueryNodeList(); 129c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual int AppendToSQLiteQuery(string16* query) const { 131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return AppendChildrenToString(query); 132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 133c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool IsWord() const { return false; } 134c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 135c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch void AddChild(QueryNode* node) { children_.push_back(node); } 136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch typedef std::vector<QueryNode*> QueryNodeVector; 138c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeVector* children() { return &children_; } 139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Remove empty subnodes left over from other parsing. 141c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch void RemoveEmptySubnodes(); 142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 143c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // QueryNodeList is never used with Matches or HasMatchIn. 144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool Matches(const string16& word, bool exact) const { 145c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch NOTREACHED(); 146c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 147c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool HasMatchIn(const std::vector<QueryWord>& words, 149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) const { 150c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch NOTREACHED(); 151c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual void AppendWords(std::vector<string16>* words) const; 154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch protected: 156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int AppendChildrenToString(string16* query) const; 157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeVector children_; 159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}; 160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 161c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochQueryNodeList::~QueryNodeList() { 162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (QueryNodeVector::iterator node = children_.begin(); 163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch node != children_.end(); ++node) 164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch delete *node; 165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryNodeList::RemoveEmptySubnodes() { 168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < children_.size(); ++i) { 169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (children_[i]->IsWord()) 170c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch continue; 171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]); 173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch list_node->RemoveEmptySubnodes(); 174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (list_node->children()->empty()) { 175c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch children_.erase(children_.begin() + i); 176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch --i; 177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch delete list_node; 178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 179c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryNodeList::AppendWords(std::vector<string16>* words) const { 183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < children_.size(); ++i) 184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch children_[i]->AppendWords(words); 185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint QueryNodeList::AppendChildrenToString(string16* query) const { 188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int num_words = 0; 189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (QueryNodeVector::const_iterator node = children_.begin(); 190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch node != children_.end(); ++node) { 191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (node != children_.begin()) 192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query->push_back(L' '); 193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch num_words += (*node)->AppendToSQLiteQuery(query); 194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return num_words; 196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// A QueryNodePhrase is a phrase query ("quoted"). 199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochclass QueryNodePhrase : public QueryNodeList { 200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch public: 201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual int AppendToSQLiteQuery(string16* query) const { 202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query->push_back(L'"'); 203c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int num_words = AppendChildrenToString(query); 204c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query->push_back(L'"'); 205c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return num_words; 206c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 207c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 208c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool Matches(const string16& word, bool exact) const; 209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch virtual bool HasMatchIn(const std::vector<QueryWord>& words, 210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) const; 211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}; 212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodePhrase::Matches(const string16& word, bool exact) const { 214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch NOTREACHED(); 215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodePhrase::HasMatchIn( 219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::vector<QueryWord>& words, 220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) const { 221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (words.size() < children_.size()) 222c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) { 225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool matched_all = true; 226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t j = 0; j < children_.size(); ++j) { 227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!children_[j]->Matches(words[i + j].word, true)) { 228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch matched_all = false; 229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch break; 230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (matched_all) { 233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const QueryWord& last_word = words[i + children_.size() - 1]; 234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->push_back( 235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPosition(words[i].position, 236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch last_word.position + last_word.word.length())); 237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 243c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochQueryParser::QueryParser() { 244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 245c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// static 247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) { 248dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen DCHECK(!word.empty()); 249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t minimum_length = 3; 250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We intentionally exclude Hangul Jamos (both Conjoining and compatibility) 251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // because they 'behave like' Latin letters. Moreover, we should 252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // normalize the former before reaching here. 253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (0xAC00 <= word[0] && word[0] <= 0xD7A3) 254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch minimum_length = 2; 255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return word.size() >= minimum_length; 256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if the character is considered a quote. 259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool IsQueryQuote(wchar_t ch) { 260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return ch == '"' || 261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ch == 0xab || // left pointing double angle bracket 262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ch == 0xbb || // right pointing double angle bracket 263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ch == 0x201c || // left double quotation mark 264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ch == 0x201d || // right double quotation mark 265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ch == 0x201e; // double low-9 quotation mark 266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint QueryParser::ParseQuery(const string16& query, 269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch string16* sqlite_query) { 270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeList root; 271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!ParseQueryImpl(query, &root)) 272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return 0; 273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return root.AppendToSQLiteQuery(sqlite_query); 274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryParser::ParseQuery(const string16& query, 277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<QueryNode*>* nodes) { 278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeList root; 279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (ParseQueryImpl(l10n_util::ToLower(query), &root)) 280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch nodes->swap(*root.children()); 281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryParser::ExtractQueryWords(const string16& query, 285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<string16>* words) { 286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeList root; 287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!ParseQueryImpl(query, &root)) 288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch root.AppendWords(words); 290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 292c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryParser::DoesQueryMatch(const string16& text, 293c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::vector<QueryNode*>& query_nodes, 294c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) { 295c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (query_nodes.empty()) 296c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 297c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 298c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<QueryWord> query_words; 299c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch string16 lower_text = l10n_util::ToLower(text); 300c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ExtractQueryWords(lower_text, &query_words); 301c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 302c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (query_words.empty()) 303c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 304c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 305c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions matches; 306c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < query_nodes.size(); ++i) { 307c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!query_nodes[i]->HasMatchIn(query_words, &matches)) 308c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 309c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 310c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (lower_text.length() != text.length()) { 311c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // The lower case string differs from the original string. The matches are 312c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // meaningless. 313c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // TODO(sky): we need a better way to align the positions so that we don't 314c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // completely punt here. 315c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->clear(); 316c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 317c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CoalseAndSortMatchPositions(&matches); 318c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->swap(matches); 319c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 320c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 321c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 322c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 323c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryParser::ParseQueryImpl(const string16& query, 324c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeList* root) { 32521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen base::BreakIterator iter(&query, base::BreakIterator::BREAK_WORD); 326c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // TODO(evanm): support a locale here 327c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!iter.Init()) 328c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 329c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 330c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // To handle nesting, we maintain a stack of QueryNodeLists. 331c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // The last element (back) of the stack contains the current, deepest node. 332c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<QueryNodeList*> query_stack; 333c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query_stack.push_back(root); 334c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 335c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool in_quotes = false; // whether we're currently in a quoted phrase 336c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while (iter.Advance()) { 337c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 338c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // is not necessarily a word, but could also be a sequence of punctuation 339c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // or whitespace. 340c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (iter.IsWord()) { 34121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen string16 word = iter.GetString(); 342c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 343c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeWord* word_node = new QueryNodeWord(word); 344c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (in_quotes) 345c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch word_node->set_literal(true); 346c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query_stack.back()->AddChild(word_node); 347c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { // Punctuation. 348c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (IsQueryQuote(query[iter.prev()])) { 349c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!in_quotes) { 350c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch QueryNodeList* quotes_node = new QueryNodePhrase; 351c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query_stack.back()->AddChild(quotes_node); 352c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query_stack.push_back(quotes_node); 353c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch in_quotes = true; 354c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 355c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch query_stack.pop_back(); // Stop adding to the quoted phrase. 356c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch in_quotes = false; 357c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 358c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 359c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 360c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 361c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 362c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch root->RemoveEmptySubnodes(); 363c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 364c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 365c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 366c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryParser::ExtractQueryWords(const string16& text, 367c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<QueryWord>* words) { 36821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen base::BreakIterator iter(&text, base::BreakIterator::BREAK_WORD); 369c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // TODO(evanm): support a locale here 370c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!iter.Init()) 371c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 372c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 373c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while (iter.Advance()) { 374c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 375c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // is not necessarily a word, but could also be a sequence of punctuation 376c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // or whitespace. 377c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (iter.IsWord()) { 37821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen string16 word = iter.GetString(); 379c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!word.empty()) { 380c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch words->push_back(QueryWord()); 381c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch words->back().word = word; 382c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch words->back().position = iter.prev(); 383c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 384c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 385c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 386c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 387