1// Copyright 2014 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "components/query_parser/query_parser.h" 6 7#include <algorithm> 8 9#include "base/compiler_specific.h" 10#include "base/i18n/break_iterator.h" 11#include "base/i18n/case_conversion.h" 12#include "base/logging.h" 13#include "base/stl_util.h" 14#include "base/strings/utf_string_conversions.h" 15 16namespace query_parser { 17namespace { 18 19// Returns true if |mp1.first| is less than |mp2.first|. This is used to 20// sort match positions. 21int CompareMatchPosition(const Snippet::MatchPosition& mp1, 22 const Snippet::MatchPosition& mp2) { 23 return mp1.first < mp2.first; 24} 25 26// Returns true if |mp2| intersects |mp1|. This is intended for use by 27// CoalesceMatchesFrom and isn't meant as a general intersection comparison 28// function. 29bool SnippetIntersects(const Snippet::MatchPosition& mp1, 30 const Snippet::MatchPosition& mp2) { 31 return mp2.first >= mp1.first && mp2.first <= mp1.second; 32} 33 34// Coalesces match positions in |matches| after index that intersect the match 35// position at |index|. 36void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) { 37 Snippet::MatchPosition& mp = (*matches)[index]; 38 for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1; 39 i != matches->end(); ) { 40 if (SnippetIntersects(mp, *i)) { 41 mp.second = std::max(mp.second, i->second); 42 i = matches->erase(i); 43 } else { 44 return; 45 } 46 } 47} 48 49// Returns true if the character is considered a quote. 50bool IsQueryQuote(wchar_t ch) { 51 return ch == '"' || 52 ch == 0xab || // left pointing double angle bracket 53 ch == 0xbb || // right pointing double angle bracket 54 ch == 0x201c || // left double quotation mark 55 ch == 0x201d || // right double quotation mark 56 ch == 0x201e; // double low-9 quotation mark 57} 58 59} // namespace 60 61// Inheritance structure: 62// Queries are represented as trees of QueryNodes. 63// QueryNodes are either a collection of subnodes (a QueryNodeList) 64// or a single word (a QueryNodeWord). 65 66// A QueryNodeWord is a single word in the query. 67class QueryNodeWord : public QueryNode { 68 public: 69 explicit QueryNodeWord(const base::string16& word); 70 virtual ~QueryNodeWord(); 71 72 const base::string16& word() const { return word_; } 73 74 void set_literal(bool literal) { literal_ = literal; } 75 76 // QueryNode: 77 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE; 78 virtual bool IsWord() const OVERRIDE; 79 virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE; 80 virtual bool HasMatchIn( 81 const QueryWordVector& words, 82 Snippet::MatchPositions* match_positions) const OVERRIDE; 83 virtual bool HasMatchIn( 84 const QueryWordVector& words) const OVERRIDE; 85 virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE; 86 87 private: 88 base::string16 word_; 89 bool literal_; 90 91 DISALLOW_COPY_AND_ASSIGN(QueryNodeWord); 92}; 93 94QueryNodeWord::QueryNodeWord(const base::string16& word) 95 : word_(word), 96 literal_(false) {} 97 98QueryNodeWord::~QueryNodeWord() {} 99 100int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const { 101 query->append(word_); 102 103 // Use prefix search if we're not literal and long enough. 104 if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 105 *query += L'*'; 106 return 1; 107} 108 109bool QueryNodeWord::IsWord() const { 110 return true; 111} 112 113bool QueryNodeWord::Matches(const base::string16& word, bool exact) const { 114 if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_)) 115 return word == word_; 116 return word.size() >= word_.size() && 117 (word_.compare(0, word_.size(), word, 0, word_.size()) == 0); 118} 119 120bool QueryNodeWord::HasMatchIn(const QueryWordVector& words, 121 Snippet::MatchPositions* match_positions) const { 122 bool matched = false; 123 for (size_t i = 0; i < words.size(); ++i) { 124 if (Matches(words[i].word, false)) { 125 size_t match_start = words[i].position; 126 match_positions->push_back( 127 Snippet::MatchPosition(match_start, 128 match_start + static_cast<int>(word_.size()))); 129 matched = true; 130 } 131 } 132 return matched; 133} 134 135bool QueryNodeWord::HasMatchIn(const QueryWordVector& words) const { 136 for (size_t i = 0; i < words.size(); ++i) { 137 if (Matches(words[i].word, false)) 138 return true; 139 } 140 return false; 141} 142 143void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const { 144 words->push_back(word_); 145} 146 147// A QueryNodeList has a collection of QueryNodes which are deleted in the end. 148class QueryNodeList : public QueryNode { 149 public: 150 QueryNodeList(); 151 virtual ~QueryNodeList(); 152 153 QueryNodeStarVector* children() { return &children_; } 154 155 void AddChild(QueryNode* node); 156 157 // Remove empty subnodes left over from other parsing. 158 void RemoveEmptySubnodes(); 159 160 // QueryNode: 161 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE; 162 virtual bool IsWord() const OVERRIDE; 163 virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE; 164 virtual bool HasMatchIn( 165 const QueryWordVector& words, 166 Snippet::MatchPositions* match_positions) const OVERRIDE; 167 virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE; 168 virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE; 169 170 protected: 171 int AppendChildrenToString(base::string16* query) const; 172 173 QueryNodeStarVector children_; 174 175 private: 176 DISALLOW_COPY_AND_ASSIGN(QueryNodeList); 177}; 178 179QueryNodeList::QueryNodeList() {} 180 181QueryNodeList::~QueryNodeList() { 182 STLDeleteElements(&children_); 183} 184 185void QueryNodeList::AddChild(QueryNode* node) { 186 children_.push_back(node); 187} 188 189void QueryNodeList::RemoveEmptySubnodes() { 190 for (size_t i = 0; i < children_.size(); ++i) { 191 if (children_[i]->IsWord()) 192 continue; 193 194 QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]); 195 list_node->RemoveEmptySubnodes(); 196 if (list_node->children()->empty()) { 197 children_.erase(children_.begin() + i); 198 --i; 199 delete list_node; 200 } 201 } 202} 203 204int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const { 205 return AppendChildrenToString(query); 206} 207 208bool QueryNodeList::IsWord() const { 209 return false; 210} 211 212bool QueryNodeList::Matches(const base::string16& word, bool exact) const { 213 NOTREACHED(); 214 return false; 215} 216 217bool QueryNodeList::HasMatchIn(const QueryWordVector& words, 218 Snippet::MatchPositions* match_positions) const { 219 NOTREACHED(); 220 return false; 221} 222 223bool QueryNodeList::HasMatchIn(const QueryWordVector& words) const { 224 NOTREACHED(); 225 return false; 226} 227 228void QueryNodeList::AppendWords(std::vector<base::string16>* words) const { 229 for (size_t i = 0; i < children_.size(); ++i) 230 children_[i]->AppendWords(words); 231} 232 233int QueryNodeList::AppendChildrenToString(base::string16* query) const { 234 int num_words = 0; 235 for (QueryNodeStarVector::const_iterator node = children_.begin(); 236 node != children_.end(); ++node) { 237 if (node != children_.begin()) 238 query->push_back(L' '); 239 num_words += (*node)->AppendToSQLiteQuery(query); 240 } 241 return num_words; 242} 243 244// A QueryNodePhrase is a phrase query ("quoted"). 245class QueryNodePhrase : public QueryNodeList { 246 public: 247 QueryNodePhrase(); 248 virtual ~QueryNodePhrase(); 249 250 // QueryNodeList: 251 virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE; 252 virtual bool HasMatchIn( 253 const QueryWordVector& words, 254 Snippet::MatchPositions* match_positions) const OVERRIDE; 255 virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE; 256 257 private: 258 bool MatchesAll(const QueryWordVector& words, 259 const QueryWord** first_word, 260 const QueryWord** last_word) const; 261 DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase); 262}; 263 264QueryNodePhrase::QueryNodePhrase() {} 265 266QueryNodePhrase::~QueryNodePhrase() {} 267 268int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const { 269 query->push_back(L'"'); 270 int num_words = AppendChildrenToString(query); 271 query->push_back(L'"'); 272 return num_words; 273} 274 275bool QueryNodePhrase::MatchesAll(const QueryWordVector& words, 276 const QueryWord** first_word, 277 const QueryWord** last_word) const { 278 if (words.size() < children_.size()) 279 return false; 280 281 for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) { 282 bool matched_all = true; 283 for (size_t j = 0; j < children_.size(); ++j) { 284 if (!children_[j]->Matches(words[i + j].word, true)) { 285 matched_all = false; 286 break; 287 } 288 } 289 if (matched_all) { 290 *first_word = &words[i]; 291 *last_word = &words[i + children_.size() - 1]; 292 return true; 293 } 294 } 295 return false; 296} 297 298bool QueryNodePhrase::HasMatchIn( 299 const QueryWordVector& words, 300 Snippet::MatchPositions* match_positions) const { 301 const QueryWord* first_word; 302 const QueryWord* last_word; 303 304 if (MatchesAll(words, &first_word, &last_word)) { 305 match_positions->push_back( 306 Snippet::MatchPosition(first_word->position, 307 last_word->position + last_word->word.length())); 308 return true; 309 } 310 return false; 311} 312 313bool QueryNodePhrase::HasMatchIn(const QueryWordVector& words) const { 314 const QueryWord* first_word; 315 const QueryWord* last_word; 316 return MatchesAll(words, &first_word, &last_word); 317} 318 319QueryParser::QueryParser() {} 320 321// static 322bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16& word) { 323 DCHECK(!word.empty()); 324 size_t minimum_length = 3; 325 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility) 326 // because they 'behave like' Latin letters. Moreover, we should 327 // normalize the former before reaching here. 328 if (0xAC00 <= word[0] && word[0] <= 0xD7A3) 329 minimum_length = 2; 330 return word.size() >= minimum_length; 331} 332 333int QueryParser::ParseQuery(const base::string16& query, 334 base::string16* sqlite_query) { 335 QueryNodeList root; 336 if (!ParseQueryImpl(query, &root)) 337 return 0; 338 return root.AppendToSQLiteQuery(sqlite_query); 339} 340 341void QueryParser::ParseQueryWords(const base::string16& query, 342 std::vector<base::string16>* words) { 343 QueryNodeList root; 344 if (!ParseQueryImpl(query, &root)) 345 return; 346 root.AppendWords(words); 347} 348 349void QueryParser::ParseQueryNodes(const base::string16& query, 350 QueryNodeStarVector* nodes) { 351 QueryNodeList root; 352 if (ParseQueryImpl(base::i18n::ToLower(query), &root)) 353 nodes->swap(*root.children()); 354} 355 356bool QueryParser::DoesQueryMatch(const base::string16& text, 357 const QueryNodeStarVector& query_nodes, 358 Snippet::MatchPositions* match_positions) { 359 if (query_nodes.empty()) 360 return false; 361 362 QueryWordVector query_words; 363 base::string16 lower_text = base::i18n::ToLower(text); 364 ExtractQueryWords(lower_text, &query_words); 365 366 if (query_words.empty()) 367 return false; 368 369 Snippet::MatchPositions matches; 370 for (size_t i = 0; i < query_nodes.size(); ++i) { 371 if (!query_nodes[i]->HasMatchIn(query_words, &matches)) 372 return false; 373 } 374 if (lower_text.length() != text.length()) { 375 // The lower case string differs from the original string. The matches are 376 // meaningless. 377 // TODO(sky): we need a better way to align the positions so that we don't 378 // completely punt here. 379 match_positions->clear(); 380 } else { 381 SortAndCoalesceMatchPositions(&matches); 382 match_positions->swap(matches); 383 } 384 return true; 385} 386 387bool QueryParser::DoesQueryMatch(const QueryWordVector& query_words, 388 const QueryNodeStarVector& query_nodes) { 389 if (query_nodes.empty() || query_words.empty()) 390 return false; 391 392 for (size_t i = 0; i < query_nodes.size(); ++i) { 393 if (!query_nodes[i]->HasMatchIn(query_words)) 394 return false; 395 } 396 return true; 397} 398 399bool QueryParser::ParseQueryImpl(const base::string16& query, 400 QueryNodeList* root) { 401 base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD); 402 // TODO(evanm): support a locale here 403 if (!iter.Init()) 404 return false; 405 406 // To handle nesting, we maintain a stack of QueryNodeLists. 407 // The last element (back) of the stack contains the current, deepest node. 408 std::vector<QueryNodeList*> query_stack; 409 query_stack.push_back(root); 410 411 bool in_quotes = false; // whether we're currently in a quoted phrase 412 while (iter.Advance()) { 413 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 414 // is not necessarily a word, but could also be a sequence of punctuation 415 // or whitespace. 416 if (iter.IsWord()) { 417 QueryNodeWord* word_node = new QueryNodeWord(iter.GetString()); 418 if (in_quotes) 419 word_node->set_literal(true); 420 query_stack.back()->AddChild(word_node); 421 } else { // Punctuation. 422 if (IsQueryQuote(query[iter.prev()])) { 423 if (!in_quotes) { 424 QueryNodeList* quotes_node = new QueryNodePhrase; 425 query_stack.back()->AddChild(quotes_node); 426 query_stack.push_back(quotes_node); 427 in_quotes = true; 428 } else { 429 query_stack.pop_back(); // Stop adding to the quoted phrase. 430 in_quotes = false; 431 } 432 } 433 } 434 } 435 436 root->RemoveEmptySubnodes(); 437 return true; 438} 439 440void QueryParser::ExtractQueryWords(const base::string16& text, 441 QueryWordVector* words) { 442 base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD); 443 // TODO(evanm): support a locale here 444 if (!iter.Init()) 445 return; 446 447 while (iter.Advance()) { 448 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It 449 // is not necessarily a word, but could also be a sequence of punctuation 450 // or whitespace. 451 if (iter.IsWord()) { 452 base::string16 word = iter.GetString(); 453 if (!word.empty()) { 454 words->push_back(QueryWord()); 455 words->back().word = word; 456 words->back().position = iter.prev(); 457 } 458 } 459 } 460} 461 462// static 463void QueryParser::SortAndCoalesceMatchPositions( 464 Snippet::MatchPositions* matches) { 465 std::sort(matches->begin(), matches->end(), &CompareMatchPosition); 466 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove 467 // from matches. 468 for (size_t i = 0; i < matches->size(); ++i) 469 CoalesceMatchesFrom(i, matches); 470} 471 472} // namespace query_parser 473