1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/query_parser/query_parser.h"
6
7#include <algorithm>
8
9#include "base/compiler_specific.h"
10#include "base/i18n/break_iterator.h"
11#include "base/i18n/case_conversion.h"
12#include "base/logging.h"
13#include "base/stl_util.h"
14#include "base/strings/utf_string_conversions.h"
15
16namespace query_parser {
17namespace {
18
19// Returns true if |mp1.first| is less than |mp2.first|. This is used to
20// sort match positions.
21int CompareMatchPosition(const Snippet::MatchPosition& mp1,
22                         const Snippet::MatchPosition& mp2) {
23  return mp1.first < mp2.first;
24}
25
26// Returns true if |mp2| intersects |mp1|. This is intended for use by
27// CoalesceMatchesFrom and isn't meant as a general intersection comparison
28// function.
29bool SnippetIntersects(const Snippet::MatchPosition& mp1,
30                       const Snippet::MatchPosition& mp2) {
31  return mp2.first >= mp1.first && mp2.first <= mp1.second;
32}
33
34// Coalesces match positions in |matches| after index that intersect the match
35// position at |index|.
36void CoalesceMatchesFrom(size_t index, Snippet::MatchPositions* matches) {
37  Snippet::MatchPosition& mp = (*matches)[index];
38  for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
39       i != matches->end(); ) {
40    if (SnippetIntersects(mp, *i)) {
41      mp.second = std::max(mp.second, i->second);
42      i = matches->erase(i);
43    } else {
44      return;
45    }
46  }
47}
48
49// Returns true if the character is considered a quote.
50bool IsQueryQuote(wchar_t ch) {
51  return ch == '"' ||
52         ch == 0xab ||    // left pointing double angle bracket
53         ch == 0xbb ||    // right pointing double angle bracket
54         ch == 0x201c ||  // left double quotation mark
55         ch == 0x201d ||  // right double quotation mark
56         ch == 0x201e;    // double low-9 quotation mark
57}
58
59}  // namespace
60
61// Inheritance structure:
62// Queries are represented as trees of QueryNodes.
63// QueryNodes are either a collection of subnodes (a QueryNodeList)
64// or a single word (a QueryNodeWord).
65
66// A QueryNodeWord is a single word in the query.
67class QueryNodeWord : public QueryNode {
68 public:
69  explicit QueryNodeWord(const base::string16& word);
70  virtual ~QueryNodeWord();
71
72  const base::string16& word() const { return word_; }
73
74  void set_literal(bool literal) { literal_ = literal; }
75
76  // QueryNode:
77  virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
78  virtual bool IsWord() const OVERRIDE;
79  virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
80  virtual bool HasMatchIn(
81      const QueryWordVector& words,
82      Snippet::MatchPositions* match_positions) const OVERRIDE;
83  virtual bool HasMatchIn(
84      const QueryWordVector& words) const OVERRIDE;
85  virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
86
87 private:
88  base::string16 word_;
89  bool literal_;
90
91  DISALLOW_COPY_AND_ASSIGN(QueryNodeWord);
92};
93
94QueryNodeWord::QueryNodeWord(const base::string16& word)
95    : word_(word),
96      literal_(false) {}
97
98QueryNodeWord::~QueryNodeWord() {}
99
100int QueryNodeWord::AppendToSQLiteQuery(base::string16* query) const {
101  query->append(word_);
102
103  // Use prefix search if we're not literal and long enough.
104  if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
105    *query += L'*';
106  return 1;
107}
108
109bool QueryNodeWord::IsWord() const {
110  return true;
111}
112
113bool QueryNodeWord::Matches(const base::string16& word, bool exact) const {
114  if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
115    return word == word_;
116  return word.size() >= word_.size() &&
117         (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
118}
119
120bool QueryNodeWord::HasMatchIn(const QueryWordVector& words,
121                               Snippet::MatchPositions* match_positions) const {
122  bool matched = false;
123  for (size_t i = 0; i < words.size(); ++i) {
124    if (Matches(words[i].word, false)) {
125      size_t match_start = words[i].position;
126      match_positions->push_back(
127          Snippet::MatchPosition(match_start,
128                                 match_start + static_cast<int>(word_.size())));
129      matched = true;
130    }
131  }
132  return matched;
133}
134
135bool QueryNodeWord::HasMatchIn(const QueryWordVector& words) const {
136  for (size_t i = 0; i < words.size(); ++i) {
137    if (Matches(words[i].word, false))
138      return true;
139  }
140  return false;
141}
142
143void QueryNodeWord::AppendWords(std::vector<base::string16>* words) const {
144  words->push_back(word_);
145}
146
147// A QueryNodeList has a collection of QueryNodes which are deleted in the end.
148class QueryNodeList : public QueryNode {
149 public:
150  QueryNodeList();
151  virtual ~QueryNodeList();
152
153  QueryNodeStarVector* children() { return &children_; }
154
155  void AddChild(QueryNode* node);
156
157  // Remove empty subnodes left over from other parsing.
158  void RemoveEmptySubnodes();
159
160  // QueryNode:
161  virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
162  virtual bool IsWord() const OVERRIDE;
163  virtual bool Matches(const base::string16& word, bool exact) const OVERRIDE;
164  virtual bool HasMatchIn(
165      const QueryWordVector& words,
166      Snippet::MatchPositions* match_positions) const OVERRIDE;
167  virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE;
168  virtual void AppendWords(std::vector<base::string16>* words) const OVERRIDE;
169
170 protected:
171  int AppendChildrenToString(base::string16* query) const;
172
173  QueryNodeStarVector children_;
174
175 private:
176  DISALLOW_COPY_AND_ASSIGN(QueryNodeList);
177};
178
179QueryNodeList::QueryNodeList() {}
180
181QueryNodeList::~QueryNodeList() {
182  STLDeleteElements(&children_);
183}
184
185void QueryNodeList::AddChild(QueryNode* node) {
186  children_.push_back(node);
187}
188
189void QueryNodeList::RemoveEmptySubnodes() {
190  for (size_t i = 0; i < children_.size(); ++i) {
191    if (children_[i]->IsWord())
192      continue;
193
194    QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
195    list_node->RemoveEmptySubnodes();
196    if (list_node->children()->empty()) {
197      children_.erase(children_.begin() + i);
198      --i;
199      delete list_node;
200    }
201  }
202}
203
204int QueryNodeList::AppendToSQLiteQuery(base::string16* query) const {
205  return AppendChildrenToString(query);
206}
207
208bool QueryNodeList::IsWord() const {
209  return false;
210}
211
212bool QueryNodeList::Matches(const base::string16& word, bool exact) const {
213  NOTREACHED();
214  return false;
215}
216
217bool QueryNodeList::HasMatchIn(const QueryWordVector& words,
218                               Snippet::MatchPositions* match_positions) const {
219  NOTREACHED();
220  return false;
221}
222
223bool QueryNodeList::HasMatchIn(const QueryWordVector& words) const {
224  NOTREACHED();
225  return false;
226}
227
228void QueryNodeList::AppendWords(std::vector<base::string16>* words) const {
229  for (size_t i = 0; i < children_.size(); ++i)
230    children_[i]->AppendWords(words);
231}
232
233int QueryNodeList::AppendChildrenToString(base::string16* query) const {
234  int num_words = 0;
235  for (QueryNodeStarVector::const_iterator node = children_.begin();
236       node != children_.end(); ++node) {
237    if (node != children_.begin())
238      query->push_back(L' ');
239    num_words += (*node)->AppendToSQLiteQuery(query);
240  }
241  return num_words;
242}
243
244// A QueryNodePhrase is a phrase query ("quoted").
245class QueryNodePhrase : public QueryNodeList {
246 public:
247  QueryNodePhrase();
248  virtual ~QueryNodePhrase();
249
250  // QueryNodeList:
251  virtual int AppendToSQLiteQuery(base::string16* query) const OVERRIDE;
252  virtual bool HasMatchIn(
253      const QueryWordVector& words,
254      Snippet::MatchPositions* match_positions) const OVERRIDE;
255  virtual bool HasMatchIn(const QueryWordVector& words) const OVERRIDE;
256
257 private:
258  bool MatchesAll(const QueryWordVector& words,
259                  const QueryWord** first_word,
260                  const QueryWord** last_word) const;
261  DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase);
262};
263
264QueryNodePhrase::QueryNodePhrase() {}
265
266QueryNodePhrase::~QueryNodePhrase() {}
267
268int QueryNodePhrase::AppendToSQLiteQuery(base::string16* query) const {
269  query->push_back(L'"');
270  int num_words = AppendChildrenToString(query);
271  query->push_back(L'"');
272  return num_words;
273}
274
275bool QueryNodePhrase::MatchesAll(const QueryWordVector& words,
276                                 const QueryWord** first_word,
277                                 const QueryWord** last_word) const {
278  if (words.size() < children_.size())
279    return false;
280
281  for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
282    bool matched_all = true;
283    for (size_t j = 0; j < children_.size(); ++j) {
284      if (!children_[j]->Matches(words[i + j].word, true)) {
285        matched_all = false;
286        break;
287      }
288    }
289    if (matched_all) {
290      *first_word = &words[i];
291      *last_word = &words[i + children_.size() - 1];
292      return true;
293    }
294  }
295  return false;
296}
297
298bool QueryNodePhrase::HasMatchIn(
299    const QueryWordVector& words,
300    Snippet::MatchPositions* match_positions) const {
301  const QueryWord* first_word;
302  const QueryWord* last_word;
303
304  if (MatchesAll(words, &first_word, &last_word)) {
305    match_positions->push_back(
306        Snippet::MatchPosition(first_word->position,
307                               last_word->position + last_word->word.length()));
308      return true;
309  }
310  return false;
311}
312
313bool QueryNodePhrase::HasMatchIn(const QueryWordVector& words) const {
314  const QueryWord* first_word;
315  const QueryWord* last_word;
316  return MatchesAll(words, &first_word, &last_word);
317}
318
319QueryParser::QueryParser() {}
320
321// static
322bool QueryParser::IsWordLongEnoughForPrefixSearch(const base::string16& word) {
323  DCHECK(!word.empty());
324  size_t minimum_length = 3;
325  // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
326  // because they 'behave like' Latin letters. Moreover, we should
327  // normalize the former before reaching here.
328  if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
329    minimum_length = 2;
330  return word.size() >= minimum_length;
331}
332
333int QueryParser::ParseQuery(const base::string16& query,
334                            base::string16* sqlite_query) {
335  QueryNodeList root;
336  if (!ParseQueryImpl(query, &root))
337    return 0;
338  return root.AppendToSQLiteQuery(sqlite_query);
339}
340
341void QueryParser::ParseQueryWords(const base::string16& query,
342                                  std::vector<base::string16>* words) {
343  QueryNodeList root;
344  if (!ParseQueryImpl(query, &root))
345    return;
346  root.AppendWords(words);
347}
348
349void QueryParser::ParseQueryNodes(const base::string16& query,
350                                  QueryNodeStarVector* nodes) {
351  QueryNodeList root;
352  if (ParseQueryImpl(base::i18n::ToLower(query), &root))
353    nodes->swap(*root.children());
354}
355
356bool QueryParser::DoesQueryMatch(const base::string16& text,
357                                 const QueryNodeStarVector& query_nodes,
358                                 Snippet::MatchPositions* match_positions) {
359  if (query_nodes.empty())
360    return false;
361
362  QueryWordVector query_words;
363  base::string16 lower_text = base::i18n::ToLower(text);
364  ExtractQueryWords(lower_text, &query_words);
365
366  if (query_words.empty())
367    return false;
368
369  Snippet::MatchPositions matches;
370  for (size_t i = 0; i < query_nodes.size(); ++i) {
371    if (!query_nodes[i]->HasMatchIn(query_words, &matches))
372      return false;
373  }
374  if (lower_text.length() != text.length()) {
375    // The lower case string differs from the original string. The matches are
376    // meaningless.
377    // TODO(sky): we need a better way to align the positions so that we don't
378    // completely punt here.
379    match_positions->clear();
380  } else {
381    SortAndCoalesceMatchPositions(&matches);
382    match_positions->swap(matches);
383  }
384  return true;
385}
386
387bool QueryParser::DoesQueryMatch(const QueryWordVector& query_words,
388                                 const QueryNodeStarVector& query_nodes) {
389  if (query_nodes.empty() || query_words.empty())
390    return false;
391
392  for (size_t i = 0; i < query_nodes.size(); ++i) {
393    if (!query_nodes[i]->HasMatchIn(query_words))
394      return false;
395  }
396  return true;
397}
398
399bool QueryParser::ParseQueryImpl(const base::string16& query,
400                                 QueryNodeList* root) {
401  base::i18n::BreakIterator iter(query, base::i18n::BreakIterator::BREAK_WORD);
402  // TODO(evanm): support a locale here
403  if (!iter.Init())
404    return false;
405
406  // To handle nesting, we maintain a stack of QueryNodeLists.
407  // The last element (back) of the stack contains the current, deepest node.
408  std::vector<QueryNodeList*> query_stack;
409  query_stack.push_back(root);
410
411  bool in_quotes = false;  // whether we're currently in a quoted phrase
412  while (iter.Advance()) {
413    // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
414    // is not necessarily a word, but could also be a sequence of punctuation
415    // or whitespace.
416    if (iter.IsWord()) {
417      QueryNodeWord* word_node = new QueryNodeWord(iter.GetString());
418      if (in_quotes)
419        word_node->set_literal(true);
420      query_stack.back()->AddChild(word_node);
421    } else {  // Punctuation.
422      if (IsQueryQuote(query[iter.prev()])) {
423        if (!in_quotes) {
424          QueryNodeList* quotes_node = new QueryNodePhrase;
425          query_stack.back()->AddChild(quotes_node);
426          query_stack.push_back(quotes_node);
427          in_quotes = true;
428        } else {
429          query_stack.pop_back();  // Stop adding to the quoted phrase.
430          in_quotes = false;
431        }
432      }
433    }
434  }
435
436  root->RemoveEmptySubnodes();
437  return true;
438}
439
440void QueryParser::ExtractQueryWords(const base::string16& text,
441                                    QueryWordVector* words) {
442  base::i18n::BreakIterator iter(text, base::i18n::BreakIterator::BREAK_WORD);
443  // TODO(evanm): support a locale here
444  if (!iter.Init())
445    return;
446
447  while (iter.Advance()) {
448    // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
449    // is not necessarily a word, but could also be a sequence of punctuation
450    // or whitespace.
451    if (iter.IsWord()) {
452      base::string16 word = iter.GetString();
453      if (!word.empty()) {
454        words->push_back(QueryWord());
455        words->back().word = word;
456        words->back().position = iter.prev();
457     }
458    }
459  }
460}
461
462// static
463void QueryParser::SortAndCoalesceMatchPositions(
464    Snippet::MatchPositions* matches) {
465  std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
466  // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
467  // from matches.
468  for (size_t i = 0; i < matches->size(); ++i)
469    CoalesceMatchesFrom(i, matches);
470}
471
472}  // namespace query_parser
473