1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/query_parser.h"
6
7#include <algorithm>
8
9#include "base/i18n/break_iterator.h"
10#include "base/logging.h"
11#include "base/memory/scoped_vector.h"
12#include "base/string_util.h"
13#include "base/utf_string_conversions.h"
14#include "ui/base/l10n/l10n_util.h"
15#include "unicode/uscript.h"
16
17namespace {
18
19// Returns true if |mp1.first| is less than |mp2.first|. This is used to
20// sort match positions.
21int CompareMatchPosition(const Snippet::MatchPosition& mp1,
22                         const Snippet::MatchPosition& mp2) {
23  return mp1.first < mp2.first;
24}
25
26// Returns true if |mp2| intersects |mp1|. This is intended for use by
27// CoalesceMatchesFrom and isn't meant as a general intersectpion comparison
28// function.
29bool SnippetIntersects(const Snippet::MatchPosition& mp1,
30                       const Snippet::MatchPosition& mp2) {
31  return mp2.first >= mp1.first && mp2.first <= mp1.second;
32}
33
34// Coalesces match positions in |matches| after index that intersect the match
35// position at |index|.
36void CoalesceMatchesFrom(size_t index,
37                         Snippet::MatchPositions* matches) {
38  Snippet::MatchPosition& mp = (*matches)[index];
39  for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
40       i != matches->end(); ) {
41    if (SnippetIntersects(mp, *i)) {
42      mp.second = i->second;
43      i = matches->erase(i);
44    } else {
45      return;
46    }
47  }
48}
49
50// Sorts the match positions in |matches| by their first index, then coalesces
51// any match positions that intersect each other.
52void CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
53  std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
54  // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
55  // from matches.
56  for (size_t i = 0; i < matches->size(); ++i)
57    CoalesceMatchesFrom(i, matches);
58}
59
60}  // namespace
61
62// Inheritance structure:
63// Queries are represented as trees of QueryNodes.
64// QueryNodes are either a collection of subnodes (a QueryNodeList)
65// or a single word (a QueryNodeWord).
66
67// A QueryNodeWord is a single word in the query.
68class QueryNodeWord : public QueryNode {
69 public:
70  explicit QueryNodeWord(const string16& word)
71      : word_(word), literal_(false) {}
72  virtual ~QueryNodeWord() {}
73  virtual int AppendToSQLiteQuery(string16* query) const;
74  virtual bool IsWord() const { return true; }
75
76  const string16& word() const { return word_; }
77  void set_literal(bool literal) { literal_ = literal; }
78
79  virtual bool HasMatchIn(const std::vector<QueryWord>& words,
80                          Snippet::MatchPositions* match_positions) const;
81
82  virtual bool Matches(const string16& word, bool exact) const;
83  virtual void AppendWords(std::vector<string16>* words) const;
84
85 private:
86  string16 word_;
87  bool literal_;
88};
89
90bool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words,
91                               Snippet::MatchPositions* match_positions) const {
92  for (size_t i = 0; i < words.size(); ++i) {
93    if (Matches(words[i].word, false)) {
94      size_t match_start = words[i].position;
95      match_positions->push_back(
96          Snippet::MatchPosition(match_start,
97                                 match_start + static_cast<int>(word_.size())));
98      return true;
99    }
100  }
101  return false;
102}
103
104bool QueryNodeWord::Matches(const string16& word, bool exact) const {
105  if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
106    return word == word_;
107  return word.size() >= word_.size() &&
108         (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
109}
110
111void QueryNodeWord::AppendWords(std::vector<string16>* words) const {
112  words->push_back(word_);
113}
114
115int QueryNodeWord::AppendToSQLiteQuery(string16* query) const {
116  query->append(word_);
117
118  // Use prefix search if we're not literal and long enough.
119  if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
120    *query += L'*';
121  return 1;
122}
123
124// A QueryNodeList has a collection of child QueryNodes
125// which it cleans up after.
126class QueryNodeList : public QueryNode {
127 public:
128  virtual ~QueryNodeList();
129
130  virtual int AppendToSQLiteQuery(string16* query) const {
131    return AppendChildrenToString(query);
132  }
133  virtual bool IsWord() const { return false; }
134
135  void AddChild(QueryNode* node) { children_.push_back(node); }
136
137  typedef std::vector<QueryNode*> QueryNodeVector;
138  QueryNodeVector* children() { return &children_; }
139
140  // Remove empty subnodes left over from other parsing.
141  void RemoveEmptySubnodes();
142
143  // QueryNodeList is never used with Matches or HasMatchIn.
144  virtual bool Matches(const string16& word, bool exact) const {
145    NOTREACHED();
146    return false;
147  }
148  virtual bool HasMatchIn(const std::vector<QueryWord>& words,
149                          Snippet::MatchPositions* match_positions) const {
150    NOTREACHED();
151    return false;
152  }
153  virtual void AppendWords(std::vector<string16>* words) const;
154
155 protected:
156  int AppendChildrenToString(string16* query) const;
157
158  QueryNodeVector children_;
159};
160
161QueryNodeList::~QueryNodeList() {
162  for (QueryNodeVector::iterator node = children_.begin();
163       node != children_.end(); ++node)
164    delete *node;
165}
166
167void QueryNodeList::RemoveEmptySubnodes() {
168  for (size_t i = 0; i < children_.size(); ++i) {
169    if (children_[i]->IsWord())
170      continue;
171
172    QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
173    list_node->RemoveEmptySubnodes();
174    if (list_node->children()->empty()) {
175      children_.erase(children_.begin() + i);
176      --i;
177      delete list_node;
178    }
179  }
180}
181
182void QueryNodeList::AppendWords(std::vector<string16>* words) const {
183  for (size_t i = 0; i < children_.size(); ++i)
184    children_[i]->AppendWords(words);
185}
186
187int QueryNodeList::AppendChildrenToString(string16* query) const {
188  int num_words = 0;
189  for (QueryNodeVector::const_iterator node = children_.begin();
190       node != children_.end(); ++node) {
191    if (node != children_.begin())
192      query->push_back(L' ');
193    num_words += (*node)->AppendToSQLiteQuery(query);
194  }
195  return num_words;
196}
197
198// A QueryNodePhrase is a phrase query ("quoted").
199class QueryNodePhrase : public QueryNodeList {
200 public:
201  virtual int AppendToSQLiteQuery(string16* query) const {
202    query->push_back(L'"');
203    int num_words = AppendChildrenToString(query);
204    query->push_back(L'"');
205    return num_words;
206  }
207
208  virtual bool Matches(const string16& word, bool exact) const;
209  virtual bool HasMatchIn(const std::vector<QueryWord>& words,
210                          Snippet::MatchPositions* match_positions) const;
211};
212
213bool QueryNodePhrase::Matches(const string16& word, bool exact) const {
214  NOTREACHED();
215  return false;
216}
217
218bool QueryNodePhrase::HasMatchIn(
219    const std::vector<QueryWord>& words,
220    Snippet::MatchPositions* match_positions) const {
221  if (words.size() < children_.size())
222    return false;
223
224  for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
225    bool matched_all = true;
226    for (size_t j = 0; j < children_.size(); ++j) {
227      if (!children_[j]->Matches(words[i + j].word, true)) {
228        matched_all = false;
229        break;
230      }
231    }
232    if (matched_all) {
233      const QueryWord& last_word = words[i + children_.size() - 1];
234      match_positions->push_back(
235          Snippet::MatchPosition(words[i].position,
236                                 last_word.position + last_word.word.length()));
237      return true;
238    }
239  }
240  return false;
241}
242
243QueryParser::QueryParser() {
244}
245
246// static
247bool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) {
248  DCHECK(!word.empty());
249  size_t minimum_length = 3;
250  // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
251  // because they 'behave like' Latin letters. Moreover, we should
252  // normalize the former before reaching here.
253  if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
254    minimum_length = 2;
255  return word.size() >= minimum_length;
256}
257
258// Returns true if the character is considered a quote.
259static bool IsQueryQuote(wchar_t ch) {
260  return ch == '"' ||
261         ch == 0xab ||    // left pointing double angle bracket
262         ch == 0xbb ||    // right pointing double angle bracket
263         ch == 0x201c ||  // left double quotation mark
264         ch == 0x201d ||  // right double quotation mark
265         ch == 0x201e;    // double low-9 quotation mark
266}
267
268int QueryParser::ParseQuery(const string16& query,
269                            string16* sqlite_query) {
270  QueryNodeList root;
271  if (!ParseQueryImpl(query, &root))
272    return 0;
273  return root.AppendToSQLiteQuery(sqlite_query);
274}
275
276void QueryParser::ParseQuery(const string16& query,
277                             std::vector<QueryNode*>* nodes) {
278  QueryNodeList root;
279  if (ParseQueryImpl(l10n_util::ToLower(query), &root))
280    nodes->swap(*root.children());
281}
282
283
284void QueryParser::ExtractQueryWords(const string16& query,
285                                    std::vector<string16>* words) {
286  QueryNodeList root;
287  if (!ParseQueryImpl(query, &root))
288    return;
289  root.AppendWords(words);
290}
291
292bool QueryParser::DoesQueryMatch(const string16& text,
293                                 const std::vector<QueryNode*>& query_nodes,
294                                 Snippet::MatchPositions* match_positions) {
295  if (query_nodes.empty())
296    return false;
297
298  std::vector<QueryWord> query_words;
299  string16 lower_text = l10n_util::ToLower(text);
300  ExtractQueryWords(lower_text, &query_words);
301
302  if (query_words.empty())
303    return false;
304
305  Snippet::MatchPositions matches;
306  for (size_t i = 0; i < query_nodes.size(); ++i) {
307    if (!query_nodes[i]->HasMatchIn(query_words, &matches))
308      return false;
309  }
310  if (lower_text.length() != text.length()) {
311    // The lower case string differs from the original string. The matches are
312    // meaningless.
313    // TODO(sky): we need a better way to align the positions so that we don't
314    // completely punt here.
315    match_positions->clear();
316  } else {
317    CoalseAndSortMatchPositions(&matches);
318    match_positions->swap(matches);
319  }
320  return true;
321}
322
323bool QueryParser::ParseQueryImpl(const string16& query,
324                                QueryNodeList* root) {
325  base::BreakIterator iter(&query, base::BreakIterator::BREAK_WORD);
326  // TODO(evanm): support a locale here
327  if (!iter.Init())
328    return false;
329
330  // To handle nesting, we maintain a stack of QueryNodeLists.
331  // The last element (back) of the stack contains the current, deepest node.
332  std::vector<QueryNodeList*> query_stack;
333  query_stack.push_back(root);
334
335  bool in_quotes = false;  // whether we're currently in a quoted phrase
336  while (iter.Advance()) {
337    // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
338    // is not necessarily a word, but could also be a sequence of punctuation
339    // or whitespace.
340    if (iter.IsWord()) {
341      string16 word = iter.GetString();
342
343      QueryNodeWord* word_node = new QueryNodeWord(word);
344      if (in_quotes)
345        word_node->set_literal(true);
346      query_stack.back()->AddChild(word_node);
347    } else {  // Punctuation.
348      if (IsQueryQuote(query[iter.prev()])) {
349        if (!in_quotes) {
350          QueryNodeList* quotes_node = new QueryNodePhrase;
351          query_stack.back()->AddChild(quotes_node);
352          query_stack.push_back(quotes_node);
353          in_quotes = true;
354        } else {
355          query_stack.pop_back();  // Stop adding to the quoted phrase.
356          in_quotes = false;
357        }
358      }
359    }
360  }
361
362  root->RemoveEmptySubnodes();
363  return true;
364}
365
366void QueryParser::ExtractQueryWords(const string16& text,
367                                    std::vector<QueryWord>* words) {
368  base::BreakIterator iter(&text, base::BreakIterator::BREAK_WORD);
369  // TODO(evanm): support a locale here
370  if (!iter.Init())
371    return;
372
373  while (iter.Advance()) {
374    // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
375    // is not necessarily a word, but could also be a sequence of punctuation
376    // or whitespace.
377    if (iter.IsWord()) {
378      string16 word = iter.GetString();
379      if (!word.empty()) {
380        words->push_back(QueryWord());
381        words->back().word = word;
382        words->back().position = iter.prev();
383      }
384    }
385  }
386}
387