1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be
3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file.
4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/history/query_parser.h"
6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <algorithm>
8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
921d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen#include "base/i18n/break_iterator.h"
10c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h"
11ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_vector.h"
12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h"
13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h"
1472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen#include "ui/base/l10n/l10n_util.h"
15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/uscript.h"
16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace {
18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if |mp1.first| is less than |mp2.first|. This is used to
20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// sort match positions.
21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint CompareMatchPosition(const Snippet::MatchPosition& mp1,
22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                         const Snippet::MatchPosition& mp2) {
23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return mp1.first < mp2.first;
24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if |mp2| intersects |mp1|. This is intended for use by
27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// CoalesceMatchesFrom and isn't meant as a general intersectpion comparison
28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// function.
29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool SnippetIntersects(const Snippet::MatchPosition& mp1,
30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                       const Snippet::MatchPosition& mp2) {
31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return mp2.first >= mp1.first && mp2.first <= mp1.second;
32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
33c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
34c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Coalesces match positions in |matches| after index that intersect the match
35c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// position at |index|.
36c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CoalesceMatchesFrom(size_t index,
37c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                         Snippet::MatchPositions* matches) {
38c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Snippet::MatchPosition& mp = (*matches)[index];
39c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (Snippet::MatchPositions::iterator i = matches->begin() + index + 1;
40c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch       i != matches->end(); ) {
41c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (SnippetIntersects(mp, *i)) {
42c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      mp.second = i->second;
43c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      i = matches->erase(i);
44c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    } else {
45c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return;
46c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
47c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
48c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
49c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
50c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Sorts the match positions in |matches| by their first index, then coalesces
51c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// any match positions that intersect each other.
52c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CoalseAndSortMatchPositions(Snippet::MatchPositions* matches) {
53c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::sort(matches->begin(), matches->end(), &CompareMatchPosition);
54c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
55c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // from matches.
56c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < matches->size(); ++i)
57c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CoalesceMatchesFrom(i, matches);
58c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
59c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
60c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}  // namespace
61c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
62c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Inheritance structure:
63c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Queries are represented as trees of QueryNodes.
64c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// QueryNodes are either a collection of subnodes (a QueryNodeList)
65c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// or a single word (a QueryNodeWord).
66c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
67c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// A QueryNodeWord is a single word in the query.
68c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochclass QueryNodeWord : public QueryNode {
69c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch public:
70c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  explicit QueryNodeWord(const string16& word)
71c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      : word_(word), literal_(false) {}
72c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual ~QueryNodeWord() {}
73c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual int AppendToSQLiteQuery(string16* query) const;
74c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool IsWord() const { return true; }
75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
76c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const string16& word() const { return word_; }
77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  void set_literal(bool literal) { literal_ = literal; }
78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool HasMatchIn(const std::vector<QueryWord>& words,
80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                          Snippet::MatchPositions* match_positions) const;
81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool Matches(const string16& word, bool exact) const;
83c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual void AppendWords(std::vector<string16>* words) const;
84c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
85c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch private:
86c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  string16 word_;
87c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bool literal_;
88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch};
89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
90c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodeWord::HasMatchIn(const std::vector<QueryWord>& words,
91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                               Snippet::MatchPositions* match_positions) const {
92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < words.size(); ++i) {
93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (Matches(words[i].word, false)) {
94c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      size_t match_start = words[i].position;
95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      match_positions->push_back(
96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          Snippet::MatchPosition(match_start,
97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 match_start + static_cast<int>(word_.size())));
98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return true;
99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return false;
102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodeWord::Matches(const string16& word, bool exact) const {
105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (exact || !QueryParser::IsWordLongEnoughForPrefixSearch(word_))
106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return word == word_;
107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return word.size() >= word_.size() &&
108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         (word_.compare(0, word_.size(), word, 0, word_.size()) == 0);
109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryNodeWord::AppendWords(std::vector<string16>* words) const {
112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  words->push_back(word_);
113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint QueryNodeWord::AppendToSQLiteQuery(string16* query) const {
116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  query->append(word_);
117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Use prefix search if we're not literal and long enough.
119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!literal_ && QueryParser::IsWordLongEnoughForPrefixSearch(word_))
120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    *query += L'*';
121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return 1;
122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// A QueryNodeList has a collection of child QueryNodes
125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// which it cleans up after.
126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochclass QueryNodeList : public QueryNode {
127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch public:
128c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual ~QueryNodeList();
129c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual int AppendToSQLiteQuery(string16* query) const {
131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return AppendChildrenToString(query);
132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
133c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool IsWord() const { return false; }
134c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
135c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  void AddChild(QueryNode* node) { children_.push_back(node); }
136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  typedef std::vector<QueryNode*> QueryNodeVector;
138c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  QueryNodeVector* children() { return &children_; }
139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Remove empty subnodes left over from other parsing.
141c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  void RemoveEmptySubnodes();
142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
143c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // QueryNodeList is never used with Matches or HasMatchIn.
144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool Matches(const string16& word, bool exact) const {
145c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    NOTREACHED();
146c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
147c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool HasMatchIn(const std::vector<QueryWord>& words,
149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                          Snippet::MatchPositions* match_positions) const {
150c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    NOTREACHED();
151c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual void AppendWords(std::vector<string16>* words) const;
154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch protected:
156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int AppendChildrenToString(string16* query) const;
157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  QueryNodeVector children_;
159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch};
160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
161c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochQueryNodeList::~QueryNodeList() {
162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (QueryNodeVector::iterator node = children_.begin();
163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch       node != children_.end(); ++node)
164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    delete *node;
165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryNodeList::RemoveEmptySubnodes() {
168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < children_.size(); ++i) {
169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (children_[i]->IsWord())
170c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      continue;
171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    QueryNodeList* list_node = static_cast<QueryNodeList*>(children_[i]);
173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    list_node->RemoveEmptySubnodes();
174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (list_node->children()->empty()) {
175c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      children_.erase(children_.begin() + i);
176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      --i;
177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      delete list_node;
178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
179c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryNodeList::AppendWords(std::vector<string16>* words) const {
183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < children_.size(); ++i)
184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    children_[i]->AppendWords(words);
185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint QueryNodeList::AppendChildrenToString(string16* query) const {
188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int num_words = 0;
189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (QueryNodeVector::const_iterator node = children_.begin();
190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch       node != children_.end(); ++node) {
191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (node != children_.begin())
192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      query->push_back(L' ');
193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    num_words += (*node)->AppendToSQLiteQuery(query);
194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return num_words;
196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// A QueryNodePhrase is a phrase query ("quoted").
199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochclass QueryNodePhrase : public QueryNodeList {
200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch public:
201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual int AppendToSQLiteQuery(string16* query) const {
202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    query->push_back(L'"');
203c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    int num_words = AppendChildrenToString(query);
204c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    query->push_back(L'"');
205c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return num_words;
206c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
207c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
208c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool Matches(const string16& word, bool exact) const;
209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  virtual bool HasMatchIn(const std::vector<QueryWord>& words,
210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                          Snippet::MatchPositions* match_positions) const;
211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch};
212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodePhrase::Matches(const string16& word, bool exact) const {
214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  NOTREACHED();
215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return false;
216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryNodePhrase::HasMatchIn(
219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const std::vector<QueryWord>& words,
220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    Snippet::MatchPositions* match_positions) const {
221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (words.size() < children_.size())
222c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0, max = words.size() - children_.size() + 1; i < max; ++i) {
225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    bool matched_all = true;
226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    for (size_t j = 0; j < children_.size(); ++j) {
227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (!children_[j]->Matches(words[i + j].word, true)) {
228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        matched_all = false;
229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        break;
230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      }
231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (matched_all) {
233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      const QueryWord& last_word = words[i + children_.size() - 1];
234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      match_positions->push_back(
235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          Snippet::MatchPosition(words[i].position,
236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 last_word.position + last_word.word.length()));
237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return true;
238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return false;
241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
243c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochQueryParser::QueryParser() {
244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
245c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// static
247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryParser::IsWordLongEnoughForPrefixSearch(const string16& word) {
248dc0f95d653279beabeb9817299e2902918ba123eKristian Monsen  DCHECK(!word.empty());
249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  size_t minimum_length = 3;
250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // because they 'behave like' Latin letters. Moreover, we should
252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // normalize the former before reaching here.
253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (0xAC00 <= word[0] && word[0] <= 0xD7A3)
254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    minimum_length = 2;
255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return word.size() >= minimum_length;
256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if the character is considered a quote.
259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool IsQueryQuote(wchar_t ch) {
260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return ch == '"' ||
261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         ch == 0xab ||    // left pointing double angle bracket
262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         ch == 0xbb ||    // right pointing double angle bracket
263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         ch == 0x201c ||  // left double quotation mark
264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         ch == 0x201d ||  // right double quotation mark
265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         ch == 0x201e;    // double low-9 quotation mark
266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochint QueryParser::ParseQuery(const string16& query,
269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                            string16* sqlite_query) {
270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  QueryNodeList root;
271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!ParseQueryImpl(query, &root))
272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return 0;
273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return root.AppendToSQLiteQuery(sqlite_query);
274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryParser::ParseQuery(const string16& query,
277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                             std::vector<QueryNode*>* nodes) {
278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  QueryNodeList root;
279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (ParseQueryImpl(l10n_util::ToLower(query), &root))
280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    nodes->swap(*root.children());
281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryParser::ExtractQueryWords(const string16& query,
285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    std::vector<string16>* words) {
286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  QueryNodeList root;
287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!ParseQueryImpl(query, &root))
288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  root.AppendWords(words);
290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
292c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryParser::DoesQueryMatch(const string16& text,
293c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 const std::vector<QueryNode*>& query_nodes,
294c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 Snippet::MatchPositions* match_positions) {
295c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (query_nodes.empty())
296c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
297c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
298c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<QueryWord> query_words;
299c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  string16 lower_text = l10n_util::ToLower(text);
300c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  ExtractQueryWords(lower_text, &query_words);
301c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
302c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (query_words.empty())
303c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
304c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
305c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Snippet::MatchPositions matches;
306c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < query_nodes.size(); ++i) {
307c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (!query_nodes[i]->HasMatchIn(query_words, &matches))
308c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return false;
309c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
310c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (lower_text.length() != text.length()) {
311c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // The lower case string differs from the original string. The matches are
312c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // meaningless.
313c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // TODO(sky): we need a better way to align the positions so that we don't
314c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // completely punt here.
315c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    match_positions->clear();
316c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  } else {
317c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CoalseAndSortMatchPositions(&matches);
318c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    match_positions->swap(matches);
319c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
320c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return true;
321c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
322c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
323c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool QueryParser::ParseQueryImpl(const string16& query,
324c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                QueryNodeList* root) {
32521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen  base::BreakIterator iter(&query, base::BreakIterator::BREAK_WORD);
326c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // TODO(evanm): support a locale here
327c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!iter.Init())
328c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
329c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
330c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // To handle nesting, we maintain a stack of QueryNodeLists.
331c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // The last element (back) of the stack contains the current, deepest node.
332c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<QueryNodeList*> query_stack;
333c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  query_stack.push_back(root);
334c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
335c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bool in_quotes = false;  // whether we're currently in a quoted phrase
336c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while (iter.Advance()) {
337c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
338c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // is not necessarily a word, but could also be a sequence of punctuation
339c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // or whitespace.
340c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (iter.IsWord()) {
34121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen      string16 word = iter.GetString();
342c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
343c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      QueryNodeWord* word_node = new QueryNodeWord(word);
344c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (in_quotes)
345c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        word_node->set_literal(true);
346c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      query_stack.back()->AddChild(word_node);
347c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    } else {  // Punctuation.
348c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (IsQueryQuote(query[iter.prev()])) {
349c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        if (!in_quotes) {
350c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          QueryNodeList* quotes_node = new QueryNodePhrase;
351c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          query_stack.back()->AddChild(quotes_node);
352c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          query_stack.push_back(quotes_node);
353c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          in_quotes = true;
354c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        } else {
355c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          query_stack.pop_back();  // Stop adding to the quoted phrase.
356c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          in_quotes = false;
357c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        }
358c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      }
359c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
360c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
361c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
362c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  root->RemoveEmptySubnodes();
363c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return true;
364c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
365c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
366c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid QueryParser::ExtractQueryWords(const string16& text,
367c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    std::vector<QueryWord>* words) {
36821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen  base::BreakIterator iter(&text, base::BreakIterator::BREAK_WORD);
369c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // TODO(evanm): support a locale here
370c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!iter.Init())
371c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
372c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
373c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while (iter.Advance()) {
374c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
375c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // is not necessarily a word, but could also be a sequence of punctuation
376c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // or whitespace.
377c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (iter.IsWord()) {
37821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen      string16 word = iter.GetString();
379c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (!word.empty()) {
380c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        words->push_back(QueryWord());
381c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        words->back().word = word;
382c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        words->back().position = iter.prev();
383c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      }
384c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
385c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
386c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
387