1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/history/visitsegment_database.h"
6
7#include <math.h>
8
9#include <algorithm>
10#include <string>
11#include <vector>
12
13#include "app/sql/statement.h"
14#include "base/logging.h"
15#include "base/stl_util-inl.h"
16#include "base/string_util.h"
17#include "base/utf_string_conversions.h"
18#include "chrome/browser/history/page_usage_data.h"
19
20// The following tables are used to store url segment information.
21//
22// segments
23//   id                 Primary key
24//   name               A unique string to represent that segment. (URL derived)
25//   url_id             ID of the url currently used to represent this segment.
26//   pres_index         index used to store a fixed presentation position.
27//
28// segment_usage
29//   id                 Primary key
30//   segment_id         Corresponding segment id
31//   time_slot          time stamp identifying for what day this entry is about
32//   visit_count        Number of visit in the segment
33//
34
35namespace history {
36
37VisitSegmentDatabase::VisitSegmentDatabase() {
38}
39
40VisitSegmentDatabase::~VisitSegmentDatabase() {
41}
42
43bool VisitSegmentDatabase::InitSegmentTables() {
44  // Segments table.
45  if (!GetDB().DoesTableExist("segments")) {
46    if (!GetDB().Execute("CREATE TABLE segments ("
47        "id INTEGER PRIMARY KEY,"
48        "name VARCHAR,"
49        "url_id INTEGER NON NULL,"
50        "pres_index INTEGER DEFAULT -1 NOT NULL)")) {
51      NOTREACHED();
52      return false;
53    }
54
55    if (!GetDB().Execute("CREATE INDEX segments_name ON segments(name)")) {
56      NOTREACHED();
57      return false;
58    }
59  }
60
61  // This was added later, so we need to try to create it even if the table
62  // already exists.
63  GetDB().Execute("CREATE INDEX segments_url_id ON segments(url_id)");
64
65  // Segment usage table.
66  if (!GetDB().DoesTableExist("segment_usage")) {
67    if (!GetDB().Execute("CREATE TABLE segment_usage ("
68        "id INTEGER PRIMARY KEY,"
69        "segment_id INTEGER NOT NULL,"
70        "time_slot INTEGER NOT NULL,"
71        "visit_count INTEGER DEFAULT 0 NOT NULL)")) {
72      NOTREACHED();
73      return false;
74    }
75    if (!GetDB().Execute(
76        "CREATE INDEX segment_usage_time_slot_segment_id ON "
77        "segment_usage(time_slot, segment_id)")) {
78      NOTREACHED();
79      return false;
80    }
81  }
82
83  // Added in a later version, so we always need to try to creat this index.
84  GetDB().Execute("CREATE INDEX segments_usage_seg_id "
85                  "ON segment_usage(segment_id)");
86
87  // Presentation index table.
88  //
89  // Important note:
90  // Right now, this table is only used to store the presentation index.
91  // If you need to add more columns, keep in mind that rows are currently
92  // deleted when the presentation index is changed to -1.
93  // See SetPagePresentationIndex() in this file
94  if (!GetDB().DoesTableExist("presentation")) {
95    if (!GetDB().Execute("CREATE TABLE presentation("
96        "url_id INTEGER PRIMARY KEY,"
97        "pres_index INTEGER NOT NULL)"))
98      return false;
99  }
100  return true;
101}
102
103bool VisitSegmentDatabase::DropSegmentTables() {
104  // Dropping the tables will implicitly delete the indices.
105  return GetDB().Execute("DROP TABLE segments") &&
106         GetDB().Execute("DROP TABLE segment_usage");
107}
108
109// Note: the segment name is derived from the URL but is not a URL. It is
110// a string that can be easily recreated from various URLS. Maybe this should
111// be an MD5 to limit the length.
112//
113// static
114std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
115  // TODO(brettw) this should probably use the registry controlled
116  // domains service.
117  GURL::Replacements r;
118  const char kWWWDot[] = "www.";
119  const int kWWWDotLen = arraysize(kWWWDot) - 1;
120
121  std::string host = url.host();
122  const char* host_c = host.c_str();
123  // Remove www. to avoid some dups.
124  if (static_cast<int>(host.size()) > kWWWDotLen &&
125      LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) {
126    r.SetHost(host.c_str(),
127              url_parse::Component(kWWWDotLen,
128                  static_cast<int>(host.size()) - kWWWDotLen));
129  }
130  // Remove other stuff we don't want.
131  r.ClearUsername();
132  r.ClearPassword();
133  r.ClearQuery();
134  r.ClearRef();
135  r.ClearPort();
136
137  return url.ReplaceComponents(r).spec();
138}
139
140SegmentID VisitSegmentDatabase::GetSegmentNamed(
141    const std::string& segment_name) {
142  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
143      "SELECT id FROM segments WHERE name = ?"));
144  if (!statement)
145    return 0;
146
147  statement.BindString(0, segment_name);
148  if (statement.Step())
149    return statement.ColumnInt64(0);
150  return 0;
151}
152
153bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id,
154                                                          URLID url_id) {
155  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
156      "UPDATE segments SET url_id = ? WHERE id = ?"));
157  if (!statement)
158    return false;
159
160  statement.BindInt64(0, url_id);
161  statement.BindInt64(1, segment_id);
162  return statement.Run();
163}
164
165URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) {
166  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
167      "SELECT url_id FROM segments WHERE id = ?"));
168  if (!statement)
169    return 0;
170
171  statement.BindInt64(0, segment_id);
172  if (statement.Step())
173    return statement.ColumnInt64(0);
174  return 0;
175}
176
177SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id,
178                                              const std::string& segment_name) {
179  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
180      "INSERT INTO segments (name, url_id) VALUES (?,?)"));
181  if (!statement)
182    return false;
183
184  statement.BindString(0, segment_name);
185  statement.BindInt64(1, url_id);
186  if (statement.Run())
187    return GetDB().GetLastInsertRowId();
188  return false;
189}
190
191bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id,
192                                                     base::Time ts,
193                                                     int amount) {
194  base::Time t = ts.LocalMidnight();
195
196  sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
197      "SELECT id, visit_count FROM segment_usage "
198      "WHERE time_slot = ? AND segment_id = ?"));
199  if (!select)
200    return false;
201
202  select.BindInt64(0, t.ToInternalValue());
203  select.BindInt64(1, segment_id);
204  if (select.Step()) {
205    sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE,
206        "UPDATE segment_usage SET visit_count = ? WHERE id = ?"));
207    if (!update)
208      return false;
209
210    update.BindInt64(0, select.ColumnInt64(1) + static_cast<int64>(amount));
211    update.BindInt64(1, select.ColumnInt64(0));
212    return update.Run();
213
214  } else {
215    sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE,
216        "INSERT INTO segment_usage "
217        "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)"));
218    if (!insert)
219      return false;
220
221    insert.BindInt64(0, segment_id);
222    insert.BindInt64(1, t.ToInternalValue());
223    insert.BindInt64(2, static_cast<int64>(amount));
224    return insert.Run();
225  }
226}
227
228void VisitSegmentDatabase::QuerySegmentUsage(
229    base::Time from_time,
230    int max_result_count,
231    std::vector<PageUsageData*>* results) {
232  // This function gathers the highest-ranked segments in two queries.
233  // The first gathers scores for all segments.
234  // The second gathers segment data (url, title, etc.) for the highest-ranked
235  // segments.
236  // TODO(evanm): this disregards the "presentation index", which was what was
237  // used to lock results into position.  But the rest of our code currently
238  // does as well.
239
240  // Gather all the segment scores.
241  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
242      "SELECT segment_id, time_slot, visit_count "
243      "FROM segment_usage WHERE time_slot >= ? "
244      "ORDER BY segment_id"));
245  if (!statement) {
246    NOTREACHED() << GetDB().GetErrorMessage();
247    return;
248  }
249
250  base::Time ts = from_time.LocalMidnight();
251  statement.BindInt64(0, ts.ToInternalValue());
252
253  base::Time now = base::Time::Now();
254  SegmentID last_segment_id = 0;
255  PageUsageData* pud = NULL;
256  float score = 0;
257  while (statement.Step()) {
258    SegmentID segment_id = statement.ColumnInt64(0);
259    if (segment_id != last_segment_id) {
260      if (pud) {
261        pud->SetScore(score);
262        results->push_back(pud);
263      }
264
265      pud = new PageUsageData(segment_id);
266      score = 0;
267      last_segment_id = segment_id;
268    }
269
270    base::Time timeslot =
271        base::Time::FromInternalValue(statement.ColumnInt64(1));
272    int visit_count = statement.ColumnInt(2);
273    int days_ago = (now - timeslot).InDays();
274
275    // Score for this day in isolation.
276    float day_visits_score = 1.0f + log(static_cast<float>(visit_count));
277    // Recent visits count more than historical ones, so we multiply in a boost
278    // related to how long ago this day was.
279    // This boost is a curve that smoothly goes through these values:
280    // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x
281    // at the limit of how far we reach into the past.
282    float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f)));
283    score += recency_boost * day_visits_score;
284  }
285
286  if (pud) {
287    pud->SetScore(score);
288    results->push_back(pud);
289  }
290
291  // Limit to the top kResultCount results.
292  sort(results->begin(), results->end(), PageUsageData::Predicate);
293  if (static_cast<int>(results->size()) > max_result_count) {
294    STLDeleteContainerPointers(results->begin() + max_result_count,
295                               results->end());
296    results->resize(max_result_count);
297  }
298
299  // Now fetch the details about the entries we care about.
300  sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE,
301      "SELECT urls.url, urls.title FROM urls "
302      "JOIN segments ON segments.url_id = urls.id "
303      "WHERE segments.id = ?"));
304  if (!statement2) {
305    NOTREACHED() << GetDB().GetErrorMessage();
306    return;
307  }
308  for (size_t i = 0; i < results->size(); ++i) {
309    PageUsageData* pud = (*results)[i];
310    statement2.BindInt64(0, pud->GetID());
311    if (statement2.Step()) {
312      pud->SetURL(GURL(statement2.ColumnString(0)));
313      pud->SetTitle(UTF8ToUTF16(statement2.ColumnString(1)));
314    }
315    statement2.Reset();
316  }
317}
318
319void VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) {
320  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
321      "DELETE FROM segment_usage WHERE time_slot < ?"));
322  if (!statement)
323    return;
324
325  statement.BindInt64(0, older_than.LocalMidnight().ToInternalValue());
326  if (!statement.Run())
327    NOTREACHED();
328}
329
330void VisitSegmentDatabase::SetSegmentPresentationIndex(SegmentID segment_id,
331                                                       int index) {
332  sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE,
333      "UPDATE segments SET pres_index = ? WHERE id = ?"));
334  if (!statement)
335    return;
336
337  statement.BindInt(0, index);
338  statement.BindInt64(1, segment_id);
339  if (!statement.Run())
340    NOTREACHED();
341  else
342    DCHECK_EQ(1, GetDB().GetLastChangeCount());
343}
344
345bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) {
346  sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE,
347      "SELECT id FROM segments WHERE url_id = ?"));
348  if (!select)
349    return false;
350
351  sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE,
352      "DELETE FROM segments WHERE id = ?"));
353  if (!delete_seg)
354    return false;
355
356  sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE,
357      "DELETE FROM segment_usage WHERE segment_id = ?"));
358  if (!delete_usage)
359    return false;
360
361  bool r = true;
362  select.BindInt64(0, url_id);
363  // In theory there could not be more than one segment using that URL but we
364  // loop anyway to cleanup any inconsistency.
365  while (select.Step()) {
366    SegmentID segment_id = select.ColumnInt64(0);
367
368    delete_usage.BindInt64(0, segment_id);
369    if (!delete_usage.Run()) {
370      NOTREACHED();
371      r = false;
372    }
373
374    delete_seg.BindInt64(0, segment_id);
375    if (!delete_seg.Run()) {
376      NOTREACHED();
377      r = false;
378    }
379    delete_usage.Reset();
380    delete_seg.Reset();
381  }
382  return r;
383}
384
385}  // namespace history
386