106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// Use of this source code is governed by a BSD-style license that can be
306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// found in the LICENSE file.
406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch#include "chrome/common/visitedlink_common.h"
606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
73345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include <string.h>  // for memset()
83345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick
906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch#include "base/logging.h"
1006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch#include "base/md5.h"
113345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "googleurl/src/gurl.h"
1206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
1306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdochconst VisitedLinkCommon::Fingerprint VisitedLinkCommon::null_fingerprint_ = 0;
1406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdochconst VisitedLinkCommon::Hash VisitedLinkCommon::null_hash_ = -1;
1506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
1606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen MurdochVisitedLinkCommon::VisitedLinkCommon()
1706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    : hash_table_(NULL),
1806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      table_length_(0) {
193345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  memset(salt_, 0, sizeof(salt_));
2006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch}
2106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
2206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen MurdochVisitedLinkCommon::~VisitedLinkCommon() {
2306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch}
2406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
2506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// FIXME: this uses linear probing, it should be replaced with quadratic
2606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// probing or something better. See VisitedLinkMaster::AddFingerprint
2706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdochbool VisitedLinkCommon::IsVisited(const char* canonical_url,
2806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch                                  size_t url_len) const {
2906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  if (url_len == 0)
3006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    return false;
3106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  if (!hash_table_ || table_length_ == 0)
3206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    return false;
3306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  return IsVisited(ComputeURLFingerprint(canonical_url, url_len));
3406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch}
3506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
363345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrickbool VisitedLinkCommon::IsVisited(const GURL& url) const {
373345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick  return IsVisited(url.spec().data(), url.spec().size());
383345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick}
393345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick
4006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdochbool VisitedLinkCommon::IsVisited(Fingerprint fingerprint) const {
4106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // Go through the table until we find the item or an empty spot (meaning it
4206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // wasn't found). This loop will terminate as long as the table isn't full,
4306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // which should be enforced by AddFingerprint.
4406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  Hash first_hash = HashFingerprint(fingerprint);
4506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  Hash cur_hash = first_hash;
4606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  while (true) {
4706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    Fingerprint cur_fingerprint = FingerprintAt(cur_hash);
4806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    if (cur_fingerprint == null_fingerprint_)
4906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      return false;  // End of probe sequence found.
5006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    if (cur_fingerprint == fingerprint)
5106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      return true;  // Found a match.
5206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
5306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    // This spot was taken, but not by the item we're looking for, search in
5406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    // the next position.
5506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    cur_hash++;
5606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    if (cur_hash == table_length_)
5706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      cur_hash = 0;
5806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    if (cur_hash == first_hash) {
5906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      // Wrapped around and didn't find an empty space, this means we're in an
6006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      // infinite loop because AddFingerprint didn't do its job resizing.
6106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      NOTREACHED();
6206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      return false;
6306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    }
6406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  }
6506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch}
6606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
6706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// Uses the top 64 bits of the MD5 sum of the canonical URL as the fingerprint,
6806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// this is as random as any other subset of the MD5SUM.
6906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch//
7006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// FIXME: this uses the MD5SUM of the 16-bit character version. For systems
7106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// where wchar_t is not 16 bits (Linux uses 32 bits, I think), this will not be
7206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// compatable. We should define explicitly what should happen here across
7306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// platforms, and convert if necessary (probably to UTF-16).
7406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
7506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch// static
7606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen MurdochVisitedLinkCommon::Fingerprint VisitedLinkCommon::ComputeURLFingerprint(
7706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    const char* canonical_url,
7806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    size_t url_len,
7906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch    const uint8 salt[LINK_SALT_LENGTH]) {
8006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  DCHECK(url_len > 0) << "Canonical URLs should not be empty";
8106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
8206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  MD5Context ctx;
8306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  MD5Init(&ctx);
8406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  MD5Update(&ctx, salt, sizeof(salt));
8506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  MD5Update(&ctx, canonical_url, url_len * sizeof(char));
8606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
8706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  MD5Digest digest;
8806741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  MD5Final(&digest, &ctx);
8906741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch
9006741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // This is the same as "return *(Fingerprint*)&digest.a;" but if we do that
9106741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // direct cast the alignment could be wrong, and we can't access a 64-bit int
9206741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // on arbitrary alignment on some processors. This reinterpret_casts it
9306741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // down to a char array of the same size as fingerprint, and then does the
9406741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  // bit cast, which amounts to a memcpy. This does not handle endian issues.
9506741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch  return bit_cast<Fingerprint, uint8[8]>(
9606741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch      *reinterpret_cast<uint8(*)[8]>(&digest.a));
9706741cbc25cd4227a9fba40dfd0273bfcc1a587aBen Murdoch}
98