components/query_parser/snippet.cc

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/query_parser/snippet.h"

#include <algorithm>

#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "third_party/icu/source/common/unicode/brkiter.h"
#include "third_party/icu/source/common/unicode/utext.h"
#include "third_party/icu/source/common/unicode/utf8.h"

namespace query_parser {
namespace {

bool PairFirstLessThan(const Snippet::MatchPosition& a,
                       const Snippet::MatchPosition& b) {
  return a.first < b.first;
}

// Combines all pairs after offset in match_positions that are contained
// or touch the pair at offset.
void CoalescePositionsFrom(size_t offset,
                           Snippet::MatchPositions* match_positions) {
  DCHECK(offset < match_positions->size());
  Snippet::MatchPosition& pair((*match_positions)[offset]);
  ++offset;
  while (offset < match_positions->size() &&
         pair.second >= (*match_positions)[offset].first) {
    pair.second = std::max(pair.second, (*match_positions)[offset].second);
    match_positions->erase(match_positions->begin() + offset);
  }
}

// Makes sure there is a pair in match_positions that contains the specified
// range. This keeps the pairs ordered in match_positions by first, and makes
// sure none of the pairs in match_positions touch each other.
void AddMatch(size_t start,
              size_t end,
              Snippet::MatchPositions* match_positions) {
  DCHECK(start < end);
  DCHECK(match_positions);
  Snippet::MatchPosition pair(start, end);
  if (match_positions->empty()) {
    match_positions->push_back(pair);
    return;
  }
  // There's at least one match. Find the position of the new match,
  // potentially extending pairs around it.
  Snippet::MatchPositions::iterator i =
      std::lower_bound(match_positions->begin(), match_positions->end(),
                       pair, &PairFirstLessThan);
  if (i != match_positions->end() && i->first == start) {
    // Match not at the end and there is already a pair with the same
    // start.
    if (end > i->second) {
      // New pair extends beyond existing pair. Extend existing pair and
      // coalesce matches after it.
      i->second = end;
      CoalescePositionsFrom(i - match_positions->begin(), match_positions);
    }  // else case, new pair completely contained in existing pair, nothing
       // to do.
  } else if (i == match_positions->begin()) {
    // Match at the beginning and the first pair doesn't have the same
    // start. Insert new pair and coalesce matches after it.
    match_positions->insert(i, pair);
    CoalescePositionsFrom(0, match_positions);
  } else {
    // Not at the beginning (but may be at the end).
    --i;
    if (start <= i->second && end > i->second) {
      // Previous element contains match. Extend it and coalesce.
      i->second = end;
      CoalescePositionsFrom(i - match_positions->begin(), match_positions);
    } else if (end > i->second) {
      // Region doesn't touch previous element. See if region touches current
      // element.
      ++i;
      if (i == match_positions->end() || end < i->first) {
        match_positions->insert(i, pair);
      } else {
        i->first = start;
        i->second = end;
        CoalescePositionsFrom(i - match_positions->begin(), match_positions);
      }
    }
  }
}

// Converts an index in a utf8 string into the index in the corresponding utf16
// string and returns the utf16 index. This is intended to be called in a loop
// iterating through a utf8 string.
//
// utf8_string: the utf8 string.
// utf8_length: length of the utf8 string.
// offset: the utf8 offset to convert.
// utf8_pos: current offset in the utf8 string. This is modified and on return
//           matches offset.
// wide_pos: current index in the wide string. This is the same as the return
//           value.
size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
                                int32_t utf8_length,
                                int32_t offset,
                                int32_t* utf8_pos,
                                size_t* utf16_pos) {
  DCHECK(offset >= *utf8_pos && offset <= utf8_length);

  UChar32 wide_char;
  while (*utf8_pos < offset) {
    U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
    *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
  }
  return *utf16_pos;
}

// Given a character break iterator over a UTF-8 string, set the iterator
// position to |*utf8_pos| and move by |count| characters. |count| can
// be either positive or negative.
void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
  // Ignore the return value. A side effect of the current position
  // being set at or following |*utf8_pos| is exploited here.
  // It's simpler than calling following(n) and then previous().
  // isBoundary() is not very fast, but should be good enough for the
  // snippet generation. If not, revisit the way we scan in ComputeSnippet.
  bi->isBoundary(static_cast<int32_t>(*utf8_pos));
  bi->next(count);
  *utf8_pos = static_cast<size_t>(bi->current());
}

// The amount of context to include for a given hit. Note that it's counted
// in terms of graphemes rather than bytes.
const int kSnippetContext = 50;

// Returns true if next match falls within a snippet window
// from the previous match. The window size is counted in terms
// of graphemes rather than bytes in UTF-8.
bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
                                    size_t previous_match_end,
                                    size_t next_match_start) {
  // If it's within a window in terms of bytes, it's certain
  // that it's within a window in terms of graphemes as well.
  if (next_match_start < previous_match_end + kSnippetContext)
    return true;
  bi->isBoundary(static_cast<int32_t>(previous_match_end));
  // An alternative to this is to call |bi->next()| at most
  // kSnippetContext times, compare |bi->current()| with |next_match_start|
  // after each call and return early if possible. There are other
  // heuristics to speed things up if necessary, but it's not likely that
  // we need to bother.
  bi->next(kSnippetContext);
  int64 current = bi->current();
  return (next_match_start < static_cast<uint64>(current) ||
          current == icu::BreakIterator::DONE);
}

}  // namespace

// static
void Snippet::ExtractMatchPositions(const std::string& offsets_str,
                                    const std::string& column_num,
                                    MatchPositions* match_positions) {
  DCHECK(match_positions);
  if (offsets_str.empty())
    return;
  std::vector<std::string> offsets;
  base::SplitString(offsets_str, ' ', &offsets);
  // SQLite offsets are sets of four integers:
  //   column, query term, match offset, match length
  // Matches within a string are marked by (start, end) pairs.
  for (size_t i = 0; i < offsets.size() - 3; i += 4) {
    if (offsets[i] != column_num)
      continue;
    const size_t start = atoi(offsets[i + 2].c_str());
    const size_t end = start + atoi(offsets[i + 3].c_str());
    // Switch to DCHECK after debugging http://crbug.com/15261.
    CHECK(end >= start);
    AddMatch(start, end, match_positions);
  }
}

// static
void Snippet::ConvertMatchPositionsToWide(
    const std::string& utf8_string,
    Snippet::MatchPositions* match_positions) {
  DCHECK(match_positions);
  int32_t utf8_pos = 0;
  size_t utf16_pos = 0;
  const char* utf8_cstring = utf8_string.c_str();
  const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
  for (Snippet::MatchPositions::iterator i = match_positions->begin();
       i != match_positions->end(); ++i) {
    i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
                                        static_cast<int32_t>(i->first),
                                        &utf8_pos, &utf16_pos);
    i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
                                         static_cast<int32_t>(i->second),
                                         &utf8_pos, &utf16_pos);
  }
}

Snippet::Snippet() {
}

Snippet::~Snippet() {
}

void Snippet::ComputeSnippet(const MatchPositions& match_positions,
                             const std::string& document) {
  // The length of snippets we try to produce.
  // We can generate longer snippets but stop once we cross kSnippetMaxLength.
  const size_t kSnippetMaxLength = 200;
  const base::string16 kEllipsis = base::ASCIIToUTF16(" ... ");

  UText* document_utext = NULL;
  UErrorCode status = U_ZERO_ERROR;
  document_utext = utext_openUTF8(document_utext, document.data(),
                                  document.size(), &status);
  // Locale does not matter because there's no per-locale customization
  // for character iterator.
  scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
      icu::Locale::getDefault(), status));
  bi->setText(document_utext, status);
  DCHECK(U_SUCCESS(status));

  // We build the snippet by iterating through the matches and then grabbing
  // context around each match.  If matches are near enough each other (within
  // kSnippetContext), we skip the "..." between them.
  base::string16 snippet;
  size_t start = 0;
  for (size_t i = 0; i < match_positions.size(); ++i) {
    // Some shorter names for the current match.
    const size_t match_start = match_positions[i].first;
    const size_t match_end = match_positions[i].second;

    // Switch to DCHECK after debugging http://crbug.com/15261.
    CHECK(match_end > match_start);
    CHECK(match_end <= document.size());

    // Add the context, if any, to show before the match.
    size_t context_start = match_start;
    MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
    start = std::max(start, context_start);
    if (start < match_start) {
      if (start > 0)
        snippet += kEllipsis;
      // Switch to DCHECK after debugging http://crbug.com/15261.
      CHECK(start < document.size());
      snippet += base::UTF8ToUTF16(document.substr(start, match_start - start));
    }

    // Add the match.
    const size_t first = snippet.size();
    snippet += base::UTF8ToUTF16(document.substr(match_start,
                                                 match_end - match_start));
    matches_.push_back(std::make_pair(first, snippet.size()));

    // Compute the context, if any, to show after the match.
    size_t end;
    // Check if the next match falls within our snippet window.
    if (i + 1 < match_positions.size() &&
        IsNextMatchWithinSnippetWindow(bi.get(), match_end,
            match_positions[i + 1].first)) {
      // Yes, it's within the window.  Make the end context extend just up
      // to the next match.
      end = match_positions[i + 1].first;
      // Switch to DCHECK after debugging http://crbug.com/15261.
      CHECK(end >= match_end);
      CHECK(end <= document.size());
      snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
    } else {
      // No, there's either no next match or the next match is too far away.
      end = match_end;
      MoveByNGraphemes(bi.get(), kSnippetContext, &end);
      // Switch to DCHECK after debugging http://crbug.com/15261.
      CHECK(end >= match_end);
      CHECK(end <= document.size());
      snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
      if (end < document.size())
        snippet += kEllipsis;
    }
    start = end;

    // Stop here if we have enough snippet computed.
    if (snippet.size() >= kSnippetMaxLength)
      break;
  }

  utext_close(document_utext);
  swap(text_, snippet);
}

void Snippet::Swap(Snippet* other) {
  text_.swap(other->text_);
  matches_.swap(other->matches_);
}

}  // namespace query_parser