1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "sync/internal_api/public/base/unique_position.h"
6
7#include "base/basictypes.h"
8#include "base/logging.h"
9#include "base/rand_util.h"
10#include "base/stl_util.h"
11#include "base/strings/string_number_conversions.h"
12#include "sync/protocol/unique_position.pb.h"
13#include "third_party/zlib/zlib.h"
14
15namespace syncer {
16
17const size_t UniquePosition::kSuffixLength = 28;
18const size_t UniquePosition::kCompressBytesThreshold = 128;
19
20// static.
21bool UniquePosition::IsValidSuffix(const std::string& suffix) {
22  // The suffix must be exactly the specified length, otherwise unique suffixes
23  // are not sufficient to guarantee unique positions (because prefix + suffix
24  // == p + refixsuffix).
25  return suffix.length() == kSuffixLength
26      && suffix[kSuffixLength-1] != 0;
27}
28
29// static.
30bool UniquePosition::IsValidBytes(const std::string& bytes) {
31  // The first condition ensures that our suffix uniqueness is sufficient to
32  // guarantee position uniqueness.  Otherwise, it's possible the end of some
33  // prefix + some short suffix == some long suffix.
34  // The second condition ensures that FindSmallerWithSuffix can always return a
35  // result.
36  return bytes.length() >= kSuffixLength
37      && bytes[bytes.length()-1] != 0;
38}
39
40// static.
41std::string UniquePosition::RandomSuffix() {
42  // Users random data for all but the last byte.  The last byte must not be
43  // zero.  We arbitrarily set it to 0x7f.
44  return base::RandBytesAsString(kSuffixLength - 1) + "\x7f";
45}
46
47// static.
48UniquePosition UniquePosition::CreateInvalid() {
49  UniquePosition pos;
50  DCHECK(!pos.IsValid());
51  return pos;
52}
53
54// static.
55UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) {
56  if (proto.has_custom_compressed_v1()) {
57    return UniquePosition(proto.custom_compressed_v1());
58  } else if (proto.has_value() && !proto.value().empty()) {
59    return UniquePosition(Compress(proto.value()));
60  } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) {
61    uLongf uncompressed_len = proto.uncompressed_length();
62    std::string un_gzipped;
63
64    un_gzipped.resize(uncompressed_len);
65    int result = uncompress(
66        reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)),
67        &uncompressed_len,
68        reinterpret_cast<const Bytef*>(proto.compressed_value().data()),
69        proto.compressed_value().size());
70    if (result != Z_OK) {
71      DLOG(ERROR) << "Unzip failed " << result;
72      return UniquePosition::CreateInvalid();
73    }
74    if (uncompressed_len != proto.uncompressed_length()) {
75      DLOG(ERROR)
76          << "Uncompressed length " << uncompressed_len
77          << " did not match specified length " << proto.uncompressed_length();
78      return UniquePosition::CreateInvalid();
79    }
80    return UniquePosition(Compress(un_gzipped));
81  } else {
82    return UniquePosition::CreateInvalid();
83  }
84}
85
86// static.
87UniquePosition UniquePosition::FromInt64(
88    int64 x, const std::string& suffix) {
89  uint64 y = static_cast<uint64>(x);
90  y ^= 0x8000000000000000ULL; // Make it non-negative.
91  std::string bytes(8, 0);
92  for (int i = 7; i >= 0; --i) {
93    bytes[i] = static_cast<uint8>(y);
94    y >>= 8;
95  }
96  return UniquePosition(bytes + suffix, suffix);
97}
98
99// static.
100UniquePosition UniquePosition::InitialPosition(
101    const std::string& suffix) {
102  DCHECK(IsValidSuffix(suffix));
103  return UniquePosition(suffix, suffix);
104}
105
106// static.
107UniquePosition UniquePosition::Before(
108    const UniquePosition& x,
109    const std::string& suffix) {
110  DCHECK(IsValidSuffix(suffix));
111  DCHECK(x.IsValid());
112  const std::string& before = FindSmallerWithSuffix(
113      Uncompress(x.compressed_), suffix);
114  return UniquePosition(before + suffix, suffix);
115}
116
117// static.
118UniquePosition UniquePosition::After(
119    const UniquePosition& x,
120    const std::string& suffix) {
121  DCHECK(IsValidSuffix(suffix));
122  DCHECK(x.IsValid());
123  const std::string& after = FindGreaterWithSuffix(
124      Uncompress(x.compressed_), suffix);
125  return UniquePosition(after + suffix, suffix);
126}
127
128// static.
129UniquePosition UniquePosition::Between(
130    const UniquePosition& before,
131    const UniquePosition& after,
132    const std::string& suffix) {
133  DCHECK(before.IsValid());
134  DCHECK(after.IsValid());
135  DCHECK(before.LessThan(after));
136  DCHECK(IsValidSuffix(suffix));
137  const std::string& mid = FindBetweenWithSuffix(
138      Uncompress(before.compressed_),
139      Uncompress(after.compressed_),
140      suffix);
141  return UniquePosition(mid + suffix, suffix);
142}
143
144UniquePosition::UniquePosition() : is_valid_(false) {}
145
146bool UniquePosition::LessThan(const UniquePosition& other) const {
147  DCHECK(this->IsValid());
148  DCHECK(other.IsValid());
149
150  return compressed_ < other.compressed_;
151}
152
153bool UniquePosition::Equals(const UniquePosition& other) const {
154  if (!this->IsValid() && !other.IsValid())
155    return true;
156
157  return compressed_ == other.compressed_;
158}
159
160void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const {
161  proto->Clear();
162
163  // This is the current preferred foramt.
164  proto->set_custom_compressed_v1(compressed_);
165
166  // Older clients used to write other formats.  We don't bother doing that
167  // anymore because that form of backwards compatibility is expensive.  We no
168  // longer want to pay that price just too support clients that have been
169  // obsolete for a long time.  See the proto definition for details.
170}
171
172void UniquePosition::SerializeToString(std::string* blob) const {
173  DCHECK(blob);
174  sync_pb::UniquePosition proto;
175  ToProto(&proto);
176  proto.SerializeToString(blob);
177}
178
179int64 UniquePosition::ToInt64() const {
180  uint64 y = 0;
181  const std::string& s = Uncompress(compressed_);
182  size_t l = sizeof(int64);
183  if (s.length() < l) {
184    NOTREACHED();
185    l = s.length();
186  }
187  for (size_t i = 0; i < l; ++i) {
188    const uint8 byte = s[l - i - 1];
189    y |= static_cast<uint64>(byte) << (i * 8);
190  }
191  y ^= 0x8000000000000000ULL;
192  // This is technically implementation-defined if y > INT64_MAX, so
193  // we're assuming that we're on a twos-complement machine.
194  return static_cast<int64>(y);
195}
196
197bool UniquePosition::IsValid() const {
198  return is_valid_;
199}
200
201std::string UniquePosition::ToDebugString() const {
202  const std::string bytes = Uncompress(compressed_);
203  if (bytes.empty())
204    return std::string("INVALID[]");
205
206  std::string debug_string = base::HexEncode(bytes.data(), bytes.length());
207  if (!IsValid()) {
208    debug_string = "INVALID[" + debug_string + "]";
209  }
210
211  std::string compressed_string =
212      base::HexEncode(compressed_.data(), compressed_.length());
213  debug_string.append(", compressed: " + compressed_string);
214  return debug_string;
215}
216
217std::string UniquePosition::GetSuffixForTest() const {
218  const std::string bytes = Uncompress(compressed_);
219  const size_t prefix_len = bytes.length() - kSuffixLength;
220  return bytes.substr(prefix_len, std::string::npos);
221}
222
223std::string UniquePosition::FindSmallerWithSuffix(
224    const std::string& reference,
225    const std::string& suffix) {
226  size_t ref_zeroes = reference.find_first_not_of('\0');
227  size_t suffix_zeroes = suffix.find_first_not_of('\0');
228
229  // Neither of our inputs are allowed to have trailing zeroes, so the following
230  // must be true.
231  DCHECK_NE(ref_zeroes, std::string::npos);
232  DCHECK_NE(suffix_zeroes, std::string::npos);
233
234  if (suffix_zeroes > ref_zeroes) {
235    // Implies suffix < ref.
236    return std::string();
237  }
238
239  if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) {
240    // Prepend zeroes so the result has as many zero digits as |reference|.
241    return std::string(ref_zeroes - suffix_zeroes, '\0');
242  } else if (suffix_zeroes > 1) {
243    // Prepend zeroes so the result has one more zero digit than |reference|.
244    // We could also take the "else" branch below, but taking this branch will
245    // give us a smaller result.
246    return std::string(ref_zeroes - suffix_zeroes + 1, '\0');
247  } else {
248    // Prepend zeroes to match those in the |reference|, then something smaller
249    // than the first non-zero digit in |reference|.
250    char lt_digit = static_cast<uint8>(reference[ref_zeroes])/2;
251    return std::string(ref_zeroes, '\0') + lt_digit;
252  }
253}
254
255// static
256std::string UniquePosition::FindGreaterWithSuffix(
257    const std::string& reference,
258    const std::string& suffix) {
259  size_t ref_FFs = reference.find_first_not_of(kuint8max);
260  size_t suffix_FFs = suffix.find_first_not_of(kuint8max);
261
262  if (ref_FFs == std::string::npos) {
263    ref_FFs = reference.length();
264  }
265  if (suffix_FFs == std::string::npos) {
266    suffix_FFs = suffix.length();
267  }
268
269  if (suffix_FFs > ref_FFs) {
270    // Implies suffix > reference.
271    return std::string();
272  }
273
274  if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) {
275    // Prepend FF digits to match those in |reference|.
276    return std::string(ref_FFs - suffix_FFs, kuint8max);
277  } else if (suffix_FFs > 1) {
278    // Prepend enough leading FF digits so result has one more of them than
279    // |reference| does.  We could also take the "else" branch below, but this
280    // gives us a smaller result.
281    return std::string(ref_FFs - suffix_FFs + 1, kuint8max);
282  } else {
283    // Prepend FF digits to match those in |reference|, then something larger
284    // than the first non-FF digit in |reference|.
285    char gt_digit = static_cast<uint8>(reference[ref_FFs]) +
286        (kuint8max - static_cast<uint8>(reference[ref_FFs]) + 1) / 2;
287    return std::string(ref_FFs, kuint8max) + gt_digit;
288  }
289}
290
291// static
292std::string UniquePosition::FindBetweenWithSuffix(
293    const std::string& before,
294    const std::string& after,
295    const std::string& suffix) {
296  DCHECK(IsValidSuffix(suffix));
297  DCHECK_NE(before, after);
298  DCHECK_LT(before, after);
299
300  std::string mid;
301
302  // Sometimes our suffix puts us where we want to be.
303  if (before < suffix && suffix < after) {
304    return std::string();
305  }
306
307  size_t i = 0;
308  for ( ; i < std::min(before.length(), after.length()); ++i) {
309    uint8 a_digit = before[i];
310    uint8 b_digit = after[i];
311
312    if (b_digit - a_digit >= 2) {
313      mid.push_back(a_digit + (b_digit - a_digit)/2);
314      return mid;
315    } else if (a_digit == b_digit) {
316      mid.push_back(a_digit);
317
318      // Both strings are equal so far.  Will appending the suffix at this point
319      // give us the comparison we're looking for?
320      if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) {
321        return mid;
322      }
323    } else {
324      DCHECK_EQ(b_digit - a_digit, 1);  // Implied by above if branches.
325
326      // The two options are off by one digit.  The choice of whether to round
327      // up or down here will have consequences on what we do with the remaining
328      // digits.  Exploring both options is an optimization and is not required
329      // for the correctness of this algorithm.
330
331      // Option A: Round down the current digit.  This makes our |mid| <
332      // |after|, no matter what we append afterwards.  We then focus on
333      // appending digits until |mid| > |before|.
334      std::string mid_a = mid;
335      mid_a.push_back(a_digit);
336      mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix));
337
338      // Option B: Round up the current digit.  This makes our |mid| > |before|,
339      // no matter what we append afterwards.  We then focus on appending digits
340      // until |mid| < |after|.  Note that this option may not be viable if the
341      // current digit is the last one in |after|, so we skip the option in that
342      // case.
343      if (after.length() > i+1) {
344        std::string mid_b = mid;
345        mid_b.push_back(b_digit);
346        mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix));
347
348        // Does this give us a shorter position value?  If so, use it.
349        if (mid_b.length() < mid_a.length()) {
350          return mid_b;
351        }
352      }
353      return mid_a;
354    }
355  }
356
357  // If we haven't found a midpoint yet, the following must be true:
358  DCHECK_EQ(before.substr(0, i), after.substr(0, i));
359  DCHECK_EQ(before, mid);
360  DCHECK_LT(before.length(), after.length());
361
362  // We know that we'll need to append at least one more byte to |mid| in the
363  // process of making it < |after|.  Appending any digit, regardless of the
364  // value, will make |before| < |mid|.  Therefore, the following will get us a
365  // valid position.
366
367  mid.append(FindSmallerWithSuffix(after.substr(i), suffix));
368  return mid;
369}
370
371UniquePosition::UniquePosition(const std::string& internal_rep)
372    : compressed_(internal_rep),
373      is_valid_(IsValidBytes(Uncompress(internal_rep))) {
374}
375
376UniquePosition::UniquePosition(
377    const std::string& uncompressed,
378    const std::string& suffix)
379  : compressed_(Compress(uncompressed)),
380    is_valid_(IsValidBytes(uncompressed)) {
381  DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length());
382  DCHECK(IsValidSuffix(suffix));
383  DCHECK(IsValid());
384}
385
386// On custom compression:
387//
388// Let C(x) be the compression function and U(x) be the uncompression function.
389//
390// This compression scheme has a few special properties.  For one, it is
391// order-preserving.  For any two valid position strings x and y:
392//   x < y <=> C(x) < C(y)
393// This allows us keep the position strings compressed as we sort them.
394//
395// The compressed format and the decode algorithm:
396//
397// The compressed string is a series of blocks, almost all of which are 8 bytes
398// in length.  The only exception is the last block in the compressed string,
399// which may be a remainder block, which has length no greater than 7.  The
400// full-length blocks are either repeated character blocks or plain data blocks.
401// All blocks are entirely self-contained.  Their decoded values are independent
402// from that of their neighbours.
403//
404// A repeated character block is encoded into eight bytes and represents between
405// 4 and 2^31 repeated instances of a given character in the unencoded stream.
406// The encoding consists of a single character repeated four times, followed by
407// an encoded count.  The encoded count is stored as a big-endian 32 bit
408// integer.  There are 2^31 possible count values, and two encodings for each.
409// The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =
410// count'.  At compression time, the algorithm will choose between the two
411// encodings based on which of the two will maintain the appropriate sort
412// ordering (by a process which will be described below).  The decompression
413// algorithm need not concern itself with which encoding was used; it needs only
414// to decode it.  The decoded value of this block is "count" instances of the
415// character that was repeated four times in the first half of this block.
416//
417// A plain data block is encoded into eight bytes and represents exactly eight
418// bytes of data in the unencoded stream.  The plain data block must not begin
419// with the same character repeated four times.  It is allowed to contain such a
420// four-character sequence, just not at the start of the block.  The decoded
421// value of a plain data block is identical to its encoded value.
422//
423// A remainder block has length of at most seven.  It is a shorter version of
424// the plain data block.  It occurs only at the end of the encoded stream and
425// represents exactly as many bytes of unencoded data as its own length.  Like a
426// plain data block, the remainder block never begins with the same character
427// repeated four times.  The decoded value of this block is identical to its
428// encoded value.
429//
430// The encode algorithm:
431//
432// From the above description, it can be seen that there may be more than one
433// way to encode a given input string.  The encoder must be careful to choose
434// the encoding that guarantees sort ordering.
435//
436// The rules for the encoder are as follows:
437// 1. Iterate through the input string and produce output blocks one at a time.
438// 2. Where possible (ie. where the next four bytes of input consist of the
439//    same character repeated four times), produce a repeated data block of
440//    maximum possible length.
441// 3. If there is at least 8 bytes of data remaining and it is not possible
442//    to produce a repeated character block, produce a plain data block.
443// 4. If there are less than 8 bytes of data remaining and it is not possible
444//    to produce a repeated character block, produce a remainder block.
445// 5. When producing a repeated character block, the count encoding must be
446//    chosen in such a way that the sort ordering is maintained.  The choice is
447//    best illustrated by way of example:
448//
449//      When comparing two strings, the first of which begins with of 8
450//      instances of the letter 'B' and the second with 10 instances of the
451//      letter 'B', which of the two should compare lower?  The result depends
452//      on the 9th character of the first string, since it will be compared
453//      against the 9th 'B' in the second string.  If that character is an 'A',
454//      then the first string will compare lower.  If it is a 'C', then the
455//      first string will compare higher.
456//
457//    The key insight is that the comparison value of a repeated character block
458//    depends on the value of the character that follows it.  If the character
459//    follows the repeated character has a value greater than the repeated
460//    character itself, then a shorter run length should translate to a higher
461//    comparison value.  Therefore, we encode its count using the low encoding.
462//    Similarly, if the following character is lower, we use the high encoding.
463
464namespace {
465
466// Appends an encoded run length to |output_str|.
467static void WriteEncodedRunLength(uint32 length,
468                                  bool high_encoding,
469                                  std::string* output_str) {
470  CHECK_GE(length, 4U);
471  CHECK_LT(length, 0x80000000);
472
473  // Step 1: Invert the count, if necessary, to account for the following digit.
474  uint32 encoded_length;
475  if (high_encoding) {
476    encoded_length = 0xffffffff - length;
477  } else {
478    encoded_length = length;
479  }
480
481  // Step 2: Write it as big-endian so it compares correctly with memcmp(3).
482  output_str->append(1, 0xff & (encoded_length >> 24U));
483  output_str->append(1, 0xff & (encoded_length >> 16U));
484  output_str->append(1, 0xff & (encoded_length >> 8U));
485  output_str->append(1, 0xff & (encoded_length >> 0U));
486}
487
488// Reads an encoded run length for |str| at position |i|.
489static uint32 ReadEncodedRunLength(const std::string& str, size_t i) {
490  DCHECK_LE(i + 4, str.length());
491
492  // Step 1: Extract the big-endian count.
493  uint32 encoded_length =
494      ((uint8)(str[i+3]) << 0)  |
495      ((uint8)(str[i+2]) << 8)  |
496      ((uint8)(str[i+1]) << 16) |
497      ((uint8)(str[i+0]) << 24);
498
499  // Step 2: If this was an inverted count, un-invert it.
500  uint32 length;
501  if (encoded_length & 0x80000000) {
502    length = 0xffffffff - encoded_length;
503  } else {
504    length = encoded_length;
505  }
506
507  return length;
508}
509
510// A series of four identical chars at the beginning of a block indicates
511// the beginning of a repeated character block.
512static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) {
513  return chars[start_index] == chars[start_index+1]
514      && chars[start_index] == chars[start_index+2]
515      && chars[start_index] == chars[start_index+3];
516}
517
518}  // namespace
519
520// static
521// Wraps the CompressImpl function with a bunch of DCHECKs.
522std::string UniquePosition::Compress(const std::string& str) {
523  DCHECK(IsValidBytes(str));
524  std::string compressed = CompressImpl(str);
525  DCHECK(IsValidCompressed(compressed));
526  DCHECK_EQ(str, Uncompress(compressed));
527  return compressed;
528}
529
530// static
531// Performs the order preserving run length compression of a given input string.
532std::string UniquePosition::CompressImpl(const std::string& str) {
533  std::string output;
534
535  // The compressed length will usually be at least as long as the suffix (28),
536  // since the suffix bytes are mostly random.  Most are a few bytes longer; a
537  // small few are tens of bytes longer.  Some early tests indicated that
538  // roughly 99% had length 40 or smaller.  We guess that pre-sizing for 48 is a
539  // good trade-off, but that has not been confirmed through profiling.
540  output.reserve(48);
541
542  // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the
543  // length of a string of identical digits starting at i.
544  for (size_t i = 0; i < str.length(); ) {
545    if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) {
546      // Four identical bytes in a row at this position means that we must start
547      // a repeated character block.  Begin by outputting those four bytes.
548      output.append(str, i, 4);
549
550      // Determine the size of the run.
551      const char rep_digit = str[i];
552      const size_t runs_until = str.find_first_not_of(rep_digit, i+4);
553
554      // Handle the 'runs until end' special case specially.
555      size_t run_length;
556      bool encode_high;  // True if the next byte is greater than |rep_digit|.
557      if (runs_until == std::string::npos) {
558        run_length = str.length() - i;
559        encode_high = false;
560      } else {
561        run_length = runs_until - i;
562        encode_high = static_cast<uint8>(str[runs_until]) >
563            static_cast<uint8>(rep_digit);
564      }
565      DCHECK_LT(run_length, static_cast<size_t>(kint32max))
566          << "This implementation can't encode run-lengths greater than 2^31.";
567
568      WriteEncodedRunLength(run_length, encode_high, &output);
569      i += run_length;  // Jump forward by the size of the run length.
570    } else {
571      // Output up to eight bytes without any encoding.
572      const size_t len = std::min(static_cast<size_t>(8), str.length() - i);
573      output.append(str, i, len);
574      i += len;  // Jump forward by the amount of input consumed (usually 8).
575    }
576  }
577
578  return output;
579}
580
581// static
582// Uncompresses strings that were compresed with UniquePosition::Compress.
583std::string UniquePosition::Uncompress(const std::string& str) {
584  std::string output;
585  size_t i = 0;
586  // Iterate through the compressed string one block at a time.
587  for (i = 0; i + 8 <= str.length(); i += 8) {
588    if (IsRepeatedCharPrefix(str, i)) {
589      // Found a repeated character block.  Expand it.
590      const char rep_digit = str[i];
591      uint32 length = ReadEncodedRunLength(str, i+4);
592      output.append(length, rep_digit);
593    } else {
594      // Found a regular block.  Copy it.
595      output.append(str, i, 8);
596    }
597  }
598  // Copy the remaining bytes that were too small to form a block.
599  output.append(str, i, std::string::npos);
600  return output;
601}
602
603bool UniquePosition::IsValidCompressed(const std::string& str) {
604  for (size_t i = 0; i + 8 <= str.length(); i += 8) {
605    if (IsRepeatedCharPrefix(str, i)) {
606      uint32 count = ReadEncodedRunLength(str, i+4);
607      if (count < 4) {
608        // A repeated character block should at least represent the four
609        // characters that started it.
610        return false;
611      }
612      if (str[i] == str[i+4]) {
613        // Does the next digit after a count match the repeated character?  Then
614        // this is not the highest possible count.
615        return false;
616      }
617    }
618  }
619  // We don't bother looking for the existence or checking the validity of
620  // any partial blocks.  There's no way they could be invalid anyway.
621  return true;
622}
623
624}  // namespace syncer
625