1package com.google.android.rappor;
2
3// BEGIN android-changed: Removed guava dependency
4// import static com.google.common.base.Preconditions.checkArgument;
5//
6// import com.google.common.base.Verify;
7// END android-changed
8
9import java.nio.ByteBuffer;
10import java.nio.charset.StandardCharsets;
11import java.security.MessageDigest;
12import java.security.NoSuchAlgorithmException;
13import java.security.SecureRandom;
14import java.util.BitSet;
15// BEGIN android-changed
16import java.util.Random;
17// END android-changed
18
19// BEGIN android-changed: Remove javax
20// import javax.annotation.Nullable;
21// import javax.annotation.concurrent.GuardedBy;
22// END android-changed
23
24/**
25 * Encodes reports using the RAPPOR differentially-private encoding algorithm.
26 *
27 * The RAPPOR algorithm is described at go/rappor and presented in detail at go/rappor-writeup.
28 *
29 * @author bonawitz@google.com Keith Bonawitz
30 */
31// TODO(bonawitz): Make encoder and interface and make this a final class implementing it.
32// We can't just make this final now because there exist tests that need to Mock it.
33public class Encoder {
34  /**
35   * A non-decreasing version number.
36   *
37   * <p>The version number should increase any time the Encoder has a user-visible functional change
38   * to any of encoding algorithms or the interpretation of the input parameters.
39   */
40  public static final long VERSION = 3;
41
42  /**
43   * Minimum length required for the user secret, in bytes.
44   */
45  public static final int MIN_USER_SECRET_BYTES = HmacDrbg.ENTROPY_INPUT_SIZE_BYTES;
46
47  /**
48   * Maximum number of bits in the RAPPOR-encoded report.
49   *
50   * Must be less than HmacDrbg.MAX_BYTES_TOTAL;
51   */
52  public static final int MAX_BITS = 4096;
53
54  /**
55   * Maximum number of Bloom filter hashes used for RAPPOR-encoded strings.
56   *
57   * <p>This is constrained in the current implementation by requiring 2 bytes from an MD5 value
58   * (which is 16 bytes long) for each Bloom filter hash.
59   */
60  public static final int MAX_BLOOM_HASHES = 8;
61
62  /**
63   * Maximum number of cohorts supported.
64   */
65  public static final int MAX_COHORTS = 128;
66
67  /**
68   * First (and only) byte of HMAC_DRBG personalization strings used to generate the cohort number.
69   */
70  private static final byte HMAC_DRBG_TYPE_COHORT = 0x00;
71
72  /**
73   * First byte of HMAC_DRBG personalization strings used to generate the PRR response.
74   */
75  private static final byte HMAC_DRBG_TYPE_PRR = 0x01;
76
77  /**
78   * A unique identifier for this Encoder, represented in UTF-8.
79   *
80   * <p>The (userSecret, encoderId) pair identify a the logical memoization table used for RAPPOR's
81   * Permanent Randomized Response stage.  Therefore, for any userSecret, each Encoder must have a
82   * distinct identifier for Permanent Randomized Response to be effective.
83   *
84   * <p>In practice, "memoization" is achieved by generating deterministic pseudo-random bits using
85   * HMAC_DRBG.  encoderIdBytes is used as part of the personalization string.
86   */
87  private final byte[] encoderIdBytes;
88
89  /**
90   * The RAPPOR "f" probability, on the range [0.0, 1.0].
91   *
92   * <p>This it the probability of replacing a bit from the input with a uniform random bit when
93   * generating the permanent randomized response.
94   *
95   * <p>Setting probabilityF to 0 disables the PRR phase of RAPPOR.
96   */
97  private final double probabilityF;
98
99  /**
100   * The RAPPOR "p" probability, on the range [0.0, 1.0].
101   *
102   * <p>This is the probability of emitting a '1' bit in the instantaneous randomized response,
103   * given that the corresponding bit in the permanent randomized response is '0'.
104   *
105   * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR.
106   */
107  private final double probabilityP;
108
109  /**
110   * The RAPPOR "1" probability, on the range [0.0, 1.0].
111   *
112   * <p>This is the probability of emitting a 1 bit in the instantaneous randomized response, given
113   * that the corresponding bit in the permanent randomized response is 1.
114   *
115   * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR.
116   */
117  private final double probabilityQ;
118
119  /**
120   * The number of bits in the RAPPOR-encoded report.
121   *
122   * <p>Must satisfy 1 &lt;= numBits &lt;= MAX_BITS.
123   *
124   * <ul>
125   * <li>For encodeOrdinal, requires 0 &lt;= ordinal &lt; numBits.
126   * <li>For encodeString, uses a numBits-wide Bloom filter.
127   * <li>For encodeBits, only the low-order numBits may be non-zero.
128   * </ul>
129   */
130  private final int numBits;
131
132  /**
133   * The number of hash functions used forming the Bloom filter encoding of a string.
134   *
135   * <p>Must satisfy 1 &lt;= numBloomHashes &lt;= MAX_BLOOM_HASHES.
136   */
137  private final int numBloomHashes;
138
139  /**
140   * The number of cohorts used for cohort assignment.
141   */
142  private final int numCohorts;
143
144  /**
145   * The cohort this user is assigned to for Bloom filter string encoding.
146   *
147   * <p>This value is stable for a given (userSecret, numCohorts) pair.  Furthermore, if two
148   * encoders use the same userSecret but have different numCohorts values, the cohort assignment
149   * for the encoder with the smaller numCohorts value is a bitwise suffix of the cohort assignment
150   * for the encoder with the larger numCohorts value.  It follows that, if you know maximum cohort
151   * assignment across a set of encoders, and you know the numCohorts setting for each encoder, then
152   * you can deduce the cohort assignment for each encoder by taking the bitwise-and of the max
153   * cohort value and (numCohorts-1), noting that numCohorts is required to be a positive power of
154   * 2.
155   */
156  private final int cohort;
157
158  /**
159   * A bitmask with 1 bits in all the positions where a RAPPOR-encoded report could have a 1 bit.
160   *
161   * <p>The bitmask has the lowest order numBits set to 1 and the rest 0.
162   */
163  private final BitSet inputMask;
164
165  /**
166   * SHA-256 utility class instance.
167   *
168   * <p>This object is stateful; access must be synchronized.  The reset method must be
169   * called before each use.
170   */
171// BEGIN android-changed: Remove javax
172//  @GuardedBy("this")
173// END android-changed
174  private final MessageDigest sha256;
175
176  /**
177   * MD5 utility class instance.
178   *
179   * <p>This object is stateful; access must be synchronized.  The reset method must be
180   * called before each use.
181   */
182// BEGIN android-changed: Remove javax
183//  @GuardedBy("this")
184// END android-changed
185  private final MessageDigest md5;
186
187  /**
188   * A SecureRandom instance, initialized with a cryptographically secure random seed.
189   */
190// BEGIN android-changed
191//  private final SecureRandom random;
192  private final Random random;
193// BEGIN android-changed
194
195  /**
196   * Entropy input for constructing HmacDrbg objects.
197   */
198  private final byte[] userSecret;
199
200  /**
201   * Constructs a new RAPPOR message encoder.
202   *
203   * @param userSecret Stable secret randomly selected for this user.  UserSecret must be at least
204   *     MIN_USER_SECRET_BYTES bytes of high-quality entropy.  Changing the user secret clears the
205   *     memoized cohort assignments and permanent randomized responses.  Be aware that resetting
206   *     these memoizations has significant privacy risks -- consult documentation at go/rappor for
207   *     more details.
208   * @param encoderId Uniquely identifies this encoder.  Used to differentiate momoized
209   *     cohort assignments and permanent randomized responses.
210   * @param numBits The number of bits in the RAPPOR-encoded report.
211   * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0].  This will be
212   *     quantized to the nearest 1/128.
213   * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0].
214   * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0].
215   * @param numCohorts Number of cohorts into which the user pool is randomly segmented.
216   * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a
217   *     string.
218   */
219  public Encoder(byte[] userSecret, String encoderId, int numBits,
220                 double probabilityF, double probabilityP, double probabilityQ,
221                 int numCohorts, int numBloomHashes) {
222    this(
223        null, // random
224        null, // md5,
225        null, // sha256,
226        userSecret,
227        encoderId,
228        numBits,
229        probabilityF,
230        probabilityP,
231        probabilityQ,
232        numCohorts,
233        numBloomHashes);
234  }
235
236  /**
237   * Constructs a new RAPPOR message encoder, using constructor-style dependency injection.
238   *
239   * @param random A cryptographically secure random number generator, or null (in which case a
240   *     new SecureRandom will be constructed).
241   * @param md5 A configured MD5 hash algorithm, or null (in which case a new MessageDigest will be
242   *     constructed).   Note: MessageDigest objects are stateful, and that state must not be
243   *     modified while calls to the Encoder are active.
244   * @param sha256 A configured SHA-256 hash algorithm, or null (in which case a new MessageDigest
245   *     will be constructed).   Note: MessageDigest objects are stateful, and that state must not
246   *     be modified while calls to the Encoder are active.
247   * @param userSecret Stable secret randomly selected for this user.  UserSecret must be at least
248   *     32 bytes of high-quality entropy.  Changing the user secret clears the memoized cohort
249   *     assignments and permanent randomized responses.  Be aware that resetting these memoizations
250   *     has significant privacy risks -- consult documentation at go/rappor for more details.
251   * @param encoderId Uniquely identifies this encoder.  Used to differentiate momoized
252   *     cohort assignments and permanent randomized responses.
253   * @param numBits The number of bits in the RAPPOR-encoded report.
254   * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0].  This will be
255   *     quantized to the nearest 1/128.
256   * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0].
257   * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0].
258   * @param numCohorts Number of cohorts into which the user pool is randomly segmented.
259   * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a
260   *     string.
261   */
262  public Encoder(
263// BEGIN android-changed: Remove javax
264//      @Nullable SecureRandom random,
265//      @Nullable MessageDigest md5,
266//      @Nullable MessageDigest sha256,
267//      SecureRandom random,
268      Random random,
269      MessageDigest md5,
270      MessageDigest sha256,
271// END android-changed
272      byte[] userSecret,
273      String encoderId,
274      int numBits,
275      double probabilityF,
276      double probabilityP,
277      double probabilityQ,
278      int numCohorts,
279      int numBloomHashes) {
280    if (md5 != null) {
281      this.md5 = md5;
282    } else {
283      try {
284        this.md5 = MessageDigest.getInstance("MD5");
285      } catch (NoSuchAlgorithmException impossible) {
286        // This should never happen.  Every implementation of the Java platform
287        // is required to support MD5.
288        throw new AssertionError(impossible);
289      }
290    }
291    this.md5.reset();
292
293    if (sha256 != null) {
294      this.sha256 = sha256;
295    } else {
296      try {
297        this.sha256 = MessageDigest.getInstance("SHA-256");
298      } catch (NoSuchAlgorithmException impossible) {
299        // This should never happen.  Every implementation of the Java platform
300        // is required to support 256.
301        throw new AssertionError(impossible);
302      }
303    }
304    this.sha256.reset();
305
306    this.encoderIdBytes = encoderId.getBytes(StandardCharsets.UTF_8);
307
308    if (random != null) {
309      this.random = random;
310    } else {
311      this.random = new SecureRandom();
312    }
313
314    // android-changed: Removed guava dependency
315    // checkArgument(
316    //     userSecret.length >= MIN_USER_SECRET_BYTES,
317    //     "userSecret must be at least %s bytes of high-quality entropy.",
318    //     MIN_USER_SECRET_BYTES);
319    checkArgument(
320        userSecret.length >= MIN_USER_SECRET_BYTES,
321        "userSecret must be at least " + MIN_USER_SECRET_BYTES
322            + " bytes of high-quality entropy.");
323    this.userSecret = userSecret;
324
325    checkArgument(
326        probabilityF >= 0 && probabilityF <= 1, "probabilityF must be on range [0.0, 1.0]");
327    this.probabilityF = Math.round(probabilityF * 128) / 128.0;
328
329    checkArgument(
330        probabilityP >= 0 && probabilityP <= 1, "probabilityP must be on range [0.0, 1.0]");
331    this.probabilityP = probabilityP;
332
333    checkArgument(
334        probabilityQ >= 0 && probabilityQ <= 1, "probabilityQ must be on range [0.0, 1.0]");
335    this.probabilityQ = probabilityQ;
336
337    checkArgument(
338        numBits >= 1 && numBits <= MAX_BITS, "numBits must be on range [1, " + MAX_BITS + "].");
339    this.numBits = numBits;
340    // Make a bitmask with the lowest-order numBits set to 1.
341    this.inputMask = new BitSet(numBits);
342    this.inputMask.set(0, numBits, true);
343
344    checkArgument(
345        numBloomHashes >= 1 && numBloomHashes <= numBits,
346        "numBloomHashes must be on range [1, numBits).");
347    this.numBloomHashes = numBloomHashes;
348
349    checkArgument(
350        numCohorts >= 1 && numCohorts <= MAX_COHORTS,
351        "numCohorts must be on range [1, " + MAX_COHORTS + "].");
352
353    // Assuming numCohorts >= 1:
354    //
355    // If numCohorts is a power of 2, then numCohorts has one bit set and numCohorts - 1 has only
356    // the bits to the right of numCohorts's bit set.  The bitwise-and of these is 0.
357    //
358    // If numCohorts is not a power of 2, then numCohorts has at least two bits set.  It follows
359    // subtracting one from numCohorts keeps the most significant bit set to 1, and thus the
360    // bitwise-and has at least this non-zero bit.
361    boolean numCohortsIsPowerOfTwo = (numCohorts & (numCohorts - 1)) == 0;
362    checkArgument(numCohortsIsPowerOfTwo, "numCohorts must be a power of 2.");
363    this.numCohorts = numCohorts;
364
365    // cohortMasterAssignment depends only on the userSecret.
366    HmacDrbg cohortDrbg = new HmacDrbg(userSecret, new byte[] {HMAC_DRBG_TYPE_COHORT});
367    ByteBuffer cohortDrbgBytes = ByteBuffer.wrap(cohortDrbg.nextBytes(4));
368    int cohortMasterAssignment = Math.abs(cohortDrbgBytes.getInt()) % MAX_COHORTS;
369    this.cohort = cohortMasterAssignment & (numCohorts - 1);
370  }
371
372  public double getProbabilityF() {
373    return probabilityF;
374  }
375
376  public double getProbabilityP() {
377    return probabilityP;
378  }
379
380  public double getProbabilityQ() {
381    return probabilityQ;
382  }
383
384  public int getNumBits() {
385    return numBits;
386  }
387
388  public int getNumBloomHashes() {
389    return numBloomHashes;
390  }
391
392  public int getNumCohorts() {
393    return numCohorts;
394  }
395
396  public int getCohort() {
397    return cohort;
398  }
399
400  /**
401   * Returns the Encoder ID as a UTF-8 formatted string.
402   */
403  public String getEncoderId() {
404    return new String(encoderIdBytes, StandardCharsets.UTF_8);
405  }
406
407  /**
408   * Encodes a boolean into a RAPPOR report.
409   *
410   * <p>The boolean is 0 or 1, then encoded using permanent and instantaneous randomized response.
411   *
412   * <p>In most cases, numBits should be 1 when using this method.
413   */
414  public byte[] encodeBoolean(boolean bool) {
415    BitSet input = new BitSet(numBits);
416    input.set(0, bool);
417    return encodeBits(input);
418  }
419
420  /**
421   * Encodes an ordinal into a RAPPOR report.
422   *
423   * <p>The ordinal is represented using a 1-hot binary representation, then encoded using permanent
424   * and instantaneous randomized response.
425   *
426   * @param ordinal A value on the range [0, numBits).
427   */
428  public byte[] encodeOrdinal(int ordinal) {
429    checkArgument(
430        ordinal >= 0 && ordinal < numBits, "Ordinal value must be in range [0, numBits).");
431    BitSet input = new BitSet(numBits);
432    input.set(ordinal, true);
433    return encodeBits(input);
434  }
435
436  /**
437   * Encodes a string into a RAPPOR report.
438   *
439   * <p>The string is represented using a Bloom filter with numBloomHashes hash functions, then
440   * encoded using permanent and instantaneous randomized response.
441   *
442   * @param string An arbitrary string.
443   */
444  public byte[] encodeString(String string) {
445    // Implements a Bloom filter by slicing a single MD5 hash into numBloomHashes bit indices.
446    byte[] stringInUtf8 = string.getBytes(StandardCharsets.UTF_8);
447    byte[] message =
448        ByteBuffer.allocate(4 + stringInUtf8.length)
449                  .putInt(cohort)
450                  .put(stringInUtf8)
451                  .array();
452
453    byte[] digest;
454    synchronized (this) {
455      md5.reset();
456      digest = md5.digest(message);
457    }
458    // android-changed: Removed guava dependency
459    // Verify.verify(digest.length == 16);
460    // Verify.verify(numBloomHashes <= digest.length / 2);
461    verify(digest.length == 16);
462    verify(numBloomHashes <= digest.length / 2);
463
464    BitSet input = new BitSet(numBits);
465    for (int i = 0; i < numBloomHashes; i++) {
466      // Convert byte pairs to ints on [0, 65535].
467      // Anding with 0xFF converts signed byte to unsigned int.
468      int digestWord = (digest[i * 2] & 0xFF) * 256 + (digest[i * 2 + 1] & 0xFF);
469      int chosenBit = digestWord % numBits;
470      input.set(chosenBit, true);
471    }
472
473    return encodeBits(input);
474  }
475
476  /**
477   * Encodes an arbitrary bitstring into a RAPPOR report.
478   *
479   * @param bits A bitstring in which only the least significant numBits bits may be 1.
480   */
481  public byte[] encodeBits(byte[] bits) {
482    return encodeBits(BitSet.valueOf(bits));
483  }
484
485  /**
486   * Encodes an arbitrary bitstring into a RAPPOR report.
487   *
488   * @param bits A bitstring in which only the least significant numBits bits may be 1.
489   */
490  private byte[] encodeBits(BitSet bits) {
491    BitSet permanentRandomizedResponse = computePermanentRandomizedResponse(bits);
492    BitSet encodedBitSet = computeInstantaneousRandomizedResponse(permanentRandomizedResponse);
493
494    // BitSet.toByteArray only returns enough bytes to capture the most significant bit
495    // that is set.  For example, a BitSet with no bits set could return a length-0 array.
496    // In contrast, we guarantee that our output is sized according to numBits.
497    byte[] encodedBytes = encodedBitSet.toByteArray();
498    byte[] output = new byte[(numBits + 7) / 8];
499    // android-changed: Removed guava dependency
500    // Verify.verify(encodedBytes.length <= output.length);
501    verify(encodedBytes.length <= output.length);
502    System.arraycopy(
503        encodedBytes, // src
504        0, // srcPos
505        output, // dest
506        0, // destPos
507        encodedBytes.length); // length
508    return output;
509  }
510
511  /**
512   * Returns the permanent randomized response for the given bits.
513   *
514   * <p>The response for a particular bits input is guaranteed to always be the same for any encoder
515   * constructed with the same parameters (including the encoderId and the userSecret).
516   */
517  private BitSet computePermanentRandomizedResponse(BitSet bits) {
518    // Ensures that the input only has bits set in the lowest
519    BitSet masked = new BitSet();
520    masked.or(bits);
521    masked.andNot(inputMask);
522    checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit.");
523
524    if (probabilityF == 0.0) {
525      return bits;
526    }
527
528    // Builds a personalization string that contains both the encoderId and input value (bits),
529    // and is no longer than HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES.  The first byte
530    // of the personalization string is always HMAC_DRBG_TYPE_PRR, to avoid collisions with the
531    // cohort-generation personalization string.
532    byte[] personalizationString;
533    synchronized (this) {
534      int personalizationStringLength =
535          Math.min(HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES, 1 + sha256.getDigestLength());
536      personalizationString = new byte[personalizationStringLength];
537      personalizationString[0] = HMAC_DRBG_TYPE_PRR;
538      sha256.reset();
539      sha256.update(encoderIdBytes);
540      sha256.update(new byte[] {0});
541      sha256.update(bits.toByteArray());
542      byte[] digest = sha256.digest(personalizationString);
543      System.arraycopy(digest, 0, personalizationString, 1, personalizationString.length - 1);
544    }
545
546    HmacDrbg drbg = new HmacDrbg(userSecret, personalizationString);
547    byte[] pseudorandomStream = drbg.nextBytes(numBits);
548    // android-changed: Removed guava dependency
549    // Verify.verify(numBits <= pseudorandomStream.length);
550    verify(numBits <= pseudorandomStream.length);
551
552    int probabilityFTimes128 = (int) Math.round(probabilityF * 128);
553    BitSet result = new BitSet(numBits);
554    for (int i = 0; i < numBits; i++) {
555      // Grabs a single byte from the pseudorandom stream.
556      // Anding with 0xFF converts a signed byte to an unsigned integer.
557      int pseudorandomByte = pseudorandomStream[i] & 0xFF;
558
559      // Uses bits 1-7 to get a random number between 0 and 127.
560      int uniform0to127 = pseudorandomByte >> 1;
561      boolean shouldUseNoise = uniform0to127 < probabilityFTimes128;
562
563      if (shouldUseNoise) {
564        // Uses bit 0 as a flip of a fair coin.
565        result.set(i, (pseudorandomByte & 0x01) > 0);
566      } else {
567        result.set(i, bits.get(i));
568      }
569    }
570    return result;
571  }
572
573  /**
574   * Returns the instantaneous randomized response for the given bits.
575   *
576   * <p>The instantaneous response is NOT memoized -- it is sampled randomly on
577   * every invocation.
578   */
579  private BitSet computeInstantaneousRandomizedResponse(BitSet bits) {
580    // Ensures that the input only has bits set in the lowest
581    BitSet masked = new BitSet();
582    masked.or(bits);
583    masked.andNot(inputMask);
584    checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit.");
585
586    if (probabilityP == 0.0 && probabilityQ == 1.0) {
587      return bits;
588    }
589
590    BitSet response = new BitSet(numBits);
591    for (int i = 0; i < numBits; i++) {
592      boolean bit = bits.get(i);
593      double probability = bit ? probabilityQ : probabilityP;
594      boolean responseBit = random.nextFloat() < probability;
595      response.set(i, responseBit);
596    }
597    return response;
598  }
599
600  // BEGIN android-changed: Added guava methods
601  private static void checkArgument(boolean expression, Object errorMessage) {
602    if (!expression) {
603      throw new IllegalArgumentException(String.valueOf(errorMessage));
604    }
605  }
606
607  private static void verify(boolean expression) {
608    if (!expression) {
609      throw new java.lang.IllegalStateException();
610    }
611  }
612  // END android-changed
613}
614