10888a09821a98ac0680fad765217302858e70fa4Paul Duffin/*
20888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Copyright (C) 2013 The Guava Authors
30888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
40888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
50888a09821a98ac0680fad765217302858e70fa4Paul Duffin * in compliance with the License. You may obtain a copy of the License at
60888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
70888a09821a98ac0680fad765217302858e70fa4Paul Duffin * http://www.apache.org/licenses/LICENSE-2.0
80888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
90888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Unless required by applicable law or agreed to in writing, software distributed under the License
100888a09821a98ac0680fad765217302858e70fa4Paul Duffin * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
110888a09821a98ac0680fad765217302858e70fa4Paul Duffin * or implied. See the License for the specific language governing permissions and limitations under
120888a09821a98ac0680fad765217302858e70fa4Paul Duffin * the License.
130888a09821a98ac0680fad765217302858e70fa4Paul Duffin */
140888a09821a98ac0680fad765217302858e70fa4Paul Duffin
150888a09821a98ac0680fad765217302858e70fa4Paul Duffinpackage com.google.common.base;
160888a09821a98ac0680fad765217302858e70fa4Paul Duffin
170888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport static com.google.common.base.Preconditions.checkPositionIndexes;
180888a09821a98ac0680fad765217302858e70fa4Paul Duffin
190888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.common.annotations.Beta;
200888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.common.annotations.GwtCompatible;
210888a09821a98ac0680fad765217302858e70fa4Paul Duffin
220888a09821a98ac0680fad765217302858e70fa4Paul Duffin/**
230888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
240888a09821a98ac0680fad765217302858e70fa4Paul Duffin * character encoding. UTF-8 is defined in section D92 of
250888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
260888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Specification, Chapter 3</a>.
270888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
280888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
290888a09821a98ac0680fad765217302858e70fa4Paul Duffin * introduced in Unicode 3.1. One implication of this is that it rejects
300888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
310888a09821a98ac0680fad765217302858e70fa4Paul Duffin * sequences, even though the JDK decoder may accept them.
320888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
330888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @author Martin Buchholz
340888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @author Clément Roux
350888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @since 16.0
360888a09821a98ac0680fad765217302858e70fa4Paul Duffin */
370888a09821a98ac0680fad765217302858e70fa4Paul Duffin@Beta
380888a09821a98ac0680fad765217302858e70fa4Paul Duffin@GwtCompatible
390888a09821a98ac0680fad765217302858e70fa4Paul Duffinpublic final class Utf8 {
400888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
410888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
420888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
430888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * both time and space.
440888a09821a98ac0680fad765217302858e70fa4Paul Duffin   *
450888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
460888a09821a98ac0680fad765217302858e70fa4Paul Duffin   *     surrogates)
470888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
480888a09821a98ac0680fad765217302858e70fa4Paul Duffin  public static int encodedLength(CharSequence sequence) {
490888a09821a98ac0680fad765217302858e70fa4Paul Duffin    // Warning to maintainers: this implementation is highly optimized.
500888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int utf16Length = sequence.length();
510888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int utf8Length = utf16Length;
520888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int i = 0;
530888a09821a98ac0680fad765217302858e70fa4Paul Duffin
540888a09821a98ac0680fad765217302858e70fa4Paul Duffin    // This loop optimizes for pure ASCII.
550888a09821a98ac0680fad765217302858e70fa4Paul Duffin    while (i < utf16Length && sequence.charAt(i) < 0x80) {
560888a09821a98ac0680fad765217302858e70fa4Paul Duffin      i++;
570888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
580888a09821a98ac0680fad765217302858e70fa4Paul Duffin
590888a09821a98ac0680fad765217302858e70fa4Paul Duffin    // This loop optimizes for chars less than 0x800.
600888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (; i < utf16Length; i++) {
610888a09821a98ac0680fad765217302858e70fa4Paul Duffin      char c = sequence.charAt(i);
620888a09821a98ac0680fad765217302858e70fa4Paul Duffin      if (c < 0x800) {
630888a09821a98ac0680fad765217302858e70fa4Paul Duffin        utf8Length += ((0x7f - c) >>> 31);  // branch free!
640888a09821a98ac0680fad765217302858e70fa4Paul Duffin      } else {
650888a09821a98ac0680fad765217302858e70fa4Paul Duffin        utf8Length += encodedLengthGeneral(sequence, i);
660888a09821a98ac0680fad765217302858e70fa4Paul Duffin        break;
670888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
680888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
690888a09821a98ac0680fad765217302858e70fa4Paul Duffin
700888a09821a98ac0680fad765217302858e70fa4Paul Duffin    if (utf8Length < utf16Length) {
710888a09821a98ac0680fad765217302858e70fa4Paul Duffin      // Necessary and sufficient condition for overflow because of maximum 3x expansion
720888a09821a98ac0680fad765217302858e70fa4Paul Duffin      throw new IllegalArgumentException("UTF-8 length does not fit in int: "
730888a09821a98ac0680fad765217302858e70fa4Paul Duffin                                         + (utf8Length + (1L << 32)));
740888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
750888a09821a98ac0680fad765217302858e70fa4Paul Duffin    return utf8Length;
760888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
770888a09821a98ac0680fad765217302858e70fa4Paul Duffin
780888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private static int encodedLengthGeneral(CharSequence sequence, int start) {
790888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int utf16Length = sequence.length();
800888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int utf8Length = 0;
810888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (int i = start; i < utf16Length; i++) {
820888a09821a98ac0680fad765217302858e70fa4Paul Duffin      char c = sequence.charAt(i);
830888a09821a98ac0680fad765217302858e70fa4Paul Duffin      if (c < 0x800) {
840888a09821a98ac0680fad765217302858e70fa4Paul Duffin        utf8Length += (0x7f - c) >>> 31; // branch free!
850888a09821a98ac0680fad765217302858e70fa4Paul Duffin      } else {
860888a09821a98ac0680fad765217302858e70fa4Paul Duffin        utf8Length += 2;
870888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // jdk7+: if (Character.isSurrogate(c)) {
880888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
890888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // Check that we have a well-formed surrogate pair.
900888a09821a98ac0680fad765217302858e70fa4Paul Duffin          int cp = Character.codePointAt(sequence, i);
910888a09821a98ac0680fad765217302858e70fa4Paul Duffin          if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
920888a09821a98ac0680fad765217302858e70fa4Paul Duffin            throw new IllegalArgumentException("Unpaired surrogate at index " + i);
930888a09821a98ac0680fad765217302858e70fa4Paul Duffin          }
940888a09821a98ac0680fad765217302858e70fa4Paul Duffin          i++;
950888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
960888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
970888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
980888a09821a98ac0680fad765217302858e70fa4Paul Duffin    return utf8Length;
990888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1000888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1010888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
1020888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
1030888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
1040888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
1050888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
1060888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * well-formed.
1070888a09821a98ac0680fad765217302858e70fa4Paul Duffin   *
1080888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
1090888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
1100888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
1110888a09821a98ac0680fad765217302858e70fa4Paul Duffin  public static boolean isWellFormed(byte[] bytes) {
1120888a09821a98ac0680fad765217302858e70fa4Paul Duffin    return isWellFormed(bytes, 0, bytes.length);
1130888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1140888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1150888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
1160888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
1170888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
1180888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * isWellFormed(bytes)} is true.
1190888a09821a98ac0680fad765217302858e70fa4Paul Duffin   *
1200888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * @param bytes the input buffer
1210888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * @param off the offset in the buffer of the first byte to read
1220888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * @param len the number of bytes to read from the buffer
1230888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
1240888a09821a98ac0680fad765217302858e70fa4Paul Duffin  public static boolean isWellFormed(byte[] bytes, int off, int len) {
1250888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int end = off + len;
1260888a09821a98ac0680fad765217302858e70fa4Paul Duffin    checkPositionIndexes(off, end, bytes.length);
1270888a09821a98ac0680fad765217302858e70fa4Paul Duffin    // Look for the first non-ASCII character.
1280888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (int i = off; i < end; i++) {
1290888a09821a98ac0680fad765217302858e70fa4Paul Duffin      if (bytes[i] < 0) {
1300888a09821a98ac0680fad765217302858e70fa4Paul Duffin        return isWellFormedSlowPath(bytes, i, end);
1310888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
1320888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
1330888a09821a98ac0680fad765217302858e70fa4Paul Duffin    return true;
1340888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1350888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1360888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
1370888a09821a98ac0680fad765217302858e70fa4Paul Duffin    int index = off;
1380888a09821a98ac0680fad765217302858e70fa4Paul Duffin    while (true) {
1390888a09821a98ac0680fad765217302858e70fa4Paul Duffin      int byte1;
1400888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1410888a09821a98ac0680fad765217302858e70fa4Paul Duffin      // Optimize for interior runs of ASCII bytes.
1420888a09821a98ac0680fad765217302858e70fa4Paul Duffin      do {
1430888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (index >= end) {
1440888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return true;
1450888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1460888a09821a98ac0680fad765217302858e70fa4Paul Duffin      } while ((byte1 = bytes[index++]) >= 0);
1470888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1480888a09821a98ac0680fad765217302858e70fa4Paul Duffin      if (byte1 < (byte) 0xE0) {
1490888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // Two-byte form.
1500888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (index == end) {
1510888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return false;
1520888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1530888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // Simultaneously check for illegal trailing-byte in leading position
1540888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // and overlong 2-byte form.
1550888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
1560888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return false;
1570888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1580888a09821a98ac0680fad765217302858e70fa4Paul Duffin      } else if (byte1 < (byte) 0xF0) {
1590888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // Three-byte form.
1600888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (index + 1 >= end) {
1610888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return false;
1620888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1630888a09821a98ac0680fad765217302858e70fa4Paul Duffin        int byte2 = bytes[index++];
1640888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (byte2 > (byte) 0xBF
1650888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // Overlong? 5 most significant bits must not all be zero.
1660888a09821a98ac0680fad765217302858e70fa4Paul Duffin            || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1670888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // Check for illegal surrogate codepoints.
1680888a09821a98ac0680fad765217302858e70fa4Paul Duffin            || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
1690888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // Third byte trailing-byte test.
1700888a09821a98ac0680fad765217302858e70fa4Paul Duffin            || bytes[index++] > (byte) 0xBF) {
1710888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return false;
1720888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1730888a09821a98ac0680fad765217302858e70fa4Paul Duffin      } else {
1740888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // Four-byte form.
1750888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (index + 2 >= end) {
1760888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return false;
1770888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1780888a09821a98ac0680fad765217302858e70fa4Paul Duffin        int byte2 = bytes[index++];
1790888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (byte2 > (byte) 0xBF
1800888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // Check that 1 <= plane <= 16. Tricky optimized form of:
1810888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // if (byte1 > (byte) 0xF4
1820888a09821a98ac0680fad765217302858e70fa4Paul Duffin            //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
1830888a09821a98ac0680fad765217302858e70fa4Paul Duffin            //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1840888a09821a98ac0680fad765217302858e70fa4Paul Duffin            || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1850888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // Third byte trailing-byte test
1860888a09821a98ac0680fad765217302858e70fa4Paul Duffin            || bytes[index++] > (byte) 0xBF
1870888a09821a98ac0680fad765217302858e70fa4Paul Duffin            // Fourth byte trailing-byte test
1880888a09821a98ac0680fad765217302858e70fa4Paul Duffin            || bytes[index++] > (byte) 0xBF) {
1890888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return false;
1900888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1910888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
1920888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
1930888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1940888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1950888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private Utf8() {}
1960888a09821a98ac0680fad765217302858e70fa4Paul Duffin}
197