10888a09821a98ac0680fad765217302858e70fa4Paul Duffin/* 20888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Copyright (C) 2013 The Guava Authors 30888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 40888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 50888a09821a98ac0680fad765217302858e70fa4Paul Duffin * in compliance with the License. You may obtain a copy of the License at 60888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 70888a09821a98ac0680fad765217302858e70fa4Paul Duffin * http://www.apache.org/licenses/LICENSE-2.0 80888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 90888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Unless required by applicable law or agreed to in writing, software distributed under the License 100888a09821a98ac0680fad765217302858e70fa4Paul Duffin * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 110888a09821a98ac0680fad765217302858e70fa4Paul Duffin * or implied. See the License for the specific language governing permissions and limitations under 120888a09821a98ac0680fad765217302858e70fa4Paul Duffin * the License. 130888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 140888a09821a98ac0680fad765217302858e70fa4Paul Duffin 150888a09821a98ac0680fad765217302858e70fa4Paul Duffinpackage com.google.common.base; 160888a09821a98ac0680fad765217302858e70fa4Paul Duffin 170888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport static com.google.common.base.Preconditions.checkPositionIndexes; 180888a09821a98ac0680fad765217302858e70fa4Paul Duffin 190888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.common.annotations.Beta; 200888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.common.annotations.GwtCompatible; 210888a09821a98ac0680fad765217302858e70fa4Paul Duffin 220888a09821a98ac0680fad765217302858e70fa4Paul Duffin/** 230888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8} 240888a09821a98ac0680fad765217302858e70fa4Paul Duffin * character encoding. UTF-8 is defined in section D92 of 250888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core 260888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Specification, Chapter 3</a>. 270888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 280888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8 290888a09821a98ac0680fad765217302858e70fa4Paul Duffin * introduced in Unicode 3.1. One implication of this is that it rejects 300888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte 310888a09821a98ac0680fad765217302858e70fa4Paul Duffin * sequences, even though the JDK decoder may accept them. 320888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 330888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @author Martin Buchholz 340888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @author Clément Roux 350888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @since 16.0 360888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 370888a09821a98ac0680fad765217302858e70fa4Paul Duffin@Beta 380888a09821a98ac0680fad765217302858e70fa4Paul Duffin@GwtCompatible 390888a09821a98ac0680fad765217302858e70fa4Paul Duffinpublic final class Utf8 { 400888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 410888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 420888a09821a98ac0680fad765217302858e70fa4Paul Duffin * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 430888a09821a98ac0680fad765217302858e70fa4Paul Duffin * both time and space. 440888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 450888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 460888a09821a98ac0680fad765217302858e70fa4Paul Duffin * surrogates) 470888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 480888a09821a98ac0680fad765217302858e70fa4Paul Duffin public static int encodedLength(CharSequence sequence) { 490888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Warning to maintainers: this implementation is highly optimized. 500888a09821a98ac0680fad765217302858e70fa4Paul Duffin int utf16Length = sequence.length(); 510888a09821a98ac0680fad765217302858e70fa4Paul Duffin int utf8Length = utf16Length; 520888a09821a98ac0680fad765217302858e70fa4Paul Duffin int i = 0; 530888a09821a98ac0680fad765217302858e70fa4Paul Duffin 540888a09821a98ac0680fad765217302858e70fa4Paul Duffin // This loop optimizes for pure ASCII. 550888a09821a98ac0680fad765217302858e70fa4Paul Duffin while (i < utf16Length && sequence.charAt(i) < 0x80) { 560888a09821a98ac0680fad765217302858e70fa4Paul Duffin i++; 570888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 580888a09821a98ac0680fad765217302858e70fa4Paul Duffin 590888a09821a98ac0680fad765217302858e70fa4Paul Duffin // This loop optimizes for chars less than 0x800. 600888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (; i < utf16Length; i++) { 610888a09821a98ac0680fad765217302858e70fa4Paul Duffin char c = sequence.charAt(i); 620888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (c < 0x800) { 630888a09821a98ac0680fad765217302858e70fa4Paul Duffin utf8Length += ((0x7f - c) >>> 31); // branch free! 640888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else { 650888a09821a98ac0680fad765217302858e70fa4Paul Duffin utf8Length += encodedLengthGeneral(sequence, i); 660888a09821a98ac0680fad765217302858e70fa4Paul Duffin break; 670888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 680888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 690888a09821a98ac0680fad765217302858e70fa4Paul Duffin 700888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (utf8Length < utf16Length) { 710888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Necessary and sufficient condition for overflow because of maximum 3x expansion 720888a09821a98ac0680fad765217302858e70fa4Paul Duffin throw new IllegalArgumentException("UTF-8 length does not fit in int: " 730888a09821a98ac0680fad765217302858e70fa4Paul Duffin + (utf8Length + (1L << 32))); 740888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 750888a09821a98ac0680fad765217302858e70fa4Paul Duffin return utf8Length; 760888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 770888a09821a98ac0680fad765217302858e70fa4Paul Duffin 780888a09821a98ac0680fad765217302858e70fa4Paul Duffin private static int encodedLengthGeneral(CharSequence sequence, int start) { 790888a09821a98ac0680fad765217302858e70fa4Paul Duffin int utf16Length = sequence.length(); 800888a09821a98ac0680fad765217302858e70fa4Paul Duffin int utf8Length = 0; 810888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int i = start; i < utf16Length; i++) { 820888a09821a98ac0680fad765217302858e70fa4Paul Duffin char c = sequence.charAt(i); 830888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (c < 0x800) { 840888a09821a98ac0680fad765217302858e70fa4Paul Duffin utf8Length += (0x7f - c) >>> 31; // branch free! 850888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else { 860888a09821a98ac0680fad765217302858e70fa4Paul Duffin utf8Length += 2; 870888a09821a98ac0680fad765217302858e70fa4Paul Duffin // jdk7+: if (Character.isSurrogate(c)) { 880888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) { 890888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Check that we have a well-formed surrogate pair. 900888a09821a98ac0680fad765217302858e70fa4Paul Duffin int cp = Character.codePointAt(sequence, i); 910888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 920888a09821a98ac0680fad765217302858e70fa4Paul Duffin throw new IllegalArgumentException("Unpaired surrogate at index " + i); 930888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 940888a09821a98ac0680fad765217302858e70fa4Paul Duffin i++; 950888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 960888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 970888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 980888a09821a98ac0680fad765217302858e70fa4Paul Duffin return utf8Length; 990888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1000888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1010888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 1020888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to 1030888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be 1040888a09821a98ac0680fad765217302858e70fa4Paul Duffin * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte 1050888a09821a98ac0680fad765217302858e70fa4Paul Duffin * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered 1060888a09821a98ac0680fad765217302858e70fa4Paul Duffin * well-formed. 1070888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 1080888a09821a98ac0680fad765217302858e70fa4Paul Duffin * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new 1090888a09821a98ac0680fad765217302858e70fa4Paul Duffin * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space. 1100888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 1110888a09821a98ac0680fad765217302858e70fa4Paul Duffin public static boolean isWellFormed(byte[] bytes) { 1120888a09821a98ac0680fad765217302858e70fa4Paul Duffin return isWellFormed(bytes, 0, bytes.length); 1130888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1140888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1150888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 1160888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by 1170888a09821a98ac0680fad765217302858e70fa4Paul Duffin * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code 1180888a09821a98ac0680fad765217302858e70fa4Paul Duffin * isWellFormed(bytes)} is true. 1190888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 1200888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @param bytes the input buffer 1210888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @param off the offset in the buffer of the first byte to read 1220888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @param len the number of bytes to read from the buffer 1230888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 1240888a09821a98ac0680fad765217302858e70fa4Paul Duffin public static boolean isWellFormed(byte[] bytes, int off, int len) { 1250888a09821a98ac0680fad765217302858e70fa4Paul Duffin int end = off + len; 1260888a09821a98ac0680fad765217302858e70fa4Paul Duffin checkPositionIndexes(off, end, bytes.length); 1270888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Look for the first non-ASCII character. 1280888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int i = off; i < end; i++) { 1290888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (bytes[i] < 0) { 1300888a09821a98ac0680fad765217302858e70fa4Paul Duffin return isWellFormedSlowPath(bytes, i, end); 1310888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1320888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1330888a09821a98ac0680fad765217302858e70fa4Paul Duffin return true; 1340888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1350888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1360888a09821a98ac0680fad765217302858e70fa4Paul Duffin private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) { 1370888a09821a98ac0680fad765217302858e70fa4Paul Duffin int index = off; 1380888a09821a98ac0680fad765217302858e70fa4Paul Duffin while (true) { 1390888a09821a98ac0680fad765217302858e70fa4Paul Duffin int byte1; 1400888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1410888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Optimize for interior runs of ASCII bytes. 1420888a09821a98ac0680fad765217302858e70fa4Paul Duffin do { 1430888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (index >= end) { 1440888a09821a98ac0680fad765217302858e70fa4Paul Duffin return true; 1450888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1460888a09821a98ac0680fad765217302858e70fa4Paul Duffin } while ((byte1 = bytes[index++]) >= 0); 1470888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1480888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (byte1 < (byte) 0xE0) { 1490888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Two-byte form. 1500888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (index == end) { 1510888a09821a98ac0680fad765217302858e70fa4Paul Duffin return false; 1520888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1530888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Simultaneously check for illegal trailing-byte in leading position 1540888a09821a98ac0680fad765217302858e70fa4Paul Duffin // and overlong 2-byte form. 1550888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { 1560888a09821a98ac0680fad765217302858e70fa4Paul Duffin return false; 1570888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1580888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else if (byte1 < (byte) 0xF0) { 1590888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Three-byte form. 1600888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (index + 1 >= end) { 1610888a09821a98ac0680fad765217302858e70fa4Paul Duffin return false; 1620888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1630888a09821a98ac0680fad765217302858e70fa4Paul Duffin int byte2 = bytes[index++]; 1640888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (byte2 > (byte) 0xBF 1650888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Overlong? 5 most significant bits must not all be zero. 1660888a09821a98ac0680fad765217302858e70fa4Paul Duffin || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1670888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Check for illegal surrogate codepoints. 1680888a09821a98ac0680fad765217302858e70fa4Paul Duffin || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2) 1690888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Third byte trailing-byte test. 1700888a09821a98ac0680fad765217302858e70fa4Paul Duffin || bytes[index++] > (byte) 0xBF) { 1710888a09821a98ac0680fad765217302858e70fa4Paul Duffin return false; 1720888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1730888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else { 1740888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Four-byte form. 1750888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (index + 2 >= end) { 1760888a09821a98ac0680fad765217302858e70fa4Paul Duffin return false; 1770888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1780888a09821a98ac0680fad765217302858e70fa4Paul Duffin int byte2 = bytes[index++]; 1790888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (byte2 > (byte) 0xBF 1800888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Check that 1 <= plane <= 16. Tricky optimized form of: 1810888a09821a98ac0680fad765217302858e70fa4Paul Duffin // if (byte1 > (byte) 0xF4 1820888a09821a98ac0680fad765217302858e70fa4Paul Duffin // || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 1830888a09821a98ac0680fad765217302858e70fa4Paul Duffin // || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1840888a09821a98ac0680fad765217302858e70fa4Paul Duffin || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1850888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Third byte trailing-byte test 1860888a09821a98ac0680fad765217302858e70fa4Paul Duffin || bytes[index++] > (byte) 0xBF 1870888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Fourth byte trailing-byte test 1880888a09821a98ac0680fad765217302858e70fa4Paul Duffin || bytes[index++] > (byte) 0xBF) { 1890888a09821a98ac0680fad765217302858e70fa4Paul Duffin return false; 1900888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1910888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1920888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1930888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1940888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1950888a09821a98ac0680fad765217302858e70fa4Paul Duffin private Utf8() {} 1960888a09821a98ac0680fad765217302858e70fa4Paul Duffin} 197