/* GENERATED SOURCE. DO NOT MODIFY. */ // © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html#License /* * Copyright (C) 1998-2007 International Business Machines Corporation and * Unicode, Inc. All Rights Reserved.
* The Unicode Consortium makes no expressed or implied warranty of any * kind, and assumes no liability for errors or omissions. * No liability is assumed for incidental and consequential damages * in connection with or arising out of the use of the information here. */ package android.icu.dev.test.normalizer; import android.icu.dev.test.UTF16Util; import android.icu.testsharding.MainTestShard; /** * Implements Unicode Normalization Forms C, D, KC, KD.
* See UTR#15 for details.
* @author Mark Davis * Updates for supplementary code points: * Vladimir Weinstein & Markus Scherer */ @MainTestShard public class UnicodeNormalizer { // static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; /** * Create a normalizer for a given form. */ public UnicodeNormalizer(byte form, boolean fullData) { this.form = form; if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time } /** * Masks for the form selector */ static final byte COMPATIBILITY_MASK = 1, COMPOSITION_MASK = 2; /** * Normalization Form Selector */ public static final byte D = 0 , C = COMPOSITION_MASK, KD = COMPATIBILITY_MASK, KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK); /** * Normalizes text according to the chosen form, * replacing contents of the target buffer. * @param source the original text, unnormalized * @param target the resulting normalized text */ public StringBuffer normalize(String source, StringBuffer target) { // First decompose the source into target, // then compose if the form requires. if (source.length() != 0) { internalDecompose(source, target); if ((form & COMPOSITION_MASK) != 0) { internalCompose(target); } } return target; } /** * Normalizes text according to the chosen form * @param source the original text, unnormalized * @return target the resulting normalized text */ public String normalize(String source) { return normalize(source, new StringBuffer()).toString(); } // ====================================== // PRIVATES // ====================================== /** * The current form. */ private byte form; /** * Decomposes text, either canonical or compatibility, * replacing contents of the target buffer. * @param form the normalization form. If COMPATIBILITY_MASK * bit is on in this byte, then selects the recursive * compatibility decomposition, otherwise selects * the recursive canonical decomposition. * @param source the original text, unnormalized * @param target the resulting normalized text */ private void internalDecompose(String source, StringBuffer target) { StringBuffer buffer = new StringBuffer(); boolean canonical = (form & COMPATIBILITY_MASK) == 0; int ch; for (int i = 0; i < source.length();) { buffer.setLength(0); ch = UTF16Util.nextCodePoint(source, i); i+=UTF16Util.codePointLength(ch); data.getRecursiveDecomposition(canonical, ch, buffer); // add all of the characters in the decomposition. // (may be just the original character, if there was // no decomposition mapping) for (int j = 0; j < buffer.length();) { ch = UTF16Util.nextCodePoint(buffer, j); j+=UTF16Util.codePointLength(ch); int chClass = data.getCanonicalClass(ch); int k = target.length(); // insertion point if (chClass != 0) { // bubble-sort combining marks as necessary int ch2; for (; k > 0; k -= UTF16Util.codePointLength(ch2)) { ch2 = UTF16Util.prevCodePoint(target, k); if (data.getCanonicalClass(ch2) <= chClass) break; } } UTF16Util.insertCodePoint(target, k, ch); } } } /** * Composes text in place. Target must already * have been decomposed. * @param target input: decomposed text. * output: the resulting normalized text. */ private void internalCompose(StringBuffer target) { int starterPos = 0; int starterCh = UTF16Util.nextCodePoint(target,0); int compPos = UTF16Util.codePointLength(starterCh); int lastClass = data.getCanonicalClass(starterCh); if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence // Loop on the decomposed characters, combining where possible for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) { int ch = UTF16Util.nextCodePoint(target, decompPos); decompPos += UTF16Util.codePointLength(ch); int chClass = data.getCanonicalClass(ch); int composite = data.getPairwiseComposition(starterCh, ch); if (composite != NormalizerData.NOT_COMPOSITE && (lastClass < chClass || lastClass == 0)) { UTF16Util.setCodePointAt(target, starterPos, composite); starterCh = composite; } else { if (chClass == 0) { starterPos = compPos; starterCh = ch; } lastClass = chClass; decompPos += UTF16Util.setCodePointAt(target, compPos, ch); compPos += UTF16Util.codePointLength(ch); } } target.setLength(compPos); } /** * Contains normalization data from the Unicode Character Database. * use false for the minimal set, true for the real set. */ private static NormalizerData data = null; /** * Just accessible for testing. */ boolean getExcluded (char ch) { return data.getExcluded(ch); } /** * Just accessible for testing. */ String getRawDecompositionMapping (char ch) { return data.getRawDecompositionMapping(ch); } }