1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 * Copyright (C) 1998-2007 International Business Machines Corporation and 6 * Unicode, Inc. All Rights Reserved.<br> 7 * The Unicode Consortium makes no expressed or implied warranty of any 8 * kind, and assumes no liability for errors or omissions. 9 * No liability is assumed for incidental and consequential damages 10 * in connection with or arising out of the use of the information here. 11 */ 12 13package android.icu.dev.test.normalizer; 14 15import android.icu.dev.test.UTF16Util; 16import android.icu.testsharding.MainTestShard; 17 18/** 19 * Implements Unicode Normalization Forms C, D, KC, KD.<br> 20 * See UTR#15 for details.<br> 21 * @author Mark Davis 22 * Updates for supplementary code points: 23 * Vladimir Weinstein & Markus Scherer 24 */ 25@MainTestShard 26public class UnicodeNormalizer { 27// static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; 28 29 /** 30 * Create a normalizer for a given form. 31 */ 32 public UnicodeNormalizer(byte form, boolean fullData) { 33 this.form = form; 34 if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time 35 } 36 37 /** 38 * Masks for the form selector 39 */ 40 static final byte 41 COMPATIBILITY_MASK = 1, 42 COMPOSITION_MASK = 2; 43 44 /** 45 * Normalization Form Selector 46 */ 47 public static final byte 48 D = 0 , 49 C = COMPOSITION_MASK, 50 KD = COMPATIBILITY_MASK, 51 KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK); 52 53 /** 54 * Normalizes text according to the chosen form, 55 * replacing contents of the target buffer. 56 * @param source the original text, unnormalized 57 * @param target the resulting normalized text 58 */ 59 public StringBuffer normalize(String source, StringBuffer target) { 60 61 // First decompose the source into target, 62 // then compose if the form requires. 63 64 if (source.length() != 0) { 65 internalDecompose(source, target); 66 if ((form & COMPOSITION_MASK) != 0) { 67 internalCompose(target); 68 } 69 } 70 return target; 71 } 72 73 /** 74 * Normalizes text according to the chosen form 75 * @param source the original text, unnormalized 76 * @return target the resulting normalized text 77 */ 78 public String normalize(String source) { 79 return normalize(source, new StringBuffer()).toString(); 80 } 81 82 // ====================================== 83 // PRIVATES 84 // ====================================== 85 86 /** 87 * The current form. 88 */ 89 private byte form; 90 91 /** 92 * Decomposes text, either canonical or compatibility, 93 * replacing contents of the target buffer. 94 * @param form the normalization form. If COMPATIBILITY_MASK 95 * bit is on in this byte, then selects the recursive 96 * compatibility decomposition, otherwise selects 97 * the recursive canonical decomposition. 98 * @param source the original text, unnormalized 99 * @param target the resulting normalized text 100 */ 101 private void internalDecompose(String source, StringBuffer target) { 102 StringBuffer buffer = new StringBuffer(); 103 boolean canonical = (form & COMPATIBILITY_MASK) == 0; 104 int ch; 105 for (int i = 0; i < source.length();) { 106 buffer.setLength(0); 107 ch = UTF16Util.nextCodePoint(source, i); 108 i+=UTF16Util.codePointLength(ch); 109 data.getRecursiveDecomposition(canonical, ch, buffer); 110 111 // add all of the characters in the decomposition. 112 // (may be just the original character, if there was 113 // no decomposition mapping) 114 115 for (int j = 0; j < buffer.length();) { 116 ch = UTF16Util.nextCodePoint(buffer, j); 117 j+=UTF16Util.codePointLength(ch); 118 int chClass = data.getCanonicalClass(ch); 119 int k = target.length(); // insertion point 120 if (chClass != 0) { 121 122 // bubble-sort combining marks as necessary 123 124 int ch2; 125 for (; k > 0; k -= UTF16Util.codePointLength(ch2)) { 126 ch2 = UTF16Util.prevCodePoint(target, k); 127 if (data.getCanonicalClass(ch2) <= chClass) break; 128 } 129 } 130 UTF16Util.insertCodePoint(target, k, ch); 131 } 132 } 133 } 134 135 /** 136 * Composes text in place. Target must already 137 * have been decomposed. 138 * @param target input: decomposed text. 139 * output: the resulting normalized text. 140 */ 141 private void internalCompose(StringBuffer target) { 142 143 int starterPos = 0; 144 int starterCh = UTF16Util.nextCodePoint(target,0); 145 int compPos = UTF16Util.codePointLength(starterCh); 146 int lastClass = data.getCanonicalClass(starterCh); 147 if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence 148 149 // Loop on the decomposed characters, combining where possible 150 151 for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) { 152 int ch = UTF16Util.nextCodePoint(target, decompPos); 153 decompPos += UTF16Util.codePointLength(ch); 154 int chClass = data.getCanonicalClass(ch); 155 int composite = data.getPairwiseComposition(starterCh, ch); 156 if (composite != NormalizerData.NOT_COMPOSITE 157 && (lastClass < chClass || lastClass == 0)) { 158 UTF16Util.setCodePointAt(target, starterPos, composite); 159 starterCh = composite; 160 } else { 161 if (chClass == 0) { 162 starterPos = compPos; 163 starterCh = ch; 164 } 165 lastClass = chClass; 166 decompPos += UTF16Util.setCodePointAt(target, compPos, ch); 167 compPos += UTF16Util.codePointLength(ch); 168 } 169 } 170 target.setLength(compPos); 171 } 172 173 /** 174 * Contains normalization data from the Unicode Character Database. 175 * use false for the minimal set, true for the real set. 176 */ 177 private static NormalizerData data = null; 178 179 /** 180 * Just accessible for testing. 181 */ 182 boolean getExcluded (char ch) { 183 return data.getExcluded(ch); 184 } 185 186 /** 187 * Just accessible for testing. 188 */ 189 String getRawDecompositionMapping (char ch) { 190 return data.getRawDecompositionMapping(ch); 191 } 192}