UnicodeNormalizer.java revision f86f25d102340da66b9c7cb6b2d5ecdc0de43ecf
1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 * Copyright (C) 1998-2007 International Business Machines Corporation and 6 * Unicode, Inc. All Rights Reserved.<br> 7 * The Unicode Consortium makes no expressed or implied warranty of any 8 * kind, and assumes no liability for errors or omissions. 9 * No liability is assumed for incidental and consequential damages 10 * in connection with or arising out of the use of the information here. 11 */ 12 13package android.icu.dev.test.normalizer; 14 15import android.icu.dev.test.UTF16Util; 16 17/** 18 * Implements Unicode Normalization Forms C, D, KC, KD.<br> 19 * See UTR#15 for details.<br> 20 * @author Mark Davis 21 * Updates for supplementary code points: 22 * Vladimir Weinstein & Markus Scherer 23 */ 24public class UnicodeNormalizer { 25// static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; 26 27 /** 28 * Create a normalizer for a given form. 29 */ 30 public UnicodeNormalizer(byte form, boolean fullData) { 31 this.form = form; 32 if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time 33 } 34 35 /** 36 * Masks for the form selector 37 */ 38 static final byte 39 COMPATIBILITY_MASK = 1, 40 COMPOSITION_MASK = 2; 41 42 /** 43 * Normalization Form Selector 44 */ 45 public static final byte 46 D = 0 , 47 C = COMPOSITION_MASK, 48 KD = COMPATIBILITY_MASK, 49 KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK); 50 51 /** 52 * Normalizes text according to the chosen form, 53 * replacing contents of the target buffer. 54 * @param source the original text, unnormalized 55 * @param target the resulting normalized text 56 */ 57 public StringBuffer normalize(String source, StringBuffer target) { 58 59 // First decompose the source into target, 60 // then compose if the form requires. 61 62 if (source.length() != 0) { 63 internalDecompose(source, target); 64 if ((form & COMPOSITION_MASK) != 0) { 65 internalCompose(target); 66 } 67 } 68 return target; 69 } 70 71 /** 72 * Normalizes text according to the chosen form 73 * @param source the original text, unnormalized 74 * @return target the resulting normalized text 75 */ 76 public String normalize(String source) { 77 return normalize(source, new StringBuffer()).toString(); 78 } 79 80 // ====================================== 81 // PRIVATES 82 // ====================================== 83 84 /** 85 * The current form. 86 */ 87 private byte form; 88 89 /** 90 * Decomposes text, either canonical or compatibility, 91 * replacing contents of the target buffer. 92 * @param form the normalization form. If COMPATIBILITY_MASK 93 * bit is on in this byte, then selects the recursive 94 * compatibility decomposition, otherwise selects 95 * the recursive canonical decomposition. 96 * @param source the original text, unnormalized 97 * @param target the resulting normalized text 98 */ 99 private void internalDecompose(String source, StringBuffer target) { 100 StringBuffer buffer = new StringBuffer(); 101 boolean canonical = (form & COMPATIBILITY_MASK) == 0; 102 int ch; 103 for (int i = 0; i < source.length();) { 104 buffer.setLength(0); 105 ch = UTF16Util.nextCodePoint(source, i); 106 i+=UTF16Util.codePointLength(ch); 107 data.getRecursiveDecomposition(canonical, ch, buffer); 108 109 // add all of the characters in the decomposition. 110 // (may be just the original character, if there was 111 // no decomposition mapping) 112 113 for (int j = 0; j < buffer.length();) { 114 ch = UTF16Util.nextCodePoint(buffer, j); 115 j+=UTF16Util.codePointLength(ch); 116 int chClass = data.getCanonicalClass(ch); 117 int k = target.length(); // insertion point 118 if (chClass != 0) { 119 120 // bubble-sort combining marks as necessary 121 122 int ch2; 123 for (; k > 0; k -= UTF16Util.codePointLength(ch2)) { 124 ch2 = UTF16Util.prevCodePoint(target, k); 125 if (data.getCanonicalClass(ch2) <= chClass) break; 126 } 127 } 128 UTF16Util.insertCodePoint(target, k, ch); 129 } 130 } 131 } 132 133 /** 134 * Composes text in place. Target must already 135 * have been decomposed. 136 * @param target input: decomposed text. 137 * output: the resulting normalized text. 138 */ 139 private void internalCompose(StringBuffer target) { 140 141 int starterPos = 0; 142 int starterCh = UTF16Util.nextCodePoint(target,0); 143 int compPos = UTF16Util.codePointLength(starterCh); 144 int lastClass = data.getCanonicalClass(starterCh); 145 if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence 146 147 // Loop on the decomposed characters, combining where possible 148 149 for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) { 150 int ch = UTF16Util.nextCodePoint(target, decompPos); 151 decompPos += UTF16Util.codePointLength(ch); 152 int chClass = data.getCanonicalClass(ch); 153 int composite = data.getPairwiseComposition(starterCh, ch); 154 if (composite != NormalizerData.NOT_COMPOSITE 155 && (lastClass < chClass || lastClass == 0)) { 156 UTF16Util.setCodePointAt(target, starterPos, composite); 157 starterCh = composite; 158 } else { 159 if (chClass == 0) { 160 starterPos = compPos; 161 starterCh = ch; 162 } 163 lastClass = chClass; 164 decompPos += UTF16Util.setCodePointAt(target, compPos, ch); 165 compPos += UTF16Util.codePointLength(ch); 166 } 167 } 168 target.setLength(compPos); 169 } 170 171 /** 172 * Contains normalization data from the Unicode Character Database. 173 * use false for the minimal set, true for the real set. 174 */ 175 private static NormalizerData data = null; 176 177 /** 178 * Just accessible for testing. 179 */ 180 boolean getExcluded (char ch) { 181 return data.getExcluded(ch); 182 } 183 184 /** 185 * Just accessible for testing. 186 */ 187 String getRawDecompositionMapping (char ch) { 188 return data.getRawDecompositionMapping(ch); 189 } 190}