12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 1998-2007 International Business Machines Corporation and 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode, Inc. All Rights Reserved.<br> 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The Unicode Consortium makes no expressed or implied warranty of any 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * kind, and assumes no liability for errors or omissions. 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * No liability is assumed for incidental and consequential damages 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * in connection with or arising out of the use of the information here. 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.dev.test.normalizer; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.BitSet; 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.dev.test.UTF16Util; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/** 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Accesses the Normalization Data used for Forms C and D.<br> 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author Mark Davis 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Updates for supplementary code points: 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Vladimir Weinstein & Markus Scherer 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic class NormalizerData { 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert// static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Constant for use in getPairwiseComposition 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public static final int NOT_COMPOSITE = '\uFFFF'; 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets the combining class of a character from the 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode Character Database. 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param ch the source character 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return value from 0 to 255 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getCanonicalClass(int ch) { 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return canonicalClass.get(ch); 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Returns the composite of the two characters. If the two 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * characters don't combine, returns NOT_COMPOSITE. 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param first first character (e.g. 'c') 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param second second character (e.g. \u0327 cedilla) 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @return composite (e.g. \u00C7 c cedilla) 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int getPairwiseComposition(int first, int second) { 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return compose.get(((long)first << 32) | second); 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Gets recursive decomposition of a character from the 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Unicode Character Database. 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param canonical If true 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * bit is on in this byte, then selects the recursive 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * canonical decomposition, otherwise selects 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * the recursive compatibility and canonical decomposition. 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param ch the source character 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @param buffer buffer to be filled with the decomposition 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) { 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String decomp = decompose.get(ch); 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (decomp != null && !(canonical && isCompatibility.get(ch))) { 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) { 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ch = UTF16Util.nextCodePoint(decomp, i); 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert getRecursiveDecomposition(canonical, ch, buffer); 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { // if no decomp, append 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert UTF16Util.appendCodePoint(buffer, ch); 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ================================================= 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // PRIVATES 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // ================================================= 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Only accessed by NormalizerBuilder. 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose, 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) { 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.canonicalClass = canonicalClass; 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.decompose = decompose; 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.compose = compose; 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.isCompatibility = isCompatibility; 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert this.isExcluded = isExcluded; 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Just accessible for testing. 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean getExcluded (char ch) { 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return isExcluded.get(ch); 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Just accessible for testing. 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert String getRawDecompositionMapping (char ch) { 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return decompose.get(ch); 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * For now, just use IntHashtable 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Two-stage tables would be used in an optimized implementation. 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private IntHashtable canonicalClass; 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * The main data table maps chars to a 32-bit int. 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * It holds either a pair: top = first, bottom = second 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * or singleton: top = 0, bottom = single. 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If there is no decomposition, the value is 0. 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Two-stage tables would be used in an optimized implementation. 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * An optimization could also map chars to a small index, then use that 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * index in a small array of ints. 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private IntStringHashtable decompose; 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Maps from pairs of characters to single. 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * If there is no decomposition, the value is NOT_COMPOSITE. 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private LongHashtable compose; 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Tells whether decomposition is canonical or not. 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private BitSet isCompatibility = new BitSet(); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert /** 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Tells whether character is script-excluded or not. 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Used only while building, and for testing. 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private BitSet isExcluded = new BitSet(); 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 140