// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 * Copyright (C) 1998-2007 International Business Machines Corporation and
 * Unicode, Inc. All Rights Reserved.<br>
 * The Unicode Consortium makes no expressed or implied warranty of any
 * kind, and assumes no liability for errors or omissions.
 * No liability is assumed for incidental and consequential damages
 * in connection with or arising out of the use of the information here.
 */
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.dev.test.normalizer;
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.util.BitSet;
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.dev.test.UTF16Util;
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Accesses the Normalization Data used for Forms C and D.<br>
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author Mark Davis
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Updates for supplementary code points:
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Vladimir Weinstein & Markus Scherer
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic class NormalizerData {
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert//    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Constant for use in getPairwiseComposition
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public static final int NOT_COMPOSITE = '\uFFFF';
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Gets the combining class of a character from the
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Unicode Character Database.
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param   ch      the source character
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @return          value from 0 to 255
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int getCanonicalClass(int ch) {
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return canonicalClass.get(ch);
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Returns the composite of the two characters. If the two
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * characters don't combine, returns NOT_COMPOSITE.
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param   first   first character (e.g. 'c')
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param   second  second character (e.g. \u0327 cedilla)
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @return          composite (e.g. \u00C7 c cedilla)
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int getPairwiseComposition(int first, int second) {
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return compose.get(((long)first << 32) | second);
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Gets recursive decomposition of a character from the
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Unicode Character Database.
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param   canonical    If true
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    *                  bit is on in this byte, then selects the recursive
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    *                  canonical decomposition, otherwise selects
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    *                  the recursive compatibility and canonical decomposition.
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param   ch      the source character
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * @param   buffer  buffer to be filled with the decomposition
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) {
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        String decomp = decompose.get(ch);
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (decomp != null && !(canonical && isCompatibility.get(ch))) {
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) {
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ch = UTF16Util.nextCodePoint(decomp, i);
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                getRecursiveDecomposition(canonical, ch, buffer);
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {                    // if no decomp, append
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            UTF16Util.appendCodePoint(buffer, ch);
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // =================================================
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    //                   PRIVATES
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // =================================================
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Only accessed by NormalizerBuilder.
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose,
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert      LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.canonicalClass = canonicalClass;
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.decompose = decompose;
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.compose = compose;
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.isCompatibility = isCompatibility;
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        this.isExcluded = isExcluded;
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Just accessible for testing.
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    boolean getExcluded (char ch) {
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return isExcluded.get(ch);
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Just accessible for testing.
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    String getRawDecompositionMapping (char ch) {
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return decompose.get(ch);
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * For now, just use IntHashtable
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Two-stage tables would be used in an optimized implementation.
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private IntHashtable canonicalClass;
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * The main data table maps chars to a 32-bit int.
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * It holds either a pair: top = first, bottom = second
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * or singleton: top = 0, bottom = single.
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * If there is no decomposition, the value is 0.
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Two-stage tables would be used in an optimized implementation.
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * An optimization could also map chars to a small index, then use that
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * index in a small array of ints.
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private IntStringHashtable decompose;
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Maps from pairs of characters to single.
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * If there is no decomposition, the value is NOT_COMPOSITE.
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private LongHashtable compose;
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Tells whether decomposition is canonical or not.
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private BitSet isCompatibility = new BitSet();
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Tells whether character is script-excluded or not.
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    * Used only while building, and for testing.
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    */
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private BitSet isExcluded = new BitSet();
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
140