1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 * Copyright (C) 1998-2007 International Business Machines Corporation and
6 * Unicode, Inc. All Rights Reserved.<br>
7 * The Unicode Consortium makes no expressed or implied warranty of any
8 * kind, and assumes no liability for errors or omissions.
9 * No liability is assumed for incidental and consequential damages
10 * in connection with or arising out of the use of the information here.
11 */
12
13package android.icu.dev.test.normalizer;
14
15import android.icu.dev.test.UTF16Util;
16import android.icu.testsharding.MainTestShard;
17
18/**
19 * Implements Unicode Normalization Forms C, D, KC, KD.<br>
20 * See UTR#15 for details.<br>
21 * @author Mark Davis
22 * Updates for supplementary code points:
23 * Vladimir Weinstein & Markus Scherer
24 */
25@MainTestShard
26public class UnicodeNormalizer {
27//    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
28
29    /**
30     * Create a normalizer for a given form.
31     */
32    public UnicodeNormalizer(byte form, boolean fullData) {
33        this.form = form;
34        if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
35    }
36
37    /**
38    * Masks for the form selector
39    */
40    static final byte
41        COMPATIBILITY_MASK = 1,
42        COMPOSITION_MASK = 2;
43
44    /**
45    * Normalization Form Selector
46    */
47    public static final byte
48        D = 0 ,
49        C = COMPOSITION_MASK,
50        KD = COMPATIBILITY_MASK,
51        KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
52
53    /**
54    * Normalizes text according to the chosen form,
55    * replacing contents of the target buffer.
56    * @param   source      the original text, unnormalized
57    * @param   target      the resulting normalized text
58    */
59    public StringBuffer normalize(String source, StringBuffer target) {
60
61        // First decompose the source into target,
62        // then compose if the form requires.
63
64        if (source.length() != 0) {
65            internalDecompose(source, target);
66            if ((form & COMPOSITION_MASK) != 0) {
67                internalCompose(target);
68            }
69        }
70        return target;
71    }
72
73    /**
74    * Normalizes text according to the chosen form
75    * @param   source      the original text, unnormalized
76    * @return  target      the resulting normalized text
77    */
78    public String normalize(String source) {
79        return normalize(source, new StringBuffer()).toString();
80    }
81
82    // ======================================
83    //                  PRIVATES
84    // ======================================
85
86    /**
87     * The current form.
88     */
89    private byte form;
90
91    /**
92    * Decomposes text, either canonical or compatibility,
93    * replacing contents of the target buffer.
94    * @param   form        the normalization form. If COMPATIBILITY_MASK
95    *                      bit is on in this byte, then selects the recursive
96    *                      compatibility decomposition, otherwise selects
97    *                      the recursive canonical decomposition.
98    * @param   source      the original text, unnormalized
99    * @param   target      the resulting normalized text
100    */
101    private void internalDecompose(String source, StringBuffer target) {
102        StringBuffer buffer = new StringBuffer();
103        boolean canonical = (form & COMPATIBILITY_MASK) == 0;
104        int ch;
105        for (int i = 0; i < source.length();) {
106            buffer.setLength(0);
107            ch = UTF16Util.nextCodePoint(source, i);
108            i+=UTF16Util.codePointLength(ch);
109            data.getRecursiveDecomposition(canonical, ch, buffer);
110
111            // add all of the characters in the decomposition.
112            // (may be just the original character, if there was
113            // no decomposition mapping)
114
115            for (int j = 0; j < buffer.length();) {
116                ch = UTF16Util.nextCodePoint(buffer, j);
117                j+=UTF16Util.codePointLength(ch);
118                int chClass = data.getCanonicalClass(ch);
119                int k = target.length(); // insertion point
120                if (chClass != 0) {
121
122                    // bubble-sort combining marks as necessary
123
124                    int ch2;
125                    for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
126                        ch2 = UTF16Util.prevCodePoint(target, k);
127                        if (data.getCanonicalClass(ch2) <= chClass) break;
128                    }
129                }
130                UTF16Util.insertCodePoint(target, k, ch);
131            }
132        }
133    }
134
135    /**
136    * Composes text in place. Target must already
137    * have been decomposed.
138    * @param   target      input: decomposed text.
139    *                      output: the resulting normalized text.
140    */
141    private void internalCompose(StringBuffer target) {
142
143        int starterPos = 0;
144        int starterCh = UTF16Util.nextCodePoint(target,0);
145        int compPos = UTF16Util.codePointLength(starterCh);
146        int lastClass = data.getCanonicalClass(starterCh);
147        if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
148
149        // Loop on the decomposed characters, combining where possible
150
151        for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
152            int ch = UTF16Util.nextCodePoint(target, decompPos);
153            decompPos += UTF16Util.codePointLength(ch);
154            int chClass = data.getCanonicalClass(ch);
155            int composite = data.getPairwiseComposition(starterCh, ch);
156            if (composite != NormalizerData.NOT_COMPOSITE
157            && (lastClass < chClass || lastClass == 0)) {
158                UTF16Util.setCodePointAt(target, starterPos, composite);
159                starterCh = composite;
160            } else {
161                if (chClass == 0) {
162                    starterPos = compPos;
163                    starterCh  = ch;
164                }
165                lastClass = chClass;
166                decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
167                compPos += UTF16Util.codePointLength(ch);
168            }
169        }
170        target.setLength(compPos);
171    }
172
173    /**
174    * Contains normalization data from the Unicode Character Database.
175    * use false for the minimal set, true for the real set.
176    */
177    private static NormalizerData data = null;
178
179    /**
180    * Just accessible for testing.
181    */
182    boolean getExcluded (char ch) {
183        return data.getExcluded(ch);
184    }
185
186    /**
187    * Just accessible for testing.
188    */
189    String getRawDecompositionMapping (char ch) {
190        return data.getRawDecompositionMapping(ch);
191    }
192}