UnicodeNormalizer.java revision f86f25d102340da66b9c7cb6b2d5ecdc0de43ecf
1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 * Copyright (C) 1998-2007 International Business Machines Corporation and
6 * Unicode, Inc. All Rights Reserved.<br>
7 * The Unicode Consortium makes no expressed or implied warranty of any
8 * kind, and assumes no liability for errors or omissions.
9 * No liability is assumed for incidental and consequential damages
10 * in connection with or arising out of the use of the information here.
11 */
12
13package android.icu.dev.test.normalizer;
14
15import android.icu.dev.test.UTF16Util;
16
17/**
18 * Implements Unicode Normalization Forms C, D, KC, KD.<br>
19 * See UTR#15 for details.<br>
20 * @author Mark Davis
21 * Updates for supplementary code points:
22 * Vladimir Weinstein & Markus Scherer
23 */
24public class UnicodeNormalizer {
25//    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
26
27    /**
28     * Create a normalizer for a given form.
29     */
30    public UnicodeNormalizer(byte form, boolean fullData) {
31        this.form = form;
32        if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
33    }
34
35    /**
36    * Masks for the form selector
37    */
38    static final byte
39        COMPATIBILITY_MASK = 1,
40        COMPOSITION_MASK = 2;
41
42    /**
43    * Normalization Form Selector
44    */
45    public static final byte
46        D = 0 ,
47        C = COMPOSITION_MASK,
48        KD = COMPATIBILITY_MASK,
49        KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
50
51    /**
52    * Normalizes text according to the chosen form,
53    * replacing contents of the target buffer.
54    * @param   source      the original text, unnormalized
55    * @param   target      the resulting normalized text
56    */
57    public StringBuffer normalize(String source, StringBuffer target) {
58
59        // First decompose the source into target,
60        // then compose if the form requires.
61
62        if (source.length() != 0) {
63            internalDecompose(source, target);
64            if ((form & COMPOSITION_MASK) != 0) {
65                internalCompose(target);
66            }
67        }
68        return target;
69    }
70
71    /**
72    * Normalizes text according to the chosen form
73    * @param   source      the original text, unnormalized
74    * @return  target      the resulting normalized text
75    */
76    public String normalize(String source) {
77        return normalize(source, new StringBuffer()).toString();
78    }
79
80    // ======================================
81    //                  PRIVATES
82    // ======================================
83
84    /**
85     * The current form.
86     */
87    private byte form;
88
89    /**
90    * Decomposes text, either canonical or compatibility,
91    * replacing contents of the target buffer.
92    * @param   form        the normalization form. If COMPATIBILITY_MASK
93    *                      bit is on in this byte, then selects the recursive
94    *                      compatibility decomposition, otherwise selects
95    *                      the recursive canonical decomposition.
96    * @param   source      the original text, unnormalized
97    * @param   target      the resulting normalized text
98    */
99    private void internalDecompose(String source, StringBuffer target) {
100        StringBuffer buffer = new StringBuffer();
101        boolean canonical = (form & COMPATIBILITY_MASK) == 0;
102        int ch;
103        for (int i = 0; i < source.length();) {
104            buffer.setLength(0);
105            ch = UTF16Util.nextCodePoint(source, i);
106            i+=UTF16Util.codePointLength(ch);
107            data.getRecursiveDecomposition(canonical, ch, buffer);
108
109            // add all of the characters in the decomposition.
110            // (may be just the original character, if there was
111            // no decomposition mapping)
112
113            for (int j = 0; j < buffer.length();) {
114                ch = UTF16Util.nextCodePoint(buffer, j);
115                j+=UTF16Util.codePointLength(ch);
116                int chClass = data.getCanonicalClass(ch);
117                int k = target.length(); // insertion point
118                if (chClass != 0) {
119
120                    // bubble-sort combining marks as necessary
121
122                    int ch2;
123                    for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
124                        ch2 = UTF16Util.prevCodePoint(target, k);
125                        if (data.getCanonicalClass(ch2) <= chClass) break;
126                    }
127                }
128                UTF16Util.insertCodePoint(target, k, ch);
129            }
130        }
131    }
132
133    /**
134    * Composes text in place. Target must already
135    * have been decomposed.
136    * @param   target      input: decomposed text.
137    *                      output: the resulting normalized text.
138    */
139    private void internalCompose(StringBuffer target) {
140
141        int starterPos = 0;
142        int starterCh = UTF16Util.nextCodePoint(target,0);
143        int compPos = UTF16Util.codePointLength(starterCh);
144        int lastClass = data.getCanonicalClass(starterCh);
145        if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
146
147        // Loop on the decomposed characters, combining where possible
148
149        for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
150            int ch = UTF16Util.nextCodePoint(target, decompPos);
151            decompPos += UTF16Util.codePointLength(ch);
152            int chClass = data.getCanonicalClass(ch);
153            int composite = data.getPairwiseComposition(starterCh, ch);
154            if (composite != NormalizerData.NOT_COMPOSITE
155            && (lastClass < chClass || lastClass == 0)) {
156                UTF16Util.setCodePointAt(target, starterPos, composite);
157                starterCh = composite;
158            } else {
159                if (chClass == 0) {
160                    starterPos = compPos;
161                    starterCh  = ch;
162                }
163                lastClass = chClass;
164                decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
165                compPos += UTF16Util.codePointLength(ch);
166            }
167        }
168        target.setLength(compPos);
169    }
170
171    /**
172    * Contains normalization data from the Unicode Character Database.
173    * use false for the minimal set, true for the real set.
174    */
175    private static NormalizerData data = null;
176
177    /**
178    * Just accessible for testing.
179    */
180    boolean getExcluded (char ch) {
181        return data.getExcluded(ch);
182    }
183
184    /**
185    * Just accessible for testing.
186    */
187    String getRawDecompositionMapping (char ch) {
188        return data.getRawDecompositionMapping(ch);
189    }
190}