1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5*   Copyright (C) 2009-2014, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*/
9package com.ibm.icu.text;
10
11import java.io.IOException;
12
13import com.ibm.icu.util.ICUUncheckedIOException;
14
15/**
16 * Normalization filtered by a UnicodeSet.
17 * Normalizes portions of the text contained in the filter set and leaves
18 * portions not contained in the filter set unchanged.
19 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
20 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
21 * This class implements all of (and only) the Normalizer2 API.
22 * An instance of this class is unmodifiable/immutable.
23 * @stable ICU 4.4
24 * @author Markus W. Scherer
25 */
26public class FilteredNormalizer2 extends Normalizer2 {
27    /**
28     * Constructs a filtered normalizer wrapping any Normalizer2 instance
29     * and a filter set.
30     * Both are aliased and must not be modified or deleted while this object
31     * is used.
32     * The filter set should be frozen; otherwise the performance will suffer greatly.
33     * @param n2 wrapped Normalizer2 instance
34     * @param filterSet UnicodeSet which determines the characters to be normalized
35     * @stable ICU 4.4
36     */
37    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
38        norm2=n2;
39        set=filterSet;
40    }
41
42    /**
43     * {@inheritDoc}
44     * @stable ICU 4.4
45     */
46    @Override
47    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
48        if(dest==src) {
49            throw new IllegalArgumentException();
50        }
51        dest.setLength(0);
52        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
53        return dest;
54    }
55    /**
56     * {@inheritDoc}
57     * @stable ICU 4.6
58     */
59    @Override
60    public Appendable normalize(CharSequence src, Appendable dest) {
61        if(dest==src) {
62            throw new IllegalArgumentException();
63        }
64        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
65    }
66
67    /**
68     * {@inheritDoc}
69     * @stable ICU 4.4
70     */
71    @Override
72    public StringBuilder normalizeSecondAndAppend(
73            StringBuilder first, CharSequence second) {
74        return normalizeSecondAndAppend(first, second, true);
75    }
76    /**
77     * {@inheritDoc}
78     * @stable ICU 4.4
79     */
80    @Override
81    public StringBuilder append(StringBuilder first, CharSequence second) {
82        return normalizeSecondAndAppend(first, second, false);
83    }
84
85    /**
86     * {@inheritDoc}
87     * @stable ICU 4.6
88     */
89    @Override
90    public String getDecomposition(int c) {
91        return set.contains(c) ? norm2.getDecomposition(c) : null;
92    }
93
94    /**
95     * {@inheritDoc}
96     * @stable ICU 49
97     */
98    @Override
99    public String getRawDecomposition(int c) {
100        return set.contains(c) ? norm2.getRawDecomposition(c) : null;
101    }
102
103    /**
104     * {@inheritDoc}
105     * @stable ICU 49
106     */
107    @Override
108    public int composePair(int a, int b) {
109        return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1;
110    }
111
112    /**
113     * {@inheritDoc}
114     * @stable ICU 49
115     */
116    @Override
117    public int getCombiningClass(int c) {
118        return set.contains(c) ? norm2.getCombiningClass(c) : 0;
119    }
120
121    /**
122     * {@inheritDoc}
123     * @stable ICU 4.4
124     */
125    @Override
126    public boolean isNormalized(CharSequence s) {
127        UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
128        for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
129            int spanLimit=set.span(s, prevSpanLimit, spanCondition);
130            if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
131                spanCondition=UnicodeSet.SpanCondition.SIMPLE;
132            } else {
133                if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
134                    return false;
135                }
136                spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
137            }
138            prevSpanLimit=spanLimit;
139        }
140        return true;
141    }
142
143    /**
144     * {@inheritDoc}
145     * @stable ICU 4.4
146     */
147    @Override
148    public Normalizer.QuickCheckResult quickCheck(CharSequence s) {
149        Normalizer.QuickCheckResult result=Normalizer.YES;
150        UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
151        for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
152            int spanLimit=set.span(s, prevSpanLimit, spanCondition);
153            if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
154                spanCondition=UnicodeSet.SpanCondition.SIMPLE;
155            } else {
156                Normalizer.QuickCheckResult qcResult=
157                    norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));
158                if(qcResult==Normalizer.NO) {
159                    return qcResult;
160                } else if(qcResult==Normalizer.MAYBE) {
161                    result=qcResult;
162                }
163                spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
164            }
165            prevSpanLimit=spanLimit;
166        }
167        return result;
168    }
169    /**
170     * {@inheritDoc}
171     * @stable ICU 4.4
172     */
173    @Override
174    public int spanQuickCheckYes(CharSequence s) {
175        UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
176        for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
177            int spanLimit=set.span(s, prevSpanLimit, spanCondition);
178            if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
179                spanCondition=UnicodeSet.SpanCondition.SIMPLE;
180            } else {
181                int yesLimit=
182                    prevSpanLimit+
183                    norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
184                if(yesLimit<spanLimit) {
185                    return yesLimit;
186                }
187                spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
188            }
189            prevSpanLimit=spanLimit;
190        }
191        return s.length();
192    }
193
194    /**
195     * {@inheritDoc}
196     * @stable ICU 4.4
197     */
198    @Override
199    public boolean hasBoundaryBefore(int c) {
200        return !set.contains(c) || norm2.hasBoundaryBefore(c);
201    }
202
203    /**
204     * {@inheritDoc}
205     * @stable ICU 4.4
206     */
207    @Override
208    public boolean hasBoundaryAfter(int c) {
209        return !set.contains(c) || norm2.hasBoundaryAfter(c);
210    }
211
212    /**
213     * {@inheritDoc}
214     * @stable ICU 4.4
215     */
216    @Override
217    public boolean isInert(int c) {
218        return !set.contains(c) || norm2.isInert(c);
219    }
220
221    // Internal: No argument checking, and appends to dest.
222    // Pass as input spanCondition the one that is likely to yield a non-zero
223    // span length at the start of src.
224    // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
225    // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
226    // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
227    // an in-filter prefix.
228    private Appendable normalize(CharSequence src, Appendable dest,
229                                 UnicodeSet.SpanCondition spanCondition) {
230        // Don't throw away destination buffer between iterations.
231        StringBuilder tempDest=new StringBuilder();
232        try {
233            for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
234                int spanLimit=set.span(src, prevSpanLimit, spanCondition);
235                int spanLength=spanLimit-prevSpanLimit;
236                if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
237                    if(spanLength!=0) {
238                        dest.append(src, prevSpanLimit, spanLimit);
239                    }
240                    spanCondition=UnicodeSet.SpanCondition.SIMPLE;
241                } else {
242                    if(spanLength!=0) {
243                        // Not norm2.normalizeSecondAndAppend() because we do not want
244                        // to modify the non-filter part of dest.
245                        dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
246                    }
247                    spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
248                }
249                prevSpanLimit=spanLimit;
250            }
251        } catch(IOException e) {
252            throw new ICUUncheckedIOException(e);
253        }
254        return dest;
255    }
256
257    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
258                                                   boolean doNormalize) {
259        if(first==second) {
260            throw new IllegalArgumentException();
261        }
262        if(first.length()==0) {
263            if(doNormalize) {
264                return normalize(second, first);
265            } else {
266                return first.append(second);
267            }
268        }
269        // merge the in-filter suffix of the first string with the in-filter prefix of the second
270        int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
271        if(prefixLimit!=0) {
272            CharSequence prefix=second.subSequence(0, prefixLimit);
273            int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
274            if(suffixStart==0) {
275                if(doNormalize) {
276                    norm2.normalizeSecondAndAppend(first, prefix);
277                } else {
278                    norm2.append(first, prefix);
279                }
280            } else {
281                StringBuilder middle=new StringBuilder(
282                        first.subSequence(suffixStart, first.length()));
283                if(doNormalize) {
284                    norm2.normalizeSecondAndAppend(middle, prefix);
285                } else {
286                    norm2.append(middle, prefix);
287                }
288                first.delete(suffixStart, 0x7fffffff).append(middle);
289            }
290        }
291        if(prefixLimit<second.length()) {
292            CharSequence rest=second.subSequence(prefixLimit, second.length());
293            if(doNormalize) {
294                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
295            } else {
296                first.append(rest);
297            }
298        }
299        return first;
300    }
301
302    private Normalizer2 norm2;
303    private UnicodeSet set;
304};
305