1/*
2*******************************************************************************
3*   Copyright (C) 2009-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*/
7package com.ibm.icu.text;
8
9import java.io.IOException;
10
11import com.ibm.icu.util.ICUUncheckedIOException;
12
13/**
14 * Normalization filtered by a UnicodeSet.
15 * Normalizes portions of the text contained in the filter set and leaves
16 * portions not contained in the filter set unchanged.
17 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
18 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
19 * This class implements all of (and only) the Normalizer2 API.
20 * An instance of this class is unmodifiable/immutable.
21 * @stable ICU 4.4
22 * @author Markus W. Scherer
23 */
24public class FilteredNormalizer2 extends Normalizer2 {
25    /**
26     * Constructs a filtered normalizer wrapping any Normalizer2 instance
27     * and a filter set.
28     * Both are aliased and must not be modified or deleted while this object
29     * is used.
30     * The filter set should be frozen; otherwise the performance will suffer greatly.
31     * @param n2 wrapped Normalizer2 instance
32     * @param filterSet UnicodeSet which determines the characters to be normalized
33     * @stable ICU 4.4
34     */
35    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
36        norm2=n2;
37        set=filterSet;
38    }
39
40    /**
41     * {@inheritDoc}
42     * @stable ICU 4.4
43     */
44    @Override
45    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
46        if(dest==src) {
47            throw new IllegalArgumentException();
48        }
49        dest.setLength(0);
50        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
51        return dest;
52    }
53    /**
54     * {@inheritDoc}
55     * @stable ICU 4.6
56     */
57    @Override
58    public Appendable normalize(CharSequence src, Appendable dest) {
59        if(dest==src) {
60            throw new IllegalArgumentException();
61        }
62        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
63    }
64
65    /**
66     * {@inheritDoc}
67     * @stable ICU 4.4
68     */
69    @Override
70    public StringBuilder normalizeSecondAndAppend(
71            StringBuilder first, CharSequence second) {
72        return normalizeSecondAndAppend(first, second, true);
73    }
74    /**
75     * {@inheritDoc}
76     * @stable ICU 4.4
77     */
78    @Override
79    public StringBuilder append(StringBuilder first, CharSequence second) {
80        return normalizeSecondAndAppend(first, second, false);
81    }
82
83    /**
84     * {@inheritDoc}
85     * @stable ICU 4.6
86     */
87    @Override
88    public String getDecomposition(int c) {
89        return set.contains(c) ? norm2.getDecomposition(c) : null;
90    }
91
92    /**
93     * {@inheritDoc}
94     * @stable ICU 49
95     */
96    @Override
97    public String getRawDecomposition(int c) {
98        return set.contains(c) ? norm2.getRawDecomposition(c) : null;
99    }
100
101    /**
102     * {@inheritDoc}
103     * @stable ICU 49
104     */
105    @Override
106    public int composePair(int a, int b) {
107        return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1;
108    }
109
110    /**
111     * {@inheritDoc}
112     * @stable ICU 49
113     */
114    @Override
115    public int getCombiningClass(int c) {
116        return set.contains(c) ? norm2.getCombiningClass(c) : 0;
117    }
118
119    /**
120     * {@inheritDoc}
121     * @stable ICU 4.4
122     */
123    @Override
124    public boolean isNormalized(CharSequence s) {
125        UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
126        for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
127            int spanLimit=set.span(s, prevSpanLimit, spanCondition);
128            if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
129                spanCondition=UnicodeSet.SpanCondition.SIMPLE;
130            } else {
131                if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
132                    return false;
133                }
134                spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
135            }
136            prevSpanLimit=spanLimit;
137        }
138        return true;
139    }
140
141    /**
142     * {@inheritDoc}
143     * @stable ICU 4.4
144     */
145    @Override
146    public Normalizer.QuickCheckResult quickCheck(CharSequence s) {
147        Normalizer.QuickCheckResult result=Normalizer.YES;
148        UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
149        for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
150            int spanLimit=set.span(s, prevSpanLimit, spanCondition);
151            if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
152                spanCondition=UnicodeSet.SpanCondition.SIMPLE;
153            } else {
154                Normalizer.QuickCheckResult qcResult=
155                    norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));
156                if(qcResult==Normalizer.NO) {
157                    return qcResult;
158                } else if(qcResult==Normalizer.MAYBE) {
159                    result=qcResult;
160                }
161                spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
162            }
163            prevSpanLimit=spanLimit;
164        }
165        return result;
166    }
167    /**
168     * {@inheritDoc}
169     * @stable ICU 4.4
170     */
171    @Override
172    public int spanQuickCheckYes(CharSequence s) {
173        UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
174        for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
175            int spanLimit=set.span(s, prevSpanLimit, spanCondition);
176            if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
177                spanCondition=UnicodeSet.SpanCondition.SIMPLE;
178            } else {
179                int yesLimit=
180                    prevSpanLimit+
181                    norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
182                if(yesLimit<spanLimit) {
183                    return yesLimit;
184                }
185                spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
186            }
187            prevSpanLimit=spanLimit;
188        }
189        return s.length();
190    }
191
192    /**
193     * {@inheritDoc}
194     * @stable ICU 4.4
195     */
196    @Override
197    public boolean hasBoundaryBefore(int c) {
198        return !set.contains(c) || norm2.hasBoundaryBefore(c);
199    }
200
201    /**
202     * {@inheritDoc}
203     * @stable ICU 4.4
204     */
205    @Override
206    public boolean hasBoundaryAfter(int c) {
207        return !set.contains(c) || norm2.hasBoundaryAfter(c);
208    }
209
210    /**
211     * {@inheritDoc}
212     * @stable ICU 4.4
213     */
214    @Override
215    public boolean isInert(int c) {
216        return !set.contains(c) || norm2.isInert(c);
217    }
218
219    // Internal: No argument checking, and appends to dest.
220    // Pass as input spanCondition the one that is likely to yield a non-zero
221    // span length at the start of src.
222    // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
223    // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
224    // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
225    // an in-filter prefix.
226    private Appendable normalize(CharSequence src, Appendable dest,
227                                 UnicodeSet.SpanCondition spanCondition) {
228        // Don't throw away destination buffer between iterations.
229        StringBuilder tempDest=new StringBuilder();
230        try {
231            for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
232                int spanLimit=set.span(src, prevSpanLimit, spanCondition);
233                int spanLength=spanLimit-prevSpanLimit;
234                if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
235                    if(spanLength!=0) {
236                        dest.append(src, prevSpanLimit, spanLimit);
237                    }
238                    spanCondition=UnicodeSet.SpanCondition.SIMPLE;
239                } else {
240                    if(spanLength!=0) {
241                        // Not norm2.normalizeSecondAndAppend() because we do not want
242                        // to modify the non-filter part of dest.
243                        dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
244                    }
245                    spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
246                }
247                prevSpanLimit=spanLimit;
248            }
249        } catch(IOException e) {
250            throw new ICUUncheckedIOException(e);
251        }
252        return dest;
253    }
254
255    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
256                                                   boolean doNormalize) {
257        if(first==second) {
258            throw new IllegalArgumentException();
259        }
260        if(first.length()==0) {
261            if(doNormalize) {
262                return normalize(second, first);
263            } else {
264                return first.append(second);
265            }
266        }
267        // merge the in-filter suffix of the first string with the in-filter prefix of the second
268        int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
269        if(prefixLimit!=0) {
270            CharSequence prefix=second.subSequence(0, prefixLimit);
271            int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
272            if(suffixStart==0) {
273                if(doNormalize) {
274                    norm2.normalizeSecondAndAppend(first, prefix);
275                } else {
276                    norm2.append(first, prefix);
277                }
278            } else {
279                StringBuilder middle=new StringBuilder(
280                        first.subSequence(suffixStart, first.length()));
281                if(doNormalize) {
282                    norm2.normalizeSecondAndAppend(middle, prefix);
283                } else {
284                    norm2.append(middle, prefix);
285                }
286                first.delete(suffixStart, 0x7fffffff).append(middle);
287            }
288        }
289        if(prefixLimit<second.length()) {
290            CharSequence rest=second.subSequence(prefixLimit, second.length());
291            if(doNormalize) {
292                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
293            } else {
294                first.append(rest);
295            }
296        }
297        return first;
298    }
299
300    private Normalizer2 norm2;
301    private UnicodeSet set;
302};
303