1/* 2******************************************************************************* 3* Copyright (C) 2009-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6*/ 7package com.ibm.icu.text; 8 9import java.io.IOException; 10 11import com.ibm.icu.util.ICUUncheckedIOException; 12 13/** 14 * Normalization filtered by a UnicodeSet. 15 * Normalizes portions of the text contained in the filter set and leaves 16 * portions not contained in the filter set unchanged. 17 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE). 18 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 19 * This class implements all of (and only) the Normalizer2 API. 20 * An instance of this class is unmodifiable/immutable. 21 * @stable ICU 4.4 22 * @author Markus W. Scherer 23 */ 24public class FilteredNormalizer2 extends Normalizer2 { 25 /** 26 * Constructs a filtered normalizer wrapping any Normalizer2 instance 27 * and a filter set. 28 * Both are aliased and must not be modified or deleted while this object 29 * is used. 30 * The filter set should be frozen; otherwise the performance will suffer greatly. 31 * @param n2 wrapped Normalizer2 instance 32 * @param filterSet UnicodeSet which determines the characters to be normalized 33 * @stable ICU 4.4 34 */ 35 public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) { 36 norm2=n2; 37 set=filterSet; 38 } 39 40 /** 41 * {@inheritDoc} 42 * @stable ICU 4.4 43 */ 44 @Override 45 public StringBuilder normalize(CharSequence src, StringBuilder dest) { 46 if(dest==src) { 47 throw new IllegalArgumentException(); 48 } 49 dest.setLength(0); 50 normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); 51 return dest; 52 } 53 /** 54 * {@inheritDoc} 55 * @stable ICU 4.6 56 */ 57 @Override 58 public Appendable normalize(CharSequence src, Appendable dest) { 59 if(dest==src) { 60 throw new IllegalArgumentException(); 61 } 62 return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); 63 } 64 65 /** 66 * {@inheritDoc} 67 * @stable ICU 4.4 68 */ 69 @Override 70 public StringBuilder normalizeSecondAndAppend( 71 StringBuilder first, CharSequence second) { 72 return normalizeSecondAndAppend(first, second, true); 73 } 74 /** 75 * {@inheritDoc} 76 * @stable ICU 4.4 77 */ 78 @Override 79 public StringBuilder append(StringBuilder first, CharSequence second) { 80 return normalizeSecondAndAppend(first, second, false); 81 } 82 83 /** 84 * {@inheritDoc} 85 * @stable ICU 4.6 86 */ 87 @Override 88 public String getDecomposition(int c) { 89 return set.contains(c) ? norm2.getDecomposition(c) : null; 90 } 91 92 /** 93 * {@inheritDoc} 94 * @stable ICU 49 95 */ 96 @Override 97 public String getRawDecomposition(int c) { 98 return set.contains(c) ? norm2.getRawDecomposition(c) : null; 99 } 100 101 /** 102 * {@inheritDoc} 103 * @stable ICU 49 104 */ 105 @Override 106 public int composePair(int a, int b) { 107 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1; 108 } 109 110 /** 111 * {@inheritDoc} 112 * @stable ICU 49 113 */ 114 @Override 115 public int getCombiningClass(int c) { 116 return set.contains(c) ? norm2.getCombiningClass(c) : 0; 117 } 118 119 /** 120 * {@inheritDoc} 121 * @stable ICU 4.4 122 */ 123 @Override 124 public boolean isNormalized(CharSequence s) { 125 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; 126 for(int prevSpanLimit=0; prevSpanLimit<s.length();) { 127 int spanLimit=set.span(s, prevSpanLimit, spanCondition); 128 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 129 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 130 } else { 131 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) { 132 return false; 133 } 134 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 135 } 136 prevSpanLimit=spanLimit; 137 } 138 return true; 139 } 140 141 /** 142 * {@inheritDoc} 143 * @stable ICU 4.4 144 */ 145 @Override 146 public Normalizer.QuickCheckResult quickCheck(CharSequence s) { 147 Normalizer.QuickCheckResult result=Normalizer.YES; 148 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; 149 for(int prevSpanLimit=0; prevSpanLimit<s.length();) { 150 int spanLimit=set.span(s, prevSpanLimit, spanCondition); 151 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 152 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 153 } else { 154 Normalizer.QuickCheckResult qcResult= 155 norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit)); 156 if(qcResult==Normalizer.NO) { 157 return qcResult; 158 } else if(qcResult==Normalizer.MAYBE) { 159 result=qcResult; 160 } 161 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 162 } 163 prevSpanLimit=spanLimit; 164 } 165 return result; 166 } 167 /** 168 * {@inheritDoc} 169 * @stable ICU 4.4 170 */ 171 @Override 172 public int spanQuickCheckYes(CharSequence s) { 173 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; 174 for(int prevSpanLimit=0; prevSpanLimit<s.length();) { 175 int spanLimit=set.span(s, prevSpanLimit, spanCondition); 176 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 177 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 178 } else { 179 int yesLimit= 180 prevSpanLimit+ 181 norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit)); 182 if(yesLimit<spanLimit) { 183 return yesLimit; 184 } 185 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 186 } 187 prevSpanLimit=spanLimit; 188 } 189 return s.length(); 190 } 191 192 /** 193 * {@inheritDoc} 194 * @stable ICU 4.4 195 */ 196 @Override 197 public boolean hasBoundaryBefore(int c) { 198 return !set.contains(c) || norm2.hasBoundaryBefore(c); 199 } 200 201 /** 202 * {@inheritDoc} 203 * @stable ICU 4.4 204 */ 205 @Override 206 public boolean hasBoundaryAfter(int c) { 207 return !set.contains(c) || norm2.hasBoundaryAfter(c); 208 } 209 210 /** 211 * {@inheritDoc} 212 * @stable ICU 4.4 213 */ 214 @Override 215 public boolean isInert(int c) { 216 return !set.contains(c) || norm2.isInert(c); 217 } 218 219 // Internal: No argument checking, and appends to dest. 220 // Pass as input spanCondition the one that is likely to yield a non-zero 221 // span length at the start of src. 222 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 223 // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src 224 // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after 225 // an in-filter prefix. 226 private Appendable normalize(CharSequence src, Appendable dest, 227 UnicodeSet.SpanCondition spanCondition) { 228 // Don't throw away destination buffer between iterations. 229 StringBuilder tempDest=new StringBuilder(); 230 try { 231 for(int prevSpanLimit=0; prevSpanLimit<src.length();) { 232 int spanLimit=set.span(src, prevSpanLimit, spanCondition); 233 int spanLength=spanLimit-prevSpanLimit; 234 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 235 if(spanLength!=0) { 236 dest.append(src, prevSpanLimit, spanLimit); 237 } 238 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 239 } else { 240 if(spanLength!=0) { 241 // Not norm2.normalizeSecondAndAppend() because we do not want 242 // to modify the non-filter part of dest. 243 dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest)); 244 } 245 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 246 } 247 prevSpanLimit=spanLimit; 248 } 249 } catch(IOException e) { 250 throw new ICUUncheckedIOException(e); 251 } 252 return dest; 253 } 254 255 private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, 256 boolean doNormalize) { 257 if(first==second) { 258 throw new IllegalArgumentException(); 259 } 260 if(first.length()==0) { 261 if(doNormalize) { 262 return normalize(second, first); 263 } else { 264 return first.append(second); 265 } 266 } 267 // merge the in-filter suffix of the first string with the in-filter prefix of the second 268 int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE); 269 if(prefixLimit!=0) { 270 CharSequence prefix=second.subSequence(0, prefixLimit); 271 int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE); 272 if(suffixStart==0) { 273 if(doNormalize) { 274 norm2.normalizeSecondAndAppend(first, prefix); 275 } else { 276 norm2.append(first, prefix); 277 } 278 } else { 279 StringBuilder middle=new StringBuilder( 280 first.subSequence(suffixStart, first.length())); 281 if(doNormalize) { 282 norm2.normalizeSecondAndAppend(middle, prefix); 283 } else { 284 norm2.append(middle, prefix); 285 } 286 first.delete(suffixStart, 0x7fffffff).append(middle); 287 } 288 } 289 if(prefixLimit<second.length()) { 290 CharSequence rest=second.subSequence(prefixLimit, second.length()); 291 if(doNormalize) { 292 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED); 293 } else { 294 first.append(rest); 295 } 296 } 297 return first; 298 } 299 300 private Normalizer2 norm2; 301 private UnicodeSet set; 302}; 303