1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4******************************************************************************* 5* Copyright (C) 2009-2014, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8*/ 9package com.ibm.icu.text; 10 11import java.io.IOException; 12 13import com.ibm.icu.util.ICUUncheckedIOException; 14 15/** 16 * Normalization filtered by a UnicodeSet. 17 * Normalizes portions of the text contained in the filter set and leaves 18 * portions not contained in the filter set unchanged. 19 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE). 20 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 21 * This class implements all of (and only) the Normalizer2 API. 22 * An instance of this class is unmodifiable/immutable. 23 * @stable ICU 4.4 24 * @author Markus W. Scherer 25 */ 26public class FilteredNormalizer2 extends Normalizer2 { 27 /** 28 * Constructs a filtered normalizer wrapping any Normalizer2 instance 29 * and a filter set. 30 * Both are aliased and must not be modified or deleted while this object 31 * is used. 32 * The filter set should be frozen; otherwise the performance will suffer greatly. 33 * @param n2 wrapped Normalizer2 instance 34 * @param filterSet UnicodeSet which determines the characters to be normalized 35 * @stable ICU 4.4 36 */ 37 public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) { 38 norm2=n2; 39 set=filterSet; 40 } 41 42 /** 43 * {@inheritDoc} 44 * @stable ICU 4.4 45 */ 46 @Override 47 public StringBuilder normalize(CharSequence src, StringBuilder dest) { 48 if(dest==src) { 49 throw new IllegalArgumentException(); 50 } 51 dest.setLength(0); 52 normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); 53 return dest; 54 } 55 /** 56 * {@inheritDoc} 57 * @stable ICU 4.6 58 */ 59 @Override 60 public Appendable normalize(CharSequence src, Appendable dest) { 61 if(dest==src) { 62 throw new IllegalArgumentException(); 63 } 64 return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); 65 } 66 67 /** 68 * {@inheritDoc} 69 * @stable ICU 4.4 70 */ 71 @Override 72 public StringBuilder normalizeSecondAndAppend( 73 StringBuilder first, CharSequence second) { 74 return normalizeSecondAndAppend(first, second, true); 75 } 76 /** 77 * {@inheritDoc} 78 * @stable ICU 4.4 79 */ 80 @Override 81 public StringBuilder append(StringBuilder first, CharSequence second) { 82 return normalizeSecondAndAppend(first, second, false); 83 } 84 85 /** 86 * {@inheritDoc} 87 * @stable ICU 4.6 88 */ 89 @Override 90 public String getDecomposition(int c) { 91 return set.contains(c) ? norm2.getDecomposition(c) : null; 92 } 93 94 /** 95 * {@inheritDoc} 96 * @stable ICU 49 97 */ 98 @Override 99 public String getRawDecomposition(int c) { 100 return set.contains(c) ? norm2.getRawDecomposition(c) : null; 101 } 102 103 /** 104 * {@inheritDoc} 105 * @stable ICU 49 106 */ 107 @Override 108 public int composePair(int a, int b) { 109 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1; 110 } 111 112 /** 113 * {@inheritDoc} 114 * @stable ICU 49 115 */ 116 @Override 117 public int getCombiningClass(int c) { 118 return set.contains(c) ? norm2.getCombiningClass(c) : 0; 119 } 120 121 /** 122 * {@inheritDoc} 123 * @stable ICU 4.4 124 */ 125 @Override 126 public boolean isNormalized(CharSequence s) { 127 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; 128 for(int prevSpanLimit=0; prevSpanLimit<s.length();) { 129 int spanLimit=set.span(s, prevSpanLimit, spanCondition); 130 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 131 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 132 } else { 133 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) { 134 return false; 135 } 136 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 137 } 138 prevSpanLimit=spanLimit; 139 } 140 return true; 141 } 142 143 /** 144 * {@inheritDoc} 145 * @stable ICU 4.4 146 */ 147 @Override 148 public Normalizer.QuickCheckResult quickCheck(CharSequence s) { 149 Normalizer.QuickCheckResult result=Normalizer.YES; 150 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; 151 for(int prevSpanLimit=0; prevSpanLimit<s.length();) { 152 int spanLimit=set.span(s, prevSpanLimit, spanCondition); 153 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 154 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 155 } else { 156 Normalizer.QuickCheckResult qcResult= 157 norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit)); 158 if(qcResult==Normalizer.NO) { 159 return qcResult; 160 } else if(qcResult==Normalizer.MAYBE) { 161 result=qcResult; 162 } 163 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 164 } 165 prevSpanLimit=spanLimit; 166 } 167 return result; 168 } 169 /** 170 * {@inheritDoc} 171 * @stable ICU 4.4 172 */ 173 @Override 174 public int spanQuickCheckYes(CharSequence s) { 175 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; 176 for(int prevSpanLimit=0; prevSpanLimit<s.length();) { 177 int spanLimit=set.span(s, prevSpanLimit, spanCondition); 178 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 179 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 180 } else { 181 int yesLimit= 182 prevSpanLimit+ 183 norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit)); 184 if(yesLimit<spanLimit) { 185 return yesLimit; 186 } 187 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 188 } 189 prevSpanLimit=spanLimit; 190 } 191 return s.length(); 192 } 193 194 /** 195 * {@inheritDoc} 196 * @stable ICU 4.4 197 */ 198 @Override 199 public boolean hasBoundaryBefore(int c) { 200 return !set.contains(c) || norm2.hasBoundaryBefore(c); 201 } 202 203 /** 204 * {@inheritDoc} 205 * @stable ICU 4.4 206 */ 207 @Override 208 public boolean hasBoundaryAfter(int c) { 209 return !set.contains(c) || norm2.hasBoundaryAfter(c); 210 } 211 212 /** 213 * {@inheritDoc} 214 * @stable ICU 4.4 215 */ 216 @Override 217 public boolean isInert(int c) { 218 return !set.contains(c) || norm2.isInert(c); 219 } 220 221 // Internal: No argument checking, and appends to dest. 222 // Pass as input spanCondition the one that is likely to yield a non-zero 223 // span length at the start of src. 224 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 225 // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src 226 // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after 227 // an in-filter prefix. 228 private Appendable normalize(CharSequence src, Appendable dest, 229 UnicodeSet.SpanCondition spanCondition) { 230 // Don't throw away destination buffer between iterations. 231 StringBuilder tempDest=new StringBuilder(); 232 try { 233 for(int prevSpanLimit=0; prevSpanLimit<src.length();) { 234 int spanLimit=set.span(src, prevSpanLimit, spanCondition); 235 int spanLength=spanLimit-prevSpanLimit; 236 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { 237 if(spanLength!=0) { 238 dest.append(src, prevSpanLimit, spanLimit); 239 } 240 spanCondition=UnicodeSet.SpanCondition.SIMPLE; 241 } else { 242 if(spanLength!=0) { 243 // Not norm2.normalizeSecondAndAppend() because we do not want 244 // to modify the non-filter part of dest. 245 dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest)); 246 } 247 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; 248 } 249 prevSpanLimit=spanLimit; 250 } 251 } catch(IOException e) { 252 throw new ICUUncheckedIOException(e); 253 } 254 return dest; 255 } 256 257 private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, 258 boolean doNormalize) { 259 if(first==second) { 260 throw new IllegalArgumentException(); 261 } 262 if(first.length()==0) { 263 if(doNormalize) { 264 return normalize(second, first); 265 } else { 266 return first.append(second); 267 } 268 } 269 // merge the in-filter suffix of the first string with the in-filter prefix of the second 270 int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE); 271 if(prefixLimit!=0) { 272 CharSequence prefix=second.subSequence(0, prefixLimit); 273 int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE); 274 if(suffixStart==0) { 275 if(doNormalize) { 276 norm2.normalizeSecondAndAppend(first, prefix); 277 } else { 278 norm2.append(first, prefix); 279 } 280 } else { 281 StringBuilder middle=new StringBuilder( 282 first.subSequence(suffixStart, first.length())); 283 if(doNormalize) { 284 norm2.normalizeSecondAndAppend(middle, prefix); 285 } else { 286 norm2.append(middle, prefix); 287 } 288 first.delete(suffixStart, 0x7fffffff).append(middle); 289 } 290 } 291 if(prefixLimit<second.length()) { 292 CharSequence rest=second.subSequence(prefixLimit, second.length()); 293 if(doNormalize) { 294 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED); 295 } else { 296 first.append(rest); 297 } 298 } 299 return first; 300 } 301 302 private Normalizer2 norm2; 303 private UnicodeSet set; 304}; 305