1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2003-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10package com.ibm.icu.text;
11
12import java.io.IOException;
13import java.io.InputStream;
14import java.lang.ref.WeakReference;
15import java.nio.ByteBuffer;
16
17import com.ibm.icu.impl.CharTrie;
18import com.ibm.icu.impl.ICUBinary;
19import com.ibm.icu.impl.StringPrepDataReader;
20import com.ibm.icu.impl.UBiDiProps;
21import com.ibm.icu.lang.UCharacter;
22import com.ibm.icu.lang.UCharacterDirection;
23import com.ibm.icu.util.ICUUncheckedIOException;
24import com.ibm.icu.util.VersionInfo;
25
/**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
 * <ul>
 *     <li> Unassigned Table: Contains code points that are unassigned
 *          in the Unicode Version supported by StringPrep. Currently
 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
 * </ul>
 *
 * The procedure for preparing Unicode strings:
 * <ol>
 *      <li> Map: For each character in the input, check if it has a mapping
 *           and, if so, replace it with its mapping. </li>
 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
 *           normalization. </li>
 *      <li> Prohibit: Check for any characters that are not allowed in the
 *           output.  If any are found, return an error.</li>
 *      <li> Check bidi: Possibly check for right-to-left characters, and if
 *           any are found, make sure that the whole string satisfies the
 *           requirements for bidirectional strings.  If the string does not
 *           satisfy the requirements for bidirectional strings, return an
 *           error.  </li>
 * </ol>
 * @author Ram Viswanadha
 * @stable ICU 2.8
 */
59public final class StringPrep {
    /**
     * Option to prohibit processing of unassigned code points in the input.
     * This is the value with no option bits set; with it, an unassigned code
     * point in the input causes a {@link StringPrepParseException}.
     *
     * @see   #prepare
     * @stable ICU 2.8
     */
    public static final int DEFAULT = 0x0000;

    /**
     * Option to allow processing of unassigned code points in the input.
     * This is a bit flag tested against the <code>options</code> argument.
     *
     * @see   #prepare
     * @stable ICU 2.8
     */
    public static final int ALLOW_UNASSIGNED = 0x0001;
75
    /**
     * Profile type: RFC3491 Nameprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3491_NAMEPREP = 0;

    /**
     * Profile type: RFC3530 nfs4_cs_prep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_CS_PREP = 1;

    /**
     * Profile type: RFC3530 nfs4_cs_prep with case insensitive option
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_CS_PREP_CI = 2;

    /**
     * Profile type: RFC3530 nfs4_cis_prep.
     * This profile is backed by the same data file name ("rfc3491") as
     * {@link #RFC3491_NAMEPREP}.
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_CIS_PREP = 3;

    /**
     * Profile type: RFC3530 nfs4_mixed_prep for prefix
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4;

    /**
     * Profile type: RFC3530 nfs4_mixed_prep for suffix.
     * This profile is backed by the same data file name ("rfc3491") as
     * {@link #RFC3491_NAMEPREP}.
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5;

    /**
     * Profile type: RFC3722 iSCSI
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3722_ISCSI = 6;

    /**
     * Profile type: RFC3920 XMPP Nodeprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3920_NODEPREP = 7;

    /**
     * Profile type: RFC3920 XMPP Resourceprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC3920_RESOURCEPREP = 8;

    /**
     * Profile type: RFC4011 Policy MIB Stringprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4011_MIB = 9;

    /**
     * Profile type: RFC4013 SASLprep
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4013_SASLPREP = 10;

    /**
     * Profile type: RFC4505 trace
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4505_TRACE = 11;

    /**
     * Profile type: RFC4518 LDAP
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4518_LDAP = 12;

    /**
     * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix
     * matching rules
     * @see #getInstance(int)
     * @stable ICU 4.2
     */
    public static final int RFC4518_LDAP_CI = 13;

    // Last available profile. getInstance(int) rejects any profile above this
    // value, and the instance cache is sized MAX_PROFILE+1.
    private static final int MAX_PROFILE = RFC4518_LDAP_CI;
177
    // Profile names must be aligned to profile type definitions: the array is
    // indexed by the profile type constant. getInstance(int) appends ".spp" to
    // the name to form the data file name. Several profile types share "rfc3491".
    private static final String[] PROFILE_NAMES = {
        "rfc3491",      /* RFC3491_NAMEPREP */
        "rfc3530cs",    /* RFC3530_NFS4_CS_PREP */
        "rfc3530csci",  /* RFC3530_NFS4_CS_PREP_CI */
        "rfc3491",      /* RFC3530_NFS4_CIS_PREP */
        "rfc3530mixp",  /* RFC3530_NFS4_MIXED_PREP_PREFIX */
        "rfc3491",      /* RFC3530_NFS4_MIXED_PREP_SUFFIX */
        "rfc3722",      /* RFC3722_ISCSI */
        "rfc3920node",  /* RFC3920_NODEPREP */
        "rfc3920res",   /* RFC3920_RESOURCEPREP */
        "rfc4011",      /* RFC4011_MIB */
        "rfc4013",      /* RFC4013_SASLPREP */
        "rfc4505",      /* RFC4505_TRACE */
        "rfc4518",      /* RFC4518_LDAP */
        "rfc4518ci",    /* RFC4518_LDAP_CI */
    };
195
    // Per-profile instance cache, indexed by profile type (0..MAX_PROFILE).
    // Entries are weak references, so an unused instance may be collected and is
    // then re-created on demand. getInstance(int) reads and writes this array
    // while synchronized on it.
    // The unchecked cast is needed because Java cannot create generic arrays.
    @SuppressWarnings({"unchecked", "rawtypes"})
    private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1];
198
    // Type codes that getValues() stores in Values.type. For trie words at or
    // above TYPE_THRESHOLD the type is (trieWord - TYPE_THRESHOLD); MAP, DELETE
    // and TYPE_LIMIT are assigned explicitly by getValues(). TYPE_LIMIT marks a
    // code point with no entry, which is copied to the output unchanged.
    private static final int UNASSIGNED        = 0x0000;
    private static final int MAP               = 0x0001;
    private static final int PROHIBITED        = 0x0002;
    private static final int DELETE            = 0x0003;
    private static final int TYPE_LIMIT        = 0x0004;

    // Bit flags tested against indexes[OPTIONS] when a profile is loaded.
    private static final int NORMALIZATION_ON  = 0x0001;
    private static final int CHECK_BIDI_ON     = 0x0002;

    // Trie words >= TYPE_THRESHOLD carry a type code instead of a mapping.
    private static final int TYPE_THRESHOLD       = 0xFFF0;
    // A mapping word whose value (trieWord >> 2) equals this marks a deleted code point.
    private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
    //private static final int MAX_INDEX_TOP_LENGTH = 0x0003;

    /* indexes[] value names */
//  private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
    private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The size in bytes of the array that contains the mapping */
    private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
    private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
    private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
    private static final int THREE_UCHARS_MAPPING_INDEX_START =  5; /* The starting index of 3 UChars mapping index in the mapping data array */
    private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6; /* The starting index of the mappings beyond the 3 UChars range; map() reads their length from the mapping data itself */
    private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
    private static final int INDEX_TOP                        = 16;                          /* number of index values read from the data; changing this requires a new formatVersion */
222
223
    // CharTrie implementation for reading the trie data; queried once per code
    // point through getCodePointValue()
    private CharTrie sprepTrie;
    // Indexes read from the data file (see the "indexes[] value names" constants)
    private int[] indexes;
    // mapping data read from the data file
    private char[] mappingData;
    // the version of Unicode supported by the data file
    private VersionInfo sprepUniVer;
    // the Unicode version of last entry in the
    // NormalizationCorrections.txt file if normalization
    // is turned on
    private VersionInfo normCorrVer;
    // Option to turn on Normalization
    private boolean doNFKC;
    // Option to turn on checking for BiDi rules
    private boolean checkBiDi;
    // bidi properties; assigned only when checkBiDi is true, otherwise left null
    private UBiDiProps bdp;
242
243    private char getCodePointValue(int ch){
244        return sprepTrie.getCodePointValue(ch);
245    }
246
247    private static VersionInfo getVersionInfo(int comp){
248        int micro = comp & 0xFF;
249        int milli =(comp >> 8)  & 0xFF;
250        int minor =(comp >> 16) & 0xFF;
251        int major =(comp >> 24) & 0xFF;
252        return VersionInfo.getInstance(major,minor,milli,micro);
253    }
254
255    private static VersionInfo getVersionInfo(byte[] version){
256        if(version.length != 4){
257            return null;
258        }
259        return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
260    }
261
    /**
     * Creates a StringPrep object after reading the input stream.
     * The object does not hold a reference to the input stream, so the stream can be
     * closed after the method returns.
     *
     * @param inputStream The stream for reading the StringPrep profile binary
     * @throws IOException An exception occurs when I/O of the inputstream is invalid
     * @stable ICU 2.8
     */
    public StringPrep(InputStream inputStream) throws IOException{
        // TODO: Add a public constructor that takes ByteBuffer directly.
        // The stream content is first turned into a ByteBuffer; all parsing is
        // delegated to the private ByteBuffer constructor.
        // NOTE(review): the helper's name indicates it also closes the stream - confirm in ICUBinary.
        this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream));
    }
275
276    private StringPrep(ByteBuffer bytes) throws IOException {
277        StringPrepDataReader reader = new StringPrepDataReader(bytes);
278
279        // read the indexes
280        indexes = reader.readIndexes(INDEX_TOP);
281
282        sprepTrie = new CharTrie(bytes, null);
283
284        //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
285        // load the rest of the data data and initialize the data members
286        mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2);
287
288        // get the options
289        doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
290        checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
291        sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
292        normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
293        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
294        if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
295           normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
296           ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
297           ){
298            throw new IOException("Normalization Correction version not supported");
299        }
300
301        if(checkBiDi) {
302            bdp=UBiDiProps.INSTANCE;
303        }
304    }
305
306    /**
307     * Gets a StringPrep instance for the specified profile
308     *
309     * @param profile The profile passed to find the StringPrep instance.
310     * @stable ICU 4.2
311     */
312    public static StringPrep getInstance(int profile) {
313        if (profile < 0 || profile > MAX_PROFILE) {
314            throw new IllegalArgumentException("Bad profile type");
315        }
316
317        StringPrep instance = null;
318
319        // A StringPrep instance is immutable.  We use a single instance
320        // per type and store it in the internal cache.
321        synchronized (CACHE) {
322            WeakReference<StringPrep> ref = CACHE[profile];
323            if (ref != null) {
324                instance = ref.get();
325            }
326
327            if (instance == null) {
328                ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp");
329                if (bytes != null) {
330                    try {
331                        instance = new StringPrep(bytes);
332                    } catch (IOException e) {
333                        throw new ICUUncheckedIOException(e);
334                    }
335                }
336                if (instance != null) {
337                    CACHE[profile] = new WeakReference<StringPrep>(instance);
338                }
339            }
340        }
341        return instance;
342    }
343
344    private static final class Values{
345        boolean isIndex;
346        int value;
347        int type;
348        public void reset(){
349            isIndex = false;
350            value = 0;
351            type = -1;
352        }
353    }
354
355    private static final void getValues(char trieWord,Values values){
356        values.reset();
357        if(trieWord == 0){
358            /*
359             * Initial value stored in the mapping table
360             * just return TYPE_LIMIT .. so that
361             * the source codepoint is copied to the destination
362             */
363            values.type = TYPE_LIMIT;
364        }else if(trieWord >= TYPE_THRESHOLD){
365            values.type = (trieWord - TYPE_THRESHOLD);
366        }else{
367            /* get the type */
368            values.type = MAP;
369            /* ascertain if the value is index or delta */
370            if((trieWord & 0x02)>0){
371                values.isIndex = true;
372                values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
373
374            }else{
375                values.isIndex = false;
376                values.value = (trieWord<<16)>>16;
377                values.value =  (values.value >> 2);
378
379            }
380
381            if((trieWord>>2) == MAX_INDEX_VALUE){
382                values.type = DELETE;
383                values.isIndex = false;
384                values.value = 0;
385            }
386        }
387    }
388
389
390
391    private StringBuffer map( UCharacterIterator iter, int options)
392                            throws StringPrepParseException{
393
394        Values val = new Values();
395        char result = 0;
396        int ch  = UCharacterIterator.DONE;
397        StringBuffer dest = new StringBuffer();
398        boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
399
400        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
401
402            result = getCodePointValue(ch);
403            getValues(result,val);
404
405            // check if the source codepoint is unassigned
406            if(val.type == UNASSIGNED && allowUnassigned == false){
407                 throw new StringPrepParseException("An unassigned code point was found in the input",
408                                          StringPrepParseException.UNASSIGNED_ERROR,
409                                          iter.getText(),iter.getIndex());
410            }else if((val.type == MAP)){
411                int index, length;
412
413                if(val.isIndex){
414                    index = val.value;
415                    if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
416                             index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
417                        length = 1;
418                    }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
419                             index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
420                        length = 2;
421                    }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
422                             index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
423                        length = 3;
424                    }else{
425                        length = mappingData[index++];
426                    }
427                    /* copy mapping to destination */
428                    dest.append(mappingData,index,length);
429                    continue;
430
431                }else{
432                    ch -= val.value;
433                }
434            }else if(val.type == DELETE){
435                // just consume the codepoint and contine
436                continue;
437            }
438            //copy the source into destination
439            UTF16.append(dest,ch);
440        }
441
442        return dest;
443    }
444
445
446    private StringBuffer normalize(StringBuffer src){
447        return new StringBuffer(
448            Normalizer.normalize(
449                src.toString(),
450                Normalizer.NFKC,
451                Normalizer.UNICODE_3_2));
452    }
453    /*
454    boolean isLabelSeparator(int ch){
455        int result = getCodePointValue(ch);
456        if( (result & 0x07)  == LABEL_SEPARATOR){
457            return true;
458        }
459        return false;
460    }
461    */
462     /*
463       1) Map -- For each character in the input, check if it has a mapping
464          and, if so, replace it with its mapping.
465
466       2) Normalize -- Possibly normalize the result of step 1 using Unicode
467          normalization.
468
469       3) Prohibit -- Check for any characters that are not allowed in the
470          output.  If any are found, return an error.
471
472       4) Check bidi -- Possibly check for right-to-left characters, and if
473          any are found, make sure that the whole string satisfies the
474          requirements for bidirectional strings.  If the string does not
475          satisfy the requirements for bidirectional strings, return an
476          error.
477          [Unicode3.2] defines several bidirectional categories; each character
478           has one bidirectional category assigned to it.  For the purposes of
479           the requirements below, an "RandALCat character" is a character that
480           has Unicode bidirectional categories "R" or "AL"; an "LCat character"
481           is a character that has Unicode bidirectional category "L".  Note
482
483
484           that there are many characters which fall in neither of the above
485           definitions; Latin digits (<U+0030> through <U+0039>) are examples of
486           this because they have bidirectional category "EN".
487
488           In any profile that specifies bidirectional character handling, all
489           three of the following requirements MUST be met:
490
491           1) The characters in section 5.8 MUST be prohibited.
492
493           2) If a string contains any RandALCat character, the string MUST NOT
494              contain any LCat character.
495
496           3) If a string contains any RandALCat character, a RandALCat
497              character MUST be the first character of the string, and a
498              RandALCat character MUST be the last character of the string.
499    */
500    /**
501     * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
502     * checks for prohibited and BiDi characters in the order defined by RFC 3454
503     * depending on the options specified in the profile.
504     *
505     * @param src           A UCharacterIterator object containing the source string
506     * @param options       A bit set of options:
507     *   <ul>
508     *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
509     *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
510     *          as normal Unicode code points.</li>
511     *   </ul>
512     * @return StringBuffer A StringBuffer containing the output
513     * @throws StringPrepParseException An exception occurs when parsing a string is invalid.
514     * @stable ICU 2.8
515     */
516    public StringBuffer prepare(UCharacterIterator src, int options)
517                        throws StringPrepParseException{
518
519        // map
520        StringBuffer mapOut = map(src,options);
521        StringBuffer normOut = mapOut;// initialize
522
523        if(doNFKC){
524            // normalize
525            normOut = normalize(mapOut);
526        }
527
528        int ch;
529        char result;
530        UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
531        Values val = new Values();
532        int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
533            firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
534        int rtlPos=-1, ltrPos=-1;
535        boolean rightToLeft=false, leftToRight=false;
536
537        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
538            result = getCodePointValue(ch);
539            getValues(result,val);
540
541            if(val.type == PROHIBITED ){
542                throw new StringPrepParseException("A prohibited code point was found in the input",
543                                         StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value);
544            }
545
546            if(checkBiDi) {
547                direction = bdp.getClass(ch);
548                if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
549                    firstCharDir = direction;
550                }
551                if(direction == UCharacterDirection.LEFT_TO_RIGHT){
552                    leftToRight = true;
553                    ltrPos = iter.getIndex()-1;
554                }
555                if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
556                    rightToLeft = true;
557                    rtlPos = iter.getIndex()-1;
558                }
559            }
560        }
561        if(checkBiDi == true){
562            // satisfy 2
563            if( leftToRight == true && rightToLeft == true){
564                throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
565                                         StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
566                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
567             }
568
569            //satisfy 3
570            if( rightToLeft == true &&
571                !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
572                (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
573              ){
574                throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
575                                         StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
576                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
577            }
578        }
579        return normOut;
580
581      }
582
583    /**
584     * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC),
585     * checks for prohibited and BiDi characters in the order defined by RFC 3454
586     * depending on the options specified in the profile.
587     *
588     * @param src           A string
589     * @param options       A bit set of options:
590     *   <ul>
591     *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
592     *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
593     *          as normal Unicode code points.</li>
594     *   </ul>
595     * @return String A String containing the output
596     * @throws StringPrepParseException An exception when parsing or preparing a string is invalid.
597     * @stable ICU 4.2
598     */
599    public String prepare(String src, int options)
600        throws StringPrepParseException{
601        StringBuffer result = prepare(UCharacterIterator.getInstance(src), options);
602        return result.toString();
603    }
604}
605