1/*
2*******************************************************************************
3*
4*   Copyright (C) 2002-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  punycode.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002jan31
14*   created by: Markus W. Scherer
15*/
16
17
18/* This ICU code derived from: */
19/*
20punycode.c 0.4.0 (2001-Nov-17-Sat)
21http://www.cs.berkeley.edu/~amc/idn/
22Adam M. Costello
23http://www.nicemice.net/amc/
24
25Disclaimer and license
26
27    Regarding this entire document or any portion of it (including
28    the pseudocode and C code), the author makes no guarantees and
29    is not responsible for any damage resulting from its use.  The
30    author grants irrevocable permission to anyone to use, modify,
31    and distribute it in any way that does not diminish the rights
32    of anyone else to use, modify, and distribute it, provided that
33    redistributed derivative works do not contain misleading author or
34    version information.  Derivative works need not be licensed under
35    similar terms.
36*/
37/*
38 * ICU modifications:
39 * - ICU data types and coding conventions
40 * - ICU string buffer handling with implicit source lengths
41 *   and destination preflighting
42 * - UTF-16 handling
43 */
44
45#include "unicode/utypes.h"
46
47#if !UCONFIG_NO_IDNA
48
49#include "unicode/ustring.h"
50#include "unicode/utf.h"
51#include "unicode/utf16.h"
52#include "ustr_imp.h"
53#include "cstring.h"
54#include "cmemory.h"
55#include "punycode.h"
56#include "uassert.h"
57
58
59/* Punycode ----------------------------------------------------------------- */
60
61/* Punycode parameters for Bootstring */
62#define BASE            36
63#define TMIN            1
64#define TMAX            26
65#define SKEW            38
66#define DAMP            700
67#define INITIAL_BIAS    72
68#define INITIAL_N       0x80
69
70/* "Basic" Unicode/ASCII code points */
71#define _HYPHEN         0X2d
72#define DELIMITER       _HYPHEN
73
74#define _ZERO_          0X30
75#define _NINE           0x39
76
77#define _SMALL_A        0X61
78#define _SMALL_Z        0X7a
79
80#define _CAPITAL_A      0X41
81#define _CAPITAL_Z      0X5a
82
83#define IS_BASIC(c) ((c)<0x80)
84#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
85
86/**
87 * digitToBasic() returns the basic code point whose value
88 * (when used for representing integers) is d, which must be in the
89 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
90 * nonzero, in which case the uppercase form is used.
91 */
92static inline char
93digitToBasic(int32_t digit, UBool uppercase) {
94    /*  0..25 map to ASCII a..z or A..Z */
95    /* 26..35 map to ASCII 0..9         */
96    if(digit<26) {
97        if(uppercase) {
98            return (char)(_CAPITAL_A+digit);
99        } else {
100            return (char)(_SMALL_A+digit);
101        }
102    } else {
103        return (char)((_ZERO_-26)+digit);
104    }
105}
106
107/**
108 * basicToDigit[] contains the numeric value of a basic code
109 * point (for use in representing integers) in the range 0 to
110 * BASE-1, or -1 if b is does not represent a value.
111 */
112static const int8_t
113basicToDigit[256]={
114    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
116
117    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
118    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
119
120    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
121    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
122
123    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
124    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
125
126    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
128
129    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
131
132    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
134
135    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
136    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
137};
138
139static inline char
140asciiCaseMap(char b, UBool uppercase) {
141    if(uppercase) {
142        if(_SMALL_A<=b && b<=_SMALL_Z) {
143            b-=(_SMALL_A-_CAPITAL_A);
144        }
145    } else {
146        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
147            b+=(_SMALL_A-_CAPITAL_A);
148        }
149    }
150    return b;
151}
152
153/* Punycode-specific Bootstring code ---------------------------------------- */
154
155/*
156 * The following code omits the {parts} of the pseudo-algorithm in the spec
157 * that are not used with the Punycode parameter set.
158 */
159
160/* Bias adaptation function. */
161static int32_t
162adaptBias(int32_t delta, int32_t length, UBool firstTime) {
163    int32_t count;
164
165    if(firstTime) {
166        delta/=DAMP;
167    } else {
168        delta/=2;
169    }
170
171    delta+=delta/length;
172    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
173        delta/=(BASE-TMIN);
174    }
175
176    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
177}
178
179#define MAX_CP_COUNT    200
180
181U_CFUNC int32_t
182u_strToPunycode(const UChar *src, int32_t srcLength,
183                UChar *dest, int32_t destCapacity,
184                const UBool *caseFlags,
185                UErrorCode *pErrorCode) {
186
187    int32_t cpBuffer[MAX_CP_COUNT];
188    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
189    UChar c, c2;
190
191    /* argument checking */
192    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
193        return 0;
194    }
195
196    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
197        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
198        return 0;
199    }
200
201    /*
202     * Handle the basic code points and
203     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
204     */
205    srcCPCount=destLength=0;
206    if(srcLength==-1) {
207        /* NUL-terminated input */
208        for(j=0; /* no condition */; ++j) {
209            if((c=src[j])==0) {
210                break;
211            }
212            if(srcCPCount==MAX_CP_COUNT) {
213                /* too many input code points */
214                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
215                return 0;
216            }
217            if(IS_BASIC(c)) {
218                cpBuffer[srcCPCount++]=0;
219                if(destLength<destCapacity) {
220                    dest[destLength]=
221                        caseFlags!=NULL ?
222                            asciiCaseMap((char)c, caseFlags[j]) :
223                            (char)c;
224                }
225                ++destLength;
226            } else {
227                n=(caseFlags!=NULL && caseFlags[j])<<31L;
228                if(U16_IS_SINGLE(c)) {
229                    n|=c;
230                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
231                    ++j;
232                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
233                } else {
234                    /* error: unmatched surrogate */
235                    *pErrorCode=U_INVALID_CHAR_FOUND;
236                    return 0;
237                }
238                cpBuffer[srcCPCount++]=n;
239            }
240        }
241    } else {
242        /* length-specified input */
243        for(j=0; j<srcLength; ++j) {
244            if(srcCPCount==MAX_CP_COUNT) {
245                /* too many input code points */
246                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
247                return 0;
248            }
249            c=src[j];
250            if(IS_BASIC(c)) {
251                cpBuffer[srcCPCount++]=0;
252                if(destLength<destCapacity) {
253                    dest[destLength]=
254                        caseFlags!=NULL ?
255                            asciiCaseMap((char)c, caseFlags[j]) :
256                            (char)c;
257                }
258                ++destLength;
259            } else {
260                n=(caseFlags!=NULL && caseFlags[j])<<31L;
261                if(U16_IS_SINGLE(c)) {
262                    n|=c;
263                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
264                    ++j;
265                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
266                } else {
267                    /* error: unmatched surrogate */
268                    *pErrorCode=U_INVALID_CHAR_FOUND;
269                    return 0;
270                }
271                cpBuffer[srcCPCount++]=n;
272            }
273        }
274    }
275
276    /* Finish the basic string - if it is not empty - with a delimiter. */
277    basicLength=destLength;
278    if(basicLength>0) {
279        if(destLength<destCapacity) {
280            dest[destLength]=DELIMITER;
281        }
282        ++destLength;
283    }
284
285    /*
286     * handledCPCount is the number of code points that have been handled
287     * basicLength is the number of basic code points
288     * destLength is the number of chars that have been output
289     */
290
291    /* Initialize the state: */
292    n=INITIAL_N;
293    delta=0;
294    bias=INITIAL_BIAS;
295
296    /* Main encoding loop: */
297    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
298        /*
299         * All non-basic code points < n have been handled already.
300         * Find the next larger one:
301         */
302        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
303            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
304            if(n<=q && q<m) {
305                m=q;
306            }
307        }
308
309        /*
310         * Increase delta enough to advance the decoder's
311         * <n,i> state to <m,0>, but guard against overflow:
312         */
313        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
314            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
315            return 0;
316        }
317        delta+=(m-n)*(handledCPCount+1);
318        n=m;
319
320        /* Encode a sequence of same code points n */
321        for(j=0; j<srcCPCount; ++j) {
322            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
323            if(q<n) {
324                ++delta;
325            } else if(q==n) {
326                /* Represent delta as a generalized variable-length integer: */
327                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
328
329                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
330
331                    t=k-bias;
332                    if(t<TMIN) {
333                        t=TMIN;
334                    } else if(t>TMAX) {
335                        t=TMAX;
336                    }
337                    */
338
339                    t=k-bias;
340                    if(t<TMIN) {
341                        t=TMIN;
342                    } else if(k>=(bias+TMAX)) {
343                        t=TMAX;
344                    }
345
346                    if(q<t) {
347                        break;
348                    }
349
350                    if(destLength<destCapacity) {
351                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
352                    }
353                    ++destLength;
354                    q=(q-t)/(BASE-t);
355                }
356
357                if(destLength<destCapacity) {
358                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
359                }
360                ++destLength;
361                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
362                delta=0;
363                ++handledCPCount;
364            }
365        }
366
367        ++delta;
368        ++n;
369    }
370
371    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
372}
373
374U_CFUNC int32_t
375u_strFromPunycode(const UChar *src, int32_t srcLength,
376                  UChar *dest, int32_t destCapacity,
377                  UBool *caseFlags,
378                  UErrorCode *pErrorCode) {
379    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
380            destCPCount, firstSupplementaryIndex, cpLength;
381    UChar b;
382
383    /* argument checking */
384    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
385        return 0;
386    }
387
388    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
389        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
390        return 0;
391    }
392
393    if(srcLength==-1) {
394        srcLength=u_strlen(src);
395    }
396
397    /*
398     * Handle the basic code points:
399     * Let basicLength be the number of input code points
400     * before the last delimiter, or 0 if there is none,
401     * then copy the first basicLength code points to the output.
402     *
403     * The two following loops iterate backward.
404     */
405    for(j=srcLength; j>0;) {
406        if(src[--j]==DELIMITER) {
407            break;
408        }
409    }
410    destLength=basicLength=destCPCount=j;
411    U_ASSERT(destLength>=0);
412
413    while(j>0) {
414        b=src[--j];
415        if(!IS_BASIC(b)) {
416            *pErrorCode=U_INVALID_CHAR_FOUND;
417            return 0;
418        }
419
420        if(j<destCapacity) {
421            dest[j]=(UChar)b;
422
423            if(caseFlags!=NULL) {
424                caseFlags[j]=IS_BASIC_UPPERCASE(b);
425            }
426        }
427    }
428
429    /* Initialize the state: */
430    n=INITIAL_N;
431    i=0;
432    bias=INITIAL_BIAS;
433    firstSupplementaryIndex=1000000000;
434
435    /*
436     * Main decoding loop:
437     * Start just after the last delimiter if any
438     * basic code points were copied; start at the beginning otherwise.
439     */
440    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
441        /*
442         * in is the index of the next character to be consumed, and
443         * destCPCount is the number of code points in the output array.
444         *
445         * Decode a generalized variable-length integer into delta,
446         * which gets added to i.  The overflow checking is easier
447         * if we increase i as we go, then subtract off its starting
448         * value at the end to obtain delta.
449         */
450        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
451            if(in>=srcLength) {
452                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
453                return 0;
454            }
455
456            digit=basicToDigit[(uint8_t)src[in++]];
457            if(digit<0) {
458                *pErrorCode=U_INVALID_CHAR_FOUND;
459                return 0;
460            }
461            if(digit>(0x7fffffff-i)/w) {
462                /* integer overflow */
463                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
464                return 0;
465            }
466
467            i+=digit*w;
468            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
469            t=k-bias;
470            if(t<TMIN) {
471                t=TMIN;
472            } else if(t>TMAX) {
473                t=TMAX;
474            }
475            */
476            t=k-bias;
477            if(t<TMIN) {
478                t=TMIN;
479            } else if(k>=(bias+TMAX)) {
480                t=TMAX;
481            }
482            if(digit<t) {
483                break;
484            }
485
486            if(w>0x7fffffff/(BASE-t)) {
487                /* integer overflow */
488                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
489                return 0;
490            }
491            w*=BASE-t;
492        }
493
494        /*
495         * Modification from sample code:
496         * Increments destCPCount here,
497         * where needed instead of in for() loop tail.
498         */
499        ++destCPCount;
500        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
501
502        /*
503         * i was supposed to wrap around from (incremented) destCPCount to 0,
504         * incrementing n each time, so we'll fix that now:
505         */
506        if(i/destCPCount>(0x7fffffff-n)) {
507            /* integer overflow */
508            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
509            return 0;
510        }
511
512        n+=i/destCPCount;
513        i%=destCPCount;
514        /* not needed for Punycode: */
515        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
516
517        if(n>0x10ffff || U_IS_SURROGATE(n)) {
518            /* Unicode code point overflow */
519            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
520            return 0;
521        }
522
523        /* Insert n at position i of the output: */
524        cpLength=U16_LENGTH(n);
525        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
526            int32_t codeUnitIndex;
527
528            /*
529             * Handle indexes when supplementary code points are present.
530             *
531             * In almost all cases, there will be only BMP code points before i
532             * and even in the entire string.
533             * This is handled with the same efficiency as with UTF-32.
534             *
535             * Only the rare cases with supplementary code points are handled
536             * more slowly - but not too bad since this is an insertion anyway.
537             */
538            if(i<=firstSupplementaryIndex) {
539                codeUnitIndex=i;
540                if(cpLength>1) {
541                    firstSupplementaryIndex=codeUnitIndex;
542                } else {
543                    ++firstSupplementaryIndex;
544                }
545            } else {
546                codeUnitIndex=firstSupplementaryIndex;
547                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
548            }
549
550            /* use the UChar index codeUnitIndex instead of the code point index i */
551            if(codeUnitIndex<destLength) {
552                uprv_memmove(dest+codeUnitIndex+cpLength,
553                             dest+codeUnitIndex,
554                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
555                if(caseFlags!=NULL) {
556                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
557                                 caseFlags+codeUnitIndex,
558                                 destLength-codeUnitIndex);
559                }
560            }
561            if(cpLength==1) {
562                /* BMP, insert one code unit */
563                dest[codeUnitIndex]=(UChar)n;
564            } else {
565                /* supplementary character, insert two code units */
566                dest[codeUnitIndex]=U16_LEAD(n);
567                dest[codeUnitIndex+1]=U16_TRAIL(n);
568            }
569            if(caseFlags!=NULL) {
570                /* Case of last character determines uppercase flag: */
571                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
572                if(cpLength==2) {
573                    caseFlags[codeUnitIndex+1]=FALSE;
574                }
575            }
576        }
577        destLength+=cpLength;
578        U_ASSERT(destLength>=0);
579        ++i;
580    }
581
582    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
583}
584
585/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
586
587#endif /* #if !UCONFIG_NO_IDNA */
588