1/*
2*******************************************************************************
3*
4*   Copyright (C) 2002-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  punycode.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002jan31
14*   created by: Markus W. Scherer
15*/
16
17
18/* This ICU code derived from: */
19/*
20punycode.c 0.4.0 (2001-Nov-17-Sat)
21http://www.cs.berkeley.edu/~amc/idn/
22Adam M. Costello
23http://www.nicemice.net/amc/
24
25Disclaimer and license
26
27    Regarding this entire document or any portion of it (including
28    the pseudocode and C code), the author makes no guarantees and
29    is not responsible for any damage resulting from its use.  The
30    author grants irrevocable permission to anyone to use, modify,
31    and distribute it in any way that does not diminish the rights
32    of anyone else to use, modify, and distribute it, provided that
33    redistributed derivative works do not contain misleading author or
34    version information.  Derivative works need not be licensed under
35    similar terms.
36*/
37/*
38 * ICU modifications:
39 * - ICU data types and coding conventions
40 * - ICU string buffer handling with implicit source lengths
41 *   and destination preflighting
42 * - UTF-16 handling
43 */
44
45#include "unicode/utypes.h"
46
47#if !UCONFIG_NO_IDNA
48
49#include "ustr_imp.h"
50#include "cstring.h"
51#include "cmemory.h"
52#include "punycode.h"
53#include "unicode/ustring.h"
54
55
56/* Punycode ----------------------------------------------------------------- */
57
58/* Punycode parameters for Bootstring */
59#define BASE            36
60#define TMIN            1
61#define TMAX            26
62#define SKEW            38
63#define DAMP            700
64#define INITIAL_BIAS    72
65#define INITIAL_N       0x80
66
67/* "Basic" Unicode/ASCII code points */
68#define _HYPHEN         0X2d
69#define DELIMITER       _HYPHEN
70
71#define _ZERO_          0X30
72#define _NINE           0x39
73
74#define _SMALL_A        0X61
75#define _SMALL_Z        0X7a
76
77#define _CAPITAL_A      0X41
78#define _CAPITAL_Z      0X5a
79
80#define IS_BASIC(c) ((c)<0x80)
81#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
82
83/**
84 * digitToBasic() returns the basic code point whose value
85 * (when used for representing integers) is d, which must be in the
86 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
87 * nonzero, in which case the uppercase form is used.
88 */
89static U_INLINE char
90digitToBasic(int32_t digit, UBool uppercase) {
91    /*  0..25 map to ASCII a..z or A..Z */
92    /* 26..35 map to ASCII 0..9         */
93    if(digit<26) {
94        if(uppercase) {
95            return (char)(_CAPITAL_A+digit);
96        } else {
97            return (char)(_SMALL_A+digit);
98        }
99    } else {
100        return (char)((_ZERO_-26)+digit);
101    }
102}
103
104/**
105 * basicToDigit[] contains the numeric value of a basic code
106 * point (for use in representing integers) in the range 0 to
107 * BASE-1, or -1 if b is does not represent a value.
108 */
109static const int8_t
110basicToDigit[256]={
111    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
112    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
113
114    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
116
117    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
118    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
119
120    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
121    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
122
123    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
124    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
125
126    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
128
129    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
131
132    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
134};
135
136static U_INLINE char
137asciiCaseMap(char b, UBool uppercase) {
138    if(uppercase) {
139        if(_SMALL_A<=b && b<=_SMALL_Z) {
140            b-=(_SMALL_A-_CAPITAL_A);
141        }
142    } else {
143        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
144            b+=(_SMALL_A-_CAPITAL_A);
145        }
146    }
147    return b;
148}
149
150/* Punycode-specific Bootstring code ---------------------------------------- */
151
152/*
153 * The following code omits the {parts} of the pseudo-algorithm in the spec
154 * that are not used with the Punycode parameter set.
155 */
156
157/* Bias adaptation function. */
158static int32_t
159adaptBias(int32_t delta, int32_t length, UBool firstTime) {
160    int32_t count;
161
162    if(firstTime) {
163        delta/=DAMP;
164    } else {
165        delta/=2;
166    }
167
168    delta+=delta/length;
169    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
170        delta/=(BASE-TMIN);
171    }
172
173    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
174}
175
176#define MAX_CP_COUNT    200
177
178U_CFUNC int32_t
179u_strToPunycode(const UChar *src, int32_t srcLength,
180                UChar *dest, int32_t destCapacity,
181                const UBool *caseFlags,
182                UErrorCode *pErrorCode) {
183
184    int32_t cpBuffer[MAX_CP_COUNT];
185    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
186    UChar c, c2;
187
188    /* argument checking */
189    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
190        return 0;
191    }
192
193    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
194        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
195        return 0;
196    }
197
198    /*
199     * Handle the basic code points and
200     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
201     */
202    srcCPCount=destLength=0;
203    if(srcLength==-1) {
204        /* NUL-terminated input */
205        for(j=0; /* no condition */; ++j) {
206            if((c=src[j])==0) {
207                break;
208            }
209            if(srcCPCount==MAX_CP_COUNT) {
210                /* too many input code points */
211                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
212                return 0;
213            }
214            if(IS_BASIC(c)) {
215                cpBuffer[srcCPCount++]=0;
216                if(destLength<destCapacity) {
217                    dest[destLength]=
218                        caseFlags!=NULL ?
219                            asciiCaseMap((char)c, caseFlags[j]) :
220                            (char)c;
221                }
222                ++destLength;
223            } else {
224                n=(caseFlags!=NULL && caseFlags[j])<<31L;
225                if(UTF_IS_SINGLE(c)) {
226                    n|=c;
227                } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
228                    ++j;
229                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
230                } else {
231                    /* error: unmatched surrogate */
232                    *pErrorCode=U_INVALID_CHAR_FOUND;
233                    return 0;
234                }
235                cpBuffer[srcCPCount++]=n;
236            }
237        }
238    } else {
239        /* length-specified input */
240        for(j=0; j<srcLength; ++j) {
241            if(srcCPCount==MAX_CP_COUNT) {
242                /* too many input code points */
243                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
244                return 0;
245            }
246            c=src[j];
247            if(IS_BASIC(c)) {
248                cpBuffer[srcCPCount++]=0;
249                if(destLength<destCapacity) {
250                    dest[destLength]=
251                        caseFlags!=NULL ?
252                            asciiCaseMap((char)c, caseFlags[j]) :
253                            (char)c;
254                }
255                ++destLength;
256            } else {
257                n=(caseFlags!=NULL && caseFlags[j])<<31L;
258                if(UTF_IS_SINGLE(c)) {
259                    n|=c;
260                } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
261                    ++j;
262                    n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
263                } else {
264                    /* error: unmatched surrogate */
265                    *pErrorCode=U_INVALID_CHAR_FOUND;
266                    return 0;
267                }
268                cpBuffer[srcCPCount++]=n;
269            }
270        }
271    }
272
273    /* Finish the basic string - if it is not empty - with a delimiter. */
274    basicLength=destLength;
275    if(basicLength>0) {
276        if(destLength<destCapacity) {
277            dest[destLength]=DELIMITER;
278        }
279        ++destLength;
280    }
281
282    /*
283     * handledCPCount is the number of code points that have been handled
284     * basicLength is the number of basic code points
285     * destLength is the number of chars that have been output
286     */
287
288    /* Initialize the state: */
289    n=INITIAL_N;
290    delta=0;
291    bias=INITIAL_BIAS;
292
293    /* Main encoding loop: */
294    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
295        /*
296         * All non-basic code points < n have been handled already.
297         * Find the next larger one:
298         */
299        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
300            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
301            if(n<=q && q<m) {
302                m=q;
303            }
304        }
305
306        /*
307         * Increase delta enough to advance the decoder's
308         * <n,i> state to <m,0>, but guard against overflow:
309         */
310        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
311            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
312            return 0;
313        }
314        delta+=(m-n)*(handledCPCount+1);
315        n=m;
316
317        /* Encode a sequence of same code points n */
318        for(j=0; j<srcCPCount; ++j) {
319            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
320            if(q<n) {
321                ++delta;
322            } else if(q==n) {
323                /* Represent delta as a generalized variable-length integer: */
324                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
325
326                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
327
328                    t=k-bias;
329                    if(t<TMIN) {
330                        t=TMIN;
331                    } else if(t>TMAX) {
332                        t=TMAX;
333                    }
334                    */
335
336                    t=k-bias;
337                    if(t<TMIN) {
338                        t=TMIN;
339                    } else if(k>=(bias+TMAX)) {
340                        t=TMAX;
341                    }
342
343                    if(q<t) {
344                        break;
345                    }
346
347                    if(destLength<destCapacity) {
348                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
349                    }
350                    ++destLength;
351                    q=(q-t)/(BASE-t);
352                }
353
354                if(destLength<destCapacity) {
355                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
356                }
357                ++destLength;
358                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
359                delta=0;
360                ++handledCPCount;
361            }
362        }
363
364        ++delta;
365        ++n;
366    }
367
368    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
369}
370
371U_CFUNC int32_t
372u_strFromPunycode(const UChar *src, int32_t srcLength,
373                  UChar *dest, int32_t destCapacity,
374                  UBool *caseFlags,
375                  UErrorCode *pErrorCode) {
376    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
377            destCPCount, firstSupplementaryIndex, cpLength;
378    UChar b;
379
380    /* argument checking */
381    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
382        return 0;
383    }
384
385    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
386        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
387        return 0;
388    }
389
390    if(srcLength==-1) {
391        srcLength=u_strlen(src);
392    }
393
394    /*
395     * Handle the basic code points:
396     * Let basicLength be the number of input code points
397     * before the last delimiter, or 0 if there is none,
398     * then copy the first basicLength code points to the output.
399     *
400     * The two following loops iterate backward.
401     */
402    for(j=srcLength; j>0;) {
403        if(src[--j]==DELIMITER) {
404            break;
405        }
406    }
407    destLength=basicLength=destCPCount=j;
408
409    while(j>0) {
410        b=src[--j];
411        if(!IS_BASIC(b)) {
412            *pErrorCode=U_INVALID_CHAR_FOUND;
413            return 0;
414        }
415
416        if(j<destCapacity) {
417            dest[j]=(UChar)b;
418
419            if(caseFlags!=NULL) {
420                caseFlags[j]=IS_BASIC_UPPERCASE(b);
421            }
422        }
423    }
424
425    /* Initialize the state: */
426    n=INITIAL_N;
427    i=0;
428    bias=INITIAL_BIAS;
429    firstSupplementaryIndex=1000000000;
430
431    /*
432     * Main decoding loop:
433     * Start just after the last delimiter if any
434     * basic code points were copied; start at the beginning otherwise.
435     */
436    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
437        /*
438         * in is the index of the next character to be consumed, and
439         * destCPCount is the number of code points in the output array.
440         *
441         * Decode a generalized variable-length integer into delta,
442         * which gets added to i.  The overflow checking is easier
443         * if we increase i as we go, then subtract off its starting
444         * value at the end to obtain delta.
445         */
446        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
447            if(in>=srcLength) {
448                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
449                return 0;
450            }
451
452            digit=basicToDigit[(uint8_t)src[in++]];
453            if(digit<0) {
454                *pErrorCode=U_INVALID_CHAR_FOUND;
455                return 0;
456            }
457            if(digit>(0x7fffffff-i)/w) {
458                /* integer overflow */
459                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
460                return 0;
461            }
462
463            i+=digit*w;
464            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
465            t=k-bias;
466            if(t<TMIN) {
467                t=TMIN;
468            } else if(t>TMAX) {
469                t=TMAX;
470            }
471            */
472            t=k-bias;
473            if(t<TMIN) {
474                t=TMIN;
475            } else if(k>=(bias+TMAX)) {
476                t=TMAX;
477            }
478            if(digit<t) {
479                break;
480            }
481
482            if(w>0x7fffffff/(BASE-t)) {
483                /* integer overflow */
484                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
485                return 0;
486            }
487            w*=BASE-t;
488        }
489
490        /*
491         * Modification from sample code:
492         * Increments destCPCount here,
493         * where needed instead of in for() loop tail.
494         */
495        ++destCPCount;
496        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
497
498        /*
499         * i was supposed to wrap around from (incremented) destCPCount to 0,
500         * incrementing n each time, so we'll fix that now:
501         */
502        if(i/destCPCount>(0x7fffffff-n)) {
503            /* integer overflow */
504            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
505            return 0;
506        }
507
508        n+=i/destCPCount;
509        i%=destCPCount;
510        /* not needed for Punycode: */
511        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
512
513        if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
514            /* Unicode code point overflow */
515            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
516            return 0;
517        }
518
519        /* Insert n at position i of the output: */
520        cpLength=UTF_CHAR_LENGTH(n);
521        if((destLength+cpLength)<=destCapacity) {
522            int32_t codeUnitIndex;
523
524            /*
525             * Handle indexes when supplementary code points are present.
526             *
527             * In almost all cases, there will be only BMP code points before i
528             * and even in the entire string.
529             * This is handled with the same efficiency as with UTF-32.
530             *
531             * Only the rare cases with supplementary code points are handled
532             * more slowly - but not too bad since this is an insertion anyway.
533             */
534            if(i<=firstSupplementaryIndex) {
535                codeUnitIndex=i;
536                if(cpLength>1) {
537                    firstSupplementaryIndex=codeUnitIndex;
538                } else {
539                    ++firstSupplementaryIndex;
540                }
541            } else {
542                codeUnitIndex=firstSupplementaryIndex;
543                UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
544            }
545
546            /* use the UChar index codeUnitIndex instead of the code point index i */
547            if(codeUnitIndex<destLength) {
548                uprv_memmove(dest+codeUnitIndex+cpLength,
549                             dest+codeUnitIndex,
550                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
551                if(caseFlags!=NULL) {
552                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
553                                 caseFlags+codeUnitIndex,
554                                 destLength-codeUnitIndex);
555                }
556            }
557            if(cpLength==1) {
558                /* BMP, insert one code unit */
559                dest[codeUnitIndex]=(UChar)n;
560            } else {
561                /* supplementary character, insert two code units */
562                dest[codeUnitIndex]=UTF16_LEAD(n);
563                dest[codeUnitIndex+1]=UTF16_TRAIL(n);
564            }
565            if(caseFlags!=NULL) {
566                /* Case of last character determines uppercase flag: */
567                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
568                if(cpLength==2) {
569                    caseFlags[codeUnitIndex+1]=FALSE;
570                }
571            }
572        }
573        destLength+=cpLength;
574        ++i;
575    }
576
577    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
578}
579
580/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
581
582#endif /* #if !UCONFIG_NO_IDNA */
583