/*
*******************************************************************************
*
*   Copyright (C) 2005-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucasemap.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2005may06
*   created by: Markus W. Scherer
*
*   Case mapping service object and functions using it.
*/
18
19#include "unicode/utypes.h"
20#include "unicode/brkiter.h"
21#include "unicode/ubrk.h"
22#include "unicode/uloc.h"
23#include "unicode/ustring.h"
24#include "unicode/ucasemap.h"
25#if !UCONFIG_NO_BREAK_ITERATION
26#include "unicode/utext.h"
27#endif
28#include "unicode/utf.h"
29#include "unicode/utf8.h"
30#include "unicode/utf16.h"
31#include "cmemory.h"
32#include "cstring.h"
33#include "ucase.h"
34#include "ustr_imp.h"
35
36U_NAMESPACE_USE
37
38/* UCaseMap service object -------------------------------------------------- */
39
40U_CAPI UCaseMap * U_EXPORT2
41ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
42    UCaseMap *csm;
43
44    if(U_FAILURE(*pErrorCode)) {
45        return NULL;
46    }
47
48    csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
49    if(csm==NULL) {
50        return NULL;
51    }
52    uprv_memset(csm, 0, sizeof(UCaseMap));
53
54    csm->csp=ucase_getSingleton();
55    ucasemap_setLocale(csm, locale, pErrorCode);
56    if(U_FAILURE(*pErrorCode)) {
57        uprv_free(csm);
58        return NULL;
59    }
60
61    csm->options=options;
62    return csm;
63}
64
65U_CAPI void U_EXPORT2
66ucasemap_close(UCaseMap *csm) {
67    if(csm!=NULL) {
68#if !UCONFIG_NO_BREAK_ITERATION
69        // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
70        delete reinterpret_cast<BreakIterator *>(csm->iter);
71#endif
72        uprv_free(csm);
73    }
74}
75
76U_CAPI const char * U_EXPORT2
77ucasemap_getLocale(const UCaseMap *csm) {
78    return csm->locale;
79}
80
81U_CAPI uint32_t U_EXPORT2
82ucasemap_getOptions(const UCaseMap *csm) {
83    return csm->options;
84}
85
86U_CAPI void U_EXPORT2
87ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
88    int32_t length;
89
90    if(U_FAILURE(*pErrorCode)) {
91        return;
92    }
93
94    length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
95    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
96        *pErrorCode=U_ZERO_ERROR;
97        /* we only really need the language code for case mappings */
98        length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
99    }
100    if(length==sizeof(csm->locale)) {
101        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
102    }
103    csm->locCache=0;
104    if(U_SUCCESS(*pErrorCode)) {
105        ucase_getCaseLocale(csm->locale, &csm->locCache);
106    } else {
107        csm->locale[0]=0;
108    }
109}
110
111U_CAPI void U_EXPORT2
112ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) {
113    csm->options=options;
114}
115
116/* UTF-8 string case mappings ----------------------------------------------- */
117
118/* TODO(markus): Move to a new, separate utf8case.c file. */
119
120/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
121static inline int32_t
122appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
123             int32_t result, const UChar *s) {
124    UChar32 c;
125    int32_t length, destLength;
126    UErrorCode errorCode;
127
128    /* decode the result */
129    if(result<0) {
130        /* (not) original code point */
131        c=~result;
132        length=-1;
133    } else if(result<=UCASE_MAX_STRING_LENGTH) {
134        c=U_SENTINEL;
135        length=result;
136    } else {
137        c=result;
138        length=-1;
139    }
140
141    if(destIndex<destCapacity) {
142        /* append the result */
143        if(length<0) {
144            /* code point */
145            UBool isError=FALSE;
146            U8_APPEND(dest, destIndex, destCapacity, c, isError);
147            if(isError) {
148                /* overflow, nothing written */
149                destIndex+=U8_LENGTH(c);
150            }
151        } else {
152            /* string */
153            errorCode=U_ZERO_ERROR;
154            u_strToUTF8(
155                (char *)(dest+destIndex), destCapacity-destIndex, &destLength,
156                s, length,
157                &errorCode);
158            destIndex+=destLength;
159            /* we might have an overflow, but we know the actual length */
160        }
161    } else {
162        /* preflight */
163        if(length<0) {
164            destIndex+=U8_LENGTH(c);
165        } else {
166            errorCode=U_ZERO_ERROR;
167            u_strToUTF8(
168                NULL, 0, &destLength,
169                s, length,
170                &errorCode);
171            destIndex+=destLength;
172        }
173    }
174    return destIndex;
175}
176
177static UChar32 U_CALLCONV
178utf8_caseContextIterator(void *context, int8_t dir) {
179    UCaseContext *csc=(UCaseContext *)context;
180    UChar32 c;
181
182    if(dir<0) {
183        /* reset for backward iteration */
184        csc->index=csc->cpStart;
185        csc->dir=dir;
186    } else if(dir>0) {
187        /* reset for forward iteration */
188        csc->index=csc->cpLimit;
189        csc->dir=dir;
190    } else {
191        /* continue current iteration direction */
192        dir=csc->dir;
193    }
194
195    if(dir<0) {
196        if(csc->start<csc->index) {
197            U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
198            return c;
199        }
200    } else {
201        if(csc->index<csc->limit) {
202            U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
203            return c;
204        }
205    }
206    return U_SENTINEL;
207}
208
209/*
210 * Case-maps [srcStart..srcLimit[ but takes
211 * context [0..srcLength[ into account.
212 */
213static int32_t
214_caseMap(const UCaseMap *csm, UCaseMapFull *map,
215         uint8_t *dest, int32_t destCapacity,
216         const uint8_t *src, UCaseContext *csc,
217         int32_t srcStart, int32_t srcLimit,
218         UErrorCode *pErrorCode) {
219    const UChar *s;
220    UChar32 c, c2 = 0;
221    int32_t srcIndex, destIndex;
222    int32_t locCache;
223
224    locCache=csm->locCache;
225
226    /* case mapping loop */
227    srcIndex=srcStart;
228    destIndex=0;
229    while(srcIndex<srcLimit) {
230        csc->cpStart=srcIndex;
231        U8_NEXT(src, srcIndex, srcLimit, c);
232        csc->cpLimit=srcIndex;
233        if(c<0) {
234            int32_t i=csc->cpStart;
235            while(destIndex<destCapacity && i<srcIndex) {
236                dest[destIndex++]=src[i++];
237            }
238            continue;
239        }
240        c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
241        if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
242            /* fast path version of appendResult() for ASCII results */
243            dest[destIndex++]=(uint8_t)c2;
244        } else {
245            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
246        }
247    }
248
249    if(destIndex>destCapacity) {
250        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
251    }
252    return destIndex;
253}
254
255#if !UCONFIG_NO_BREAK_ITERATION
256
257U_CFUNC int32_t U_CALLCONV
258ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
259         uint8_t *dest, int32_t destCapacity,
260         const uint8_t *src, int32_t srcLength,
261         UErrorCode *pErrorCode) {
262    const UChar *s;
263    UChar32 c;
264    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
265    UBool isFirstIndex;
266
267    if(U_FAILURE(*pErrorCode)) {
268        return 0;
269    }
270
271    // Use the C++ abstract base class to minimize dependencies.
272    // TODO: Change UCaseMap.iter to store a BreakIterator directly.
273    BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
274
275    /* set up local variables */
276    int32_t locCache=csm->locCache;
277    UCaseContext csc=UCASECONTEXT_INITIALIZER;
278    csc.p=(void *)src;
279    csc.limit=srcLength;
280    destIndex=0;
281    prev=0;
282    isFirstIndex=TRUE;
283
284    /* titlecasing loop */
285    while(prev<srcLength) {
286        /* find next index where to titlecase */
287        if(isFirstIndex) {
288            isFirstIndex=FALSE;
289            idx=bi->first();
290        } else {
291            idx=bi->next();
292        }
293        if(idx==UBRK_DONE || idx>srcLength) {
294            idx=srcLength;
295        }
296
297        /*
298         * Unicode 4 & 5 section 3.13 Default Case Operations:
299         *
300         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
301         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
302         * cased character F. If F exists, map F to default_title(F); then map each
303         * subsequent character C to default_lower(C).
304         *
305         * In this implementation, segment [prev..index[ into 3 parts:
306         * a) uncased characters (copy as-is) [prev..titleStart[
307         * b) first case letter (titlecase)         [titleStart..titleLimit[
308         * c) subsequent characters (lowercase)                 [titleLimit..index[
309         */
310        if(prev<idx) {
311            /* find and copy uncased characters [prev..titleStart[ */
312            titleStart=titleLimit=prev;
313            U8_NEXT(src, titleLimit, idx, c);
314            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
315                /* Adjust the titlecasing index (titleStart) to the next cased character. */
316                for(;;) {
317                    titleStart=titleLimit;
318                    if(titleLimit==idx) {
319                        /*
320                         * only uncased characters in [prev..index[
321                         * stop with titleStart==titleLimit==index
322                         */
323                        break;
324                    }
325                    U8_NEXT(src, titleLimit, idx, c);
326                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
327                        break; /* cased letter at [titleStart..titleLimit[ */
328                    }
329                }
330                length=titleStart-prev;
331                if(length>0) {
332                    if((destIndex+length)<=destCapacity) {
333                        uprv_memcpy(dest+destIndex, src+prev, length);
334                    }
335                    destIndex+=length;
336                }
337            }
338
339            if(titleStart<titleLimit) {
340                /* titlecase c which is from [titleStart..titleLimit[ */
341                csc.cpStart=titleStart;
342                csc.cpLimit=titleLimit;
343                c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
344                destIndex=appendResult(dest, destIndex, destCapacity, c, s);
345
346                /* Special case Dutch IJ titlecasing */
347                if ( titleStart+1 < idx &&
348                     ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH &&
349                     ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
350                     ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) {
351                            c=0x004A;
352                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
353                            titleLimit++;
354                }
355                /* lowercase [titleLimit..index[ */
356                if(titleLimit<idx) {
357                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
358                        /* Normal operation: Lowercase the rest of the word. */
359                        destIndex+=
360                            _caseMap(
361                                csm, ucase_toFullLower,
362                                dest+destIndex, destCapacity-destIndex,
363                                src, &csc,
364                                titleLimit, idx,
365                                pErrorCode);
366                    } else {
367                        /* Optionally just copy the rest of the word unchanged. */
368                        length=idx-titleLimit;
369                        if((destIndex+length)<=destCapacity) {
370                            uprv_memcpy(dest+destIndex, src+titleLimit, length);
371                        }
372                        destIndex+=length;
373                    }
374                }
375            }
376        }
377
378        prev=idx;
379    }
380
381    if(destIndex>destCapacity) {
382        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
383    }
384    return destIndex;
385}
386
387#endif
388
389static int32_t U_CALLCONV
390ucasemap_internalUTF8ToLower(const UCaseMap *csm,
391                             uint8_t *dest, int32_t destCapacity,
392                             const uint8_t *src, int32_t srcLength,
393                             UErrorCode *pErrorCode) {
394    UCaseContext csc=UCASECONTEXT_INITIALIZER;
395    csc.p=(void *)src;
396    csc.limit=srcLength;
397    return _caseMap(
398        csm, ucase_toFullLower,
399        dest, destCapacity,
400        src, &csc, 0, srcLength,
401        pErrorCode);
402}
403
404static int32_t U_CALLCONV
405ucasemap_internalUTF8ToUpper(const UCaseMap *csm,
406                             uint8_t *dest, int32_t destCapacity,
407                             const uint8_t *src, int32_t srcLength,
408                             UErrorCode *pErrorCode) {
409    UCaseContext csc=UCASECONTEXT_INITIALIZER;
410    csc.p=(void *)src;
411    csc.limit=srcLength;
412    return _caseMap(
413        csm, ucase_toFullUpper,
414        dest, destCapacity,
415        src, &csc, 0, srcLength,
416        pErrorCode);
417}
418
419static int32_t
420utf8_foldCase(const UCaseProps *csp,
421              uint8_t *dest, int32_t destCapacity,
422              const uint8_t *src, int32_t srcLength,
423              uint32_t options,
424              UErrorCode *pErrorCode) {
425    int32_t srcIndex, destIndex;
426
427    const UChar *s;
428    UChar32 c, c2;
429    int32_t start;
430
431    /* case mapping loop */
432    srcIndex=destIndex=0;
433    while(srcIndex<srcLength) {
434        start=srcIndex;
435        U8_NEXT(src, srcIndex, srcLength, c);
436        if(c<0) {
437            while(destIndex<destCapacity && start<srcIndex) {
438                dest[destIndex++]=src[start++];
439            }
440            continue;
441        }
442        c=ucase_toFullFolding(csp, c, &s, options);
443        if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
444            /* fast path version of appendResult() for ASCII results */
445            dest[destIndex++]=(uint8_t)c2;
446        } else {
447            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
448        }
449    }
450
451    if(destIndex>destCapacity) {
452        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
453    }
454    return destIndex;
455}
456
457static int32_t U_CALLCONV
458ucasemap_internalUTF8Fold(const UCaseMap *csm,
459                          uint8_t *dest, int32_t destCapacity,
460                          const uint8_t *src, int32_t srcLength,
461                          UErrorCode *pErrorCode) {
462    return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
463}
464
465U_CFUNC int32_t
466ucasemap_mapUTF8(const UCaseMap *csm,
467                 uint8_t *dest, int32_t destCapacity,
468                 const uint8_t *src, int32_t srcLength,
469                 UTF8CaseMapper *stringCaseMapper,
470                 UErrorCode *pErrorCode) {
471    int32_t destLength;
472
473    /* check argument values */
474    if(U_FAILURE(*pErrorCode)) {
475        return 0;
476    }
477    if( destCapacity<0 ||
478        (dest==NULL && destCapacity>0) ||
479        src==NULL ||
480        srcLength<-1
481    ) {
482        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
483        return 0;
484    }
485
486    /* get the string length */
487    if(srcLength==-1) {
488        srcLength=(int32_t)uprv_strlen((const char *)src);
489    }
490
491    /* check for overlapping source and destination */
492    if( dest!=NULL &&
493        ((src>=dest && src<(dest+destCapacity)) ||
494         (dest>=src && dest<(src+srcLength)))
495    ) {
496        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
497        return 0;
498    }
499
500    destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode);
501    return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
502}
503
504/* public API functions */
505
506U_CAPI int32_t U_EXPORT2
507ucasemap_utf8ToLower(const UCaseMap *csm,
508                     char *dest, int32_t destCapacity,
509                     const char *src, int32_t srcLength,
510                     UErrorCode *pErrorCode) {
511    return ucasemap_mapUTF8(csm,
512                   (uint8_t *)dest, destCapacity,
513                   (const uint8_t *)src, srcLength,
514                   ucasemap_internalUTF8ToLower, pErrorCode);
515}
516
517U_CAPI int32_t U_EXPORT2
518ucasemap_utf8ToUpper(const UCaseMap *csm,
519                     char *dest, int32_t destCapacity,
520                     const char *src, int32_t srcLength,
521                     UErrorCode *pErrorCode) {
522    return ucasemap_mapUTF8(csm,
523                   (uint8_t *)dest, destCapacity,
524                   (const uint8_t *)src, srcLength,
525                   ucasemap_internalUTF8ToUpper, pErrorCode);
526}
527
528U_CAPI int32_t U_EXPORT2
529ucasemap_utf8FoldCase(const UCaseMap *csm,
530                      char *dest, int32_t destCapacity,
531                      const char *src, int32_t srcLength,
532                      UErrorCode *pErrorCode) {
533    return ucasemap_mapUTF8(csm,
534                   (uint8_t *)dest, destCapacity,
535                   (const uint8_t *)src, srcLength,
536                   ucasemap_internalUTF8Fold, pErrorCode);
537}
538