1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  unistr_cnv.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:2
12*
13*   created on: 2004aug19
14*   created by: Markus W. Scherer
15*
16*   Character conversion functions moved here from unistr.cpp
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_CONVERSION
22
23#include "unicode/putil.h"
24#include "cstring.h"
25#include "cmemory.h"
26#include "unicode/ustring.h"
27#include "unicode/unistr.h"
28#include "unicode/ucnv.h"
29#include "ucnv_imp.h"
30#include "putilimp.h"
31#include "ustr_cnv.h"
32#include "ustr_imp.h"
33
34U_NAMESPACE_BEGIN
35
36//========================================
37// Constructors
38//========================================
39
40#if !U_CHARSET_IS_UTF8
41
42UnicodeString::UnicodeString(const char *codepageData)
43  : fShortLength(0),
44    fFlags(kShortString)
45{
46    if(codepageData != 0) {
47        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48    }
49}
50
51UnicodeString::UnicodeString(const char *codepageData,
52                             int32_t dataLength)
53  : fShortLength(0),
54    fFlags(kShortString)
55{
56    if(codepageData != 0) {
57        doCodepageCreate(codepageData, dataLength, 0);
58    }
59}
60
61// else see unistr.cpp
62#endif
63
64UnicodeString::UnicodeString(const char *codepageData,
65                             const char *codepage)
66  : fShortLength(0),
67    fFlags(kShortString)
68{
69    if(codepageData != 0) {
70        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
71    }
72}
73
74UnicodeString::UnicodeString(const char *codepageData,
75                             int32_t dataLength,
76                             const char *codepage)
77  : fShortLength(0),
78    fFlags(kShortString)
79{
80    if(codepageData != 0) {
81        doCodepageCreate(codepageData, dataLength, codepage);
82    }
83}
84
85UnicodeString::UnicodeString(const char *src, int32_t srcLength,
86                             UConverter *cnv,
87                             UErrorCode &errorCode)
88  : fShortLength(0),
89    fFlags(kShortString)
90{
91    if(U_SUCCESS(errorCode)) {
92        // check arguments
93        if(src==NULL) {
94            // treat as an empty string, do nothing more
95        } else if(srcLength<-1) {
96            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
97        } else {
98            // get input length
99            if(srcLength==-1) {
100                srcLength=(int32_t)uprv_strlen(src);
101            }
102            if(srcLength>0) {
103                if(cnv!=0) {
104                    // use the provided converter
105                    ucnv_resetToUnicode(cnv);
106                    doCodepageCreate(src, srcLength, cnv, errorCode);
107                } else {
108                    // use the default converter
109                    cnv=u_getDefaultConverter(&errorCode);
110                    doCodepageCreate(src, srcLength, cnv, errorCode);
111                    u_releaseDefaultConverter(cnv);
112                }
113            }
114        }
115
116        if(U_FAILURE(errorCode)) {
117            setToBogus();
118        }
119    }
120}
121
122//========================================
123// Codeset conversion
124//========================================
125
126#if !U_CHARSET_IS_UTF8
127
128int32_t
129UnicodeString::extract(int32_t start,
130                       int32_t length,
131                       char *target,
132                       uint32_t dstSize) const {
133    return extract(start, length, target, dstSize, 0);
134}
135
136// else see unistr.cpp
137#endif
138
139int32_t
140UnicodeString::extract(int32_t start,
141                       int32_t length,
142                       char *target,
143                       uint32_t dstSize,
144                       const char *codepage) const
145{
146    // if the arguments are illegal, then do nothing
147    if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
148        return 0;
149    }
150
151    // pin the indices to legal values
152    pinIndices(start, length);
153
154    // We need to cast dstSize to int32_t for all subsequent code.
155    // I don't know why the API was defined with uint32_t but we are stuck with it.
156    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
157    // as a limit in some functions, it may wrap around and yield a pointer
158    // that compares less-than target.
159    int32_t capacity;
160    if(dstSize < 0x7fffffff) {
161        // Assume that the capacity is real and a limit pointer won't wrap around.
162        capacity = (int32_t)dstSize;
163    } else {
164        char *targetLimit = target + 0x7fffffff;
165        if(targetLimit < target) {
166            // Pin the capacity so that a limit pointer does not wrap around.
167            targetLimit = (char *)U_MAX_PTR(target);
168            capacity = (int32_t)(targetLimit - target);
169        } else {
170            // Pin the capacity to the maximum int32_t value.
171            capacity = 0x7fffffff;
172        }
173    }
174
175    // create the converter
176    UConverter *converter;
177    UErrorCode status = U_ZERO_ERROR;
178
179    // just write the NUL if the string length is 0
180    if(length == 0) {
181        return u_terminateChars(target, capacity, 0, &status);
182    }
183
184    // if the codepage is the default, use our cache
185    // if it is an empty string, then use the "invariant character" conversion
186    if (codepage == 0) {
187        const char *defaultName = ucnv_getDefaultName();
188        if(UCNV_FAST_IS_UTF8(defaultName)) {
189            return toUTF8(start, length, target, capacity);
190        }
191        converter = u_getDefaultConverter(&status);
192    } else if (*codepage == 0) {
193        // use the "invariant characters" conversion
194        int32_t destLength;
195        if(length <= capacity) {
196            destLength = length;
197        } else {
198            destLength = capacity;
199        }
200        u_UCharsToChars(getArrayStart() + start, target, destLength);
201        return u_terminateChars(target, capacity, length, &status);
202    } else {
203        converter = ucnv_open(codepage, &status);
204    }
205
206    length = doExtract(start, length, target, capacity, converter, status);
207
208    // close the converter
209    if (codepage == 0) {
210        u_releaseDefaultConverter(converter);
211    } else {
212        ucnv_close(converter);
213    }
214
215    return length;
216}
217
218int32_t
219UnicodeString::extract(char *dest, int32_t destCapacity,
220                       UConverter *cnv,
221                       UErrorCode &errorCode) const
222{
223    if(U_FAILURE(errorCode)) {
224        return 0;
225    }
226
227    if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
228        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
229        return 0;
230    }
231
232    // nothing to do?
233    if(isEmpty()) {
234        return u_terminateChars(dest, destCapacity, 0, &errorCode);
235    }
236
237    // get the converter
238    UBool isDefaultConverter;
239    if(cnv==0) {
240        isDefaultConverter=TRUE;
241        cnv=u_getDefaultConverter(&errorCode);
242        if(U_FAILURE(errorCode)) {
243            return 0;
244        }
245    } else {
246        isDefaultConverter=FALSE;
247        ucnv_resetFromUnicode(cnv);
248    }
249
250    // convert
251    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
252
253    // release the converter
254    if(isDefaultConverter) {
255        u_releaseDefaultConverter(cnv);
256    }
257
258    return len;
259}
260
261int32_t
262UnicodeString::doExtract(int32_t start, int32_t length,
263                         char *dest, int32_t destCapacity,
264                         UConverter *cnv,
265                         UErrorCode &errorCode) const
266{
267    if(U_FAILURE(errorCode)) {
268        if(destCapacity!=0) {
269            *dest=0;
270        }
271        return 0;
272    }
273
274    const UChar *src=getArrayStart()+start, *srcLimit=src+length;
275    char *originalDest=dest;
276    const char *destLimit;
277
278    if(destCapacity==0) {
279        destLimit=dest=0;
280    } else if(destCapacity==-1) {
281        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
282        destLimit=(char*)U_MAX_PTR(dest);
283        // for NUL-termination, translate into highest int32_t
284        destCapacity=0x7fffffff;
285    } else {
286        destLimit=dest+destCapacity;
287    }
288
289    // perform the conversion
290    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
291    length=(int32_t)(dest-originalDest);
292
293    // if an overflow occurs, then get the preflighting length
294    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
295        char buffer[1024];
296
297        destLimit=buffer+sizeof(buffer);
298        do {
299            dest=buffer;
300            errorCode=U_ZERO_ERROR;
301            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
302            length+=(int32_t)(dest-buffer);
303        } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
304    }
305
306    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
307}
308
309void
310UnicodeString::doCodepageCreate(const char *codepageData,
311                                int32_t dataLength,
312                                const char *codepage)
313{
314    // if there's nothing to convert, do nothing
315    if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
316        return;
317    }
318    if(dataLength == -1) {
319        dataLength = (int32_t)uprv_strlen(codepageData);
320    }
321
322    UErrorCode status = U_ZERO_ERROR;
323
324    // create the converter
325    // if the codepage is the default, use our cache
326    // if it is an empty string, then use the "invariant character" conversion
327    UConverter *converter;
328    if (codepage == 0) {
329        const char *defaultName = ucnv_getDefaultName();
330        if(UCNV_FAST_IS_UTF8(defaultName)) {
331            setToUTF8(StringPiece(codepageData, dataLength));
332            return;
333        }
334        converter = u_getDefaultConverter(&status);
335    } else if(*codepage == 0) {
336        // use the "invariant characters" conversion
337        if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
338            u_charsToUChars(codepageData, getArrayStart(), dataLength);
339            setLength(dataLength);
340        } else {
341            setToBogus();
342        }
343        return;
344    } else {
345        converter = ucnv_open(codepage, &status);
346    }
347
348    // if we failed, set the appropriate flags and return
349    if(U_FAILURE(status)) {
350        setToBogus();
351        return;
352    }
353
354    // perform the conversion
355    doCodepageCreate(codepageData, dataLength, converter, status);
356    if(U_FAILURE(status)) {
357        setToBogus();
358    }
359
360    // close the converter
361    if(codepage == 0) {
362        u_releaseDefaultConverter(converter);
363    } else {
364        ucnv_close(converter);
365    }
366}
367
368void
369UnicodeString::doCodepageCreate(const char *codepageData,
370                                int32_t dataLength,
371                                UConverter *converter,
372                                UErrorCode &status)
373{
374    if(U_FAILURE(status)) {
375        return;
376    }
377
378    // set up the conversion parameters
379    const char *mySource     = codepageData;
380    const char *mySourceEnd  = mySource + dataLength;
381    UChar *array, *myTarget;
382
383    // estimate the size needed:
384    int32_t arraySize;
385    if(dataLength <= US_STACKBUF_SIZE) {
386        // try to use the stack buffer
387        arraySize = US_STACKBUF_SIZE;
388    } else {
389        // 1.25 UChar's per source byte should cover most cases
390        arraySize = dataLength + (dataLength >> 2);
391    }
392
393    // we do not care about the current contents
394    UBool doCopyArray = FALSE;
395    for(;;) {
396        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
397            setToBogus();
398            break;
399        }
400
401        // perform the conversion
402        array = getArrayStart();
403        myTarget = array + length();
404        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
405            &mySource, mySourceEnd, 0, TRUE, &status);
406
407        // update the conversion parameters
408        setLength((int32_t)(myTarget - array));
409
410        // allocate more space and copy data, if needed
411        if(status == U_BUFFER_OVERFLOW_ERROR) {
412            // reset the error code
413            status = U_ZERO_ERROR;
414
415            // keep the previous conversion results
416            doCopyArray = TRUE;
417
418            // estimate the new size needed, larger than before
419            // try 2 UChar's per remaining source byte
420            arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
421        } else {
422            break;
423        }
424    }
425}
426
427U_NAMESPACE_END
428
429#endif
430