1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2014, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  unistr_cnv.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:2
12*
13*   created on: 2004aug19
14*   created by: Markus W. Scherer
15*
16*   Character conversion functions moved here from unistr.cpp
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_CONVERSION
22
23#include "unicode/putil.h"
24#include "cstring.h"
25#include "cmemory.h"
26#include "unicode/ustring.h"
27#include "unicode/unistr.h"
28#include "unicode/ucnv.h"
29#include "ucnv_imp.h"
30#include "putilimp.h"
31#include "ustr_cnv.h"
32#include "ustr_imp.h"
33
34U_NAMESPACE_BEGIN
35
36//========================================
37// Constructors
38//========================================
39
40#if !U_CHARSET_IS_UTF8
41
42UnicodeString::UnicodeString(const char *codepageData) {
43    fUnion.fFields.fLengthAndFlags = kShortString;
44    if(codepageData != 0) {
45        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
46    }
47}
48
49UnicodeString::UnicodeString(const char *codepageData,
50                             int32_t dataLength) {
51    fUnion.fFields.fLengthAndFlags = kShortString;
52    if(codepageData != 0) {
53        doCodepageCreate(codepageData, dataLength, 0);
54    }
55}
56
57// else see unistr.cpp
58#endif
59
60UnicodeString::UnicodeString(const char *codepageData,
61                             const char *codepage) {
62    fUnion.fFields.fLengthAndFlags = kShortString;
63    if(codepageData != 0) {
64        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
65    }
66}
67
68UnicodeString::UnicodeString(const char *codepageData,
69                             int32_t dataLength,
70                             const char *codepage) {
71    fUnion.fFields.fLengthAndFlags = kShortString;
72    if(codepageData != 0) {
73        doCodepageCreate(codepageData, dataLength, codepage);
74    }
75}
76
77UnicodeString::UnicodeString(const char *src, int32_t srcLength,
78                             UConverter *cnv,
79                             UErrorCode &errorCode) {
80    fUnion.fFields.fLengthAndFlags = kShortString;
81    if(U_SUCCESS(errorCode)) {
82        // check arguments
83        if(src==NULL) {
84            // treat as an empty string, do nothing more
85        } else if(srcLength<-1) {
86            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
87        } else {
88            // get input length
89            if(srcLength==-1) {
90                srcLength=(int32_t)uprv_strlen(src);
91            }
92            if(srcLength>0) {
93                if(cnv!=0) {
94                    // use the provided converter
95                    ucnv_resetToUnicode(cnv);
96                    doCodepageCreate(src, srcLength, cnv, errorCode);
97                } else {
98                    // use the default converter
99                    cnv=u_getDefaultConverter(&errorCode);
100                    doCodepageCreate(src, srcLength, cnv, errorCode);
101                    u_releaseDefaultConverter(cnv);
102                }
103            }
104        }
105
106        if(U_FAILURE(errorCode)) {
107            setToBogus();
108        }
109    }
110}
111
112//========================================
113// Codeset conversion
114//========================================
115
116#if !U_CHARSET_IS_UTF8
117
118int32_t
119UnicodeString::extract(int32_t start,
120                       int32_t length,
121                       char *target,
122                       uint32_t dstSize) const {
123    return extract(start, length, target, dstSize, 0);
124}
125
126// else see unistr.cpp
127#endif
128
129int32_t
130UnicodeString::extract(int32_t start,
131                       int32_t length,
132                       char *target,
133                       uint32_t dstSize,
134                       const char *codepage) const
135{
136    // if the arguments are illegal, then do nothing
137    if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
138        return 0;
139    }
140
141    // pin the indices to legal values
142    pinIndices(start, length);
143
144    // We need to cast dstSize to int32_t for all subsequent code.
145    // I don't know why the API was defined with uint32_t but we are stuck with it.
146    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
147    // as a limit in some functions, it may wrap around and yield a pointer
148    // that compares less-than target.
149    int32_t capacity;
150    if(dstSize < 0x7fffffff) {
151        // Assume that the capacity is real and a limit pointer won't wrap around.
152        capacity = (int32_t)dstSize;
153    } else {
154        // Pin the capacity so that a limit pointer does not wrap around.
155        char *targetLimit = (char *)U_MAX_PTR(target);
156        // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
157        // greater than target and does not wrap around the top of the address space.
158        capacity = (int32_t)(targetLimit - target);
159    }
160
161    // create the converter
162    UConverter *converter;
163    UErrorCode status = U_ZERO_ERROR;
164
165    // just write the NUL if the string length is 0
166    if(length == 0) {
167        return u_terminateChars(target, capacity, 0, &status);
168    }
169
170    // if the codepage is the default, use our cache
171    // if it is an empty string, then use the "invariant character" conversion
172    if (codepage == 0) {
173        const char *defaultName = ucnv_getDefaultName();
174        if(UCNV_FAST_IS_UTF8(defaultName)) {
175            return toUTF8(start, length, target, capacity);
176        }
177        converter = u_getDefaultConverter(&status);
178    } else if (*codepage == 0) {
179        // use the "invariant characters" conversion
180        int32_t destLength;
181        if(length <= capacity) {
182            destLength = length;
183        } else {
184            destLength = capacity;
185        }
186        u_UCharsToChars(getArrayStart() + start, target, destLength);
187        return u_terminateChars(target, capacity, length, &status);
188    } else {
189        converter = ucnv_open(codepage, &status);
190    }
191
192    length = doExtract(start, length, target, capacity, converter, status);
193
194    // close the converter
195    if (codepage == 0) {
196        u_releaseDefaultConverter(converter);
197    } else {
198        ucnv_close(converter);
199    }
200
201    return length;
202}
203
204int32_t
205UnicodeString::extract(char *dest, int32_t destCapacity,
206                       UConverter *cnv,
207                       UErrorCode &errorCode) const
208{
209    if(U_FAILURE(errorCode)) {
210        return 0;
211    }
212
213    if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
214        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
215        return 0;
216    }
217
218    // nothing to do?
219    if(isEmpty()) {
220        return u_terminateChars(dest, destCapacity, 0, &errorCode);
221    }
222
223    // get the converter
224    UBool isDefaultConverter;
225    if(cnv==0) {
226        isDefaultConverter=TRUE;
227        cnv=u_getDefaultConverter(&errorCode);
228        if(U_FAILURE(errorCode)) {
229            return 0;
230        }
231    } else {
232        isDefaultConverter=FALSE;
233        ucnv_resetFromUnicode(cnv);
234    }
235
236    // convert
237    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
238
239    // release the converter
240    if(isDefaultConverter) {
241        u_releaseDefaultConverter(cnv);
242    }
243
244    return len;
245}
246
247int32_t
248UnicodeString::doExtract(int32_t start, int32_t length,
249                         char *dest, int32_t destCapacity,
250                         UConverter *cnv,
251                         UErrorCode &errorCode) const
252{
253    if(U_FAILURE(errorCode)) {
254        if(destCapacity!=0) {
255            *dest=0;
256        }
257        return 0;
258    }
259
260    const UChar *src=getArrayStart()+start, *srcLimit=src+length;
261    char *originalDest=dest;
262    const char *destLimit;
263
264    if(destCapacity==0) {
265        destLimit=dest=0;
266    } else if(destCapacity==-1) {
267        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
268        destLimit=(char*)U_MAX_PTR(dest);
269        // for NUL-termination, translate into highest int32_t
270        destCapacity=0x7fffffff;
271    } else {
272        destLimit=dest+destCapacity;
273    }
274
275    // perform the conversion
276    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
277    length=(int32_t)(dest-originalDest);
278
279    // if an overflow occurs, then get the preflighting length
280    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
281        char buffer[1024];
282
283        destLimit=buffer+sizeof(buffer);
284        do {
285            dest=buffer;
286            errorCode=U_ZERO_ERROR;
287            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
288            length+=(int32_t)(dest-buffer);
289        } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
290    }
291
292    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
293}
294
295void
296UnicodeString::doCodepageCreate(const char *codepageData,
297                                int32_t dataLength,
298                                const char *codepage)
299{
300    // if there's nothing to convert, do nothing
301    if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
302        return;
303    }
304    if(dataLength == -1) {
305        dataLength = (int32_t)uprv_strlen(codepageData);
306    }
307
308    UErrorCode status = U_ZERO_ERROR;
309
310    // create the converter
311    // if the codepage is the default, use our cache
312    // if it is an empty string, then use the "invariant character" conversion
313    UConverter *converter;
314    if (codepage == 0) {
315        const char *defaultName = ucnv_getDefaultName();
316        if(UCNV_FAST_IS_UTF8(defaultName)) {
317            setToUTF8(StringPiece(codepageData, dataLength));
318            return;
319        }
320        converter = u_getDefaultConverter(&status);
321    } else if(*codepage == 0) {
322        // use the "invariant characters" conversion
323        if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
324            u_charsToUChars(codepageData, getArrayStart(), dataLength);
325            setLength(dataLength);
326        } else {
327            setToBogus();
328        }
329        return;
330    } else {
331        converter = ucnv_open(codepage, &status);
332    }
333
334    // if we failed, set the appropriate flags and return
335    if(U_FAILURE(status)) {
336        setToBogus();
337        return;
338    }
339
340    // perform the conversion
341    doCodepageCreate(codepageData, dataLength, converter, status);
342    if(U_FAILURE(status)) {
343        setToBogus();
344    }
345
346    // close the converter
347    if(codepage == 0) {
348        u_releaseDefaultConverter(converter);
349    } else {
350        ucnv_close(converter);
351    }
352}
353
354void
355UnicodeString::doCodepageCreate(const char *codepageData,
356                                int32_t dataLength,
357                                UConverter *converter,
358                                UErrorCode &status)
359{
360    if(U_FAILURE(status)) {
361        return;
362    }
363
364    // set up the conversion parameters
365    const char *mySource     = codepageData;
366    const char *mySourceEnd  = mySource + dataLength;
367    UChar *array, *myTarget;
368
369    // estimate the size needed:
370    int32_t arraySize;
371    if(dataLength <= US_STACKBUF_SIZE) {
372        // try to use the stack buffer
373        arraySize = US_STACKBUF_SIZE;
374    } else {
375        // 1.25 UChar's per source byte should cover most cases
376        arraySize = dataLength + (dataLength >> 2);
377    }
378
379    // we do not care about the current contents
380    UBool doCopyArray = FALSE;
381    for(;;) {
382        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
383            setToBogus();
384            break;
385        }
386
387        // perform the conversion
388        array = getArrayStart();
389        myTarget = array + length();
390        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
391            &mySource, mySourceEnd, 0, TRUE, &status);
392
393        // update the conversion parameters
394        setLength((int32_t)(myTarget - array));
395
396        // allocate more space and copy data, if needed
397        if(status == U_BUFFER_OVERFLOW_ERROR) {
398            // reset the error code
399            status = U_ZERO_ERROR;
400
401            // keep the previous conversion results
402            doCopyArray = TRUE;
403
404            // estimate the new size needed, larger than before
405            // try 2 UChar's per remaining source byte
406            arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
407        } else {
408            break;
409        }
410    }
411}
412
413U_NAMESPACE_END
414
415#endif
416