1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 1999-2014, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  unistr_cnv.cpp
11*   encoding:   UTF-8
12*   tab size:   8 (not used)
13*   indentation:2
14*
15*   created on: 2004aug19
16*   created by: Markus W. Scherer
17*
18*   Character conversion functions moved here from unistr.cpp
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_CONVERSION
24
25#include "unicode/putil.h"
26#include "cstring.h"
27#include "cmemory.h"
28#include "unicode/ustring.h"
29#include "unicode/unistr.h"
30#include "unicode/ucnv.h"
31#include "ucnv_imp.h"
32#include "putilimp.h"
33#include "ustr_cnv.h"
34#include "ustr_imp.h"
35
36U_NAMESPACE_BEGIN
37
38//========================================
39// Constructors
40//========================================
41
42#if !U_CHARSET_IS_UTF8
43
44UnicodeString::UnicodeString(const char *codepageData) {
45    fUnion.fFields.fLengthAndFlags = kShortString;
46    if(codepageData != 0) {
47        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48    }
49}
50
51UnicodeString::UnicodeString(const char *codepageData,
52                             int32_t dataLength) {
53    fUnion.fFields.fLengthAndFlags = kShortString;
54    if(codepageData != 0) {
55        doCodepageCreate(codepageData, dataLength, 0);
56    }
57}
58
59// else see unistr.cpp
60#endif
61
62UnicodeString::UnicodeString(const char *codepageData,
63                             const char *codepage) {
64    fUnion.fFields.fLengthAndFlags = kShortString;
65    if(codepageData != 0) {
66        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
67    }
68}
69
70UnicodeString::UnicodeString(const char *codepageData,
71                             int32_t dataLength,
72                             const char *codepage) {
73    fUnion.fFields.fLengthAndFlags = kShortString;
74    if(codepageData != 0) {
75        doCodepageCreate(codepageData, dataLength, codepage);
76    }
77}
78
79UnicodeString::UnicodeString(const char *src, int32_t srcLength,
80                             UConverter *cnv,
81                             UErrorCode &errorCode) {
82    fUnion.fFields.fLengthAndFlags = kShortString;
83    if(U_SUCCESS(errorCode)) {
84        // check arguments
85        if(src==NULL) {
86            // treat as an empty string, do nothing more
87        } else if(srcLength<-1) {
88            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
89        } else {
90            // get input length
91            if(srcLength==-1) {
92                srcLength=(int32_t)uprv_strlen(src);
93            }
94            if(srcLength>0) {
95                if(cnv!=0) {
96                    // use the provided converter
97                    ucnv_resetToUnicode(cnv);
98                    doCodepageCreate(src, srcLength, cnv, errorCode);
99                } else {
100                    // use the default converter
101                    cnv=u_getDefaultConverter(&errorCode);
102                    doCodepageCreate(src, srcLength, cnv, errorCode);
103                    u_releaseDefaultConverter(cnv);
104                }
105            }
106        }
107
108        if(U_FAILURE(errorCode)) {
109            setToBogus();
110        }
111    }
112}
113
114//========================================
115// Codeset conversion
116//========================================
117
118#if !U_CHARSET_IS_UTF8
119
120int32_t
121UnicodeString::extract(int32_t start,
122                       int32_t length,
123                       char *target,
124                       uint32_t dstSize) const {
125    return extract(start, length, target, dstSize, 0);
126}
127
128// else see unistr.cpp
129#endif
130
131int32_t
132UnicodeString::extract(int32_t start,
133                       int32_t length,
134                       char *target,
135                       uint32_t dstSize,
136                       const char *codepage) const
137{
138    // if the arguments are illegal, then do nothing
139    if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
140        return 0;
141    }
142
143    // pin the indices to legal values
144    pinIndices(start, length);
145
146    // We need to cast dstSize to int32_t for all subsequent code.
147    // I don't know why the API was defined with uint32_t but we are stuck with it.
148    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
149    // as a limit in some functions, it may wrap around and yield a pointer
150    // that compares less-than target.
151    int32_t capacity;
152    if(dstSize < 0x7fffffff) {
153        // Assume that the capacity is real and a limit pointer won't wrap around.
154        capacity = (int32_t)dstSize;
155    } else {
156        // Pin the capacity so that a limit pointer does not wrap around.
157        char *targetLimit = (char *)U_MAX_PTR(target);
158        // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
159        // greater than target and does not wrap around the top of the address space.
160        capacity = (int32_t)(targetLimit - target);
161    }
162
163    // create the converter
164    UConverter *converter;
165    UErrorCode status = U_ZERO_ERROR;
166
167    // just write the NUL if the string length is 0
168    if(length == 0) {
169        return u_terminateChars(target, capacity, 0, &status);
170    }
171
172    // if the codepage is the default, use our cache
173    // if it is an empty string, then use the "invariant character" conversion
174    if (codepage == 0) {
175        const char *defaultName = ucnv_getDefaultName();
176        if(UCNV_FAST_IS_UTF8(defaultName)) {
177            return toUTF8(start, length, target, capacity);
178        }
179        converter = u_getDefaultConverter(&status);
180    } else if (*codepage == 0) {
181        // use the "invariant characters" conversion
182        int32_t destLength;
183        if(length <= capacity) {
184            destLength = length;
185        } else {
186            destLength = capacity;
187        }
188        u_UCharsToChars(getArrayStart() + start, target, destLength);
189        return u_terminateChars(target, capacity, length, &status);
190    } else {
191        converter = ucnv_open(codepage, &status);
192    }
193
194    length = doExtract(start, length, target, capacity, converter, status);
195
196    // close the converter
197    if (codepage == 0) {
198        u_releaseDefaultConverter(converter);
199    } else {
200        ucnv_close(converter);
201    }
202
203    return length;
204}
205
206int32_t
207UnicodeString::extract(char *dest, int32_t destCapacity,
208                       UConverter *cnv,
209                       UErrorCode &errorCode) const
210{
211    if(U_FAILURE(errorCode)) {
212        return 0;
213    }
214
215    if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
216        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
217        return 0;
218    }
219
220    // nothing to do?
221    if(isEmpty()) {
222        return u_terminateChars(dest, destCapacity, 0, &errorCode);
223    }
224
225    // get the converter
226    UBool isDefaultConverter;
227    if(cnv==0) {
228        isDefaultConverter=TRUE;
229        cnv=u_getDefaultConverter(&errorCode);
230        if(U_FAILURE(errorCode)) {
231            return 0;
232        }
233    } else {
234        isDefaultConverter=FALSE;
235        ucnv_resetFromUnicode(cnv);
236    }
237
238    // convert
239    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
240
241    // release the converter
242    if(isDefaultConverter) {
243        u_releaseDefaultConverter(cnv);
244    }
245
246    return len;
247}
248
249int32_t
250UnicodeString::doExtract(int32_t start, int32_t length,
251                         char *dest, int32_t destCapacity,
252                         UConverter *cnv,
253                         UErrorCode &errorCode) const
254{
255    if(U_FAILURE(errorCode)) {
256        if(destCapacity!=0) {
257            *dest=0;
258        }
259        return 0;
260    }
261
262    const UChar *src=getArrayStart()+start, *srcLimit=src+length;
263    char *originalDest=dest;
264    const char *destLimit;
265
266    if(destCapacity==0) {
267        destLimit=dest=0;
268    } else if(destCapacity==-1) {
269        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
270        destLimit=(char*)U_MAX_PTR(dest);
271        // for NUL-termination, translate into highest int32_t
272        destCapacity=0x7fffffff;
273    } else {
274        destLimit=dest+destCapacity;
275    }
276
277    // perform the conversion
278    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
279    length=(int32_t)(dest-originalDest);
280
281    // if an overflow occurs, then get the preflighting length
282    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
283        char buffer[1024];
284
285        destLimit=buffer+sizeof(buffer);
286        do {
287            dest=buffer;
288            errorCode=U_ZERO_ERROR;
289            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
290            length+=(int32_t)(dest-buffer);
291        } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
292    }
293
294    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
295}
296
297void
298UnicodeString::doCodepageCreate(const char *codepageData,
299                                int32_t dataLength,
300                                const char *codepage)
301{
302    // if there's nothing to convert, do nothing
303    if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
304        return;
305    }
306    if(dataLength == -1) {
307        dataLength = (int32_t)uprv_strlen(codepageData);
308    }
309
310    UErrorCode status = U_ZERO_ERROR;
311
312    // create the converter
313    // if the codepage is the default, use our cache
314    // if it is an empty string, then use the "invariant character" conversion
315    UConverter *converter;
316    if (codepage == 0) {
317        const char *defaultName = ucnv_getDefaultName();
318        if(UCNV_FAST_IS_UTF8(defaultName)) {
319            setToUTF8(StringPiece(codepageData, dataLength));
320            return;
321        }
322        converter = u_getDefaultConverter(&status);
323    } else if(*codepage == 0) {
324        // use the "invariant characters" conversion
325        if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
326            u_charsToUChars(codepageData, getArrayStart(), dataLength);
327            setLength(dataLength);
328        } else {
329            setToBogus();
330        }
331        return;
332    } else {
333        converter = ucnv_open(codepage, &status);
334    }
335
336    // if we failed, set the appropriate flags and return
337    if(U_FAILURE(status)) {
338        setToBogus();
339        return;
340    }
341
342    // perform the conversion
343    doCodepageCreate(codepageData, dataLength, converter, status);
344    if(U_FAILURE(status)) {
345        setToBogus();
346    }
347
348    // close the converter
349    if(codepage == 0) {
350        u_releaseDefaultConverter(converter);
351    } else {
352        ucnv_close(converter);
353    }
354}
355
356void
357UnicodeString::doCodepageCreate(const char *codepageData,
358                                int32_t dataLength,
359                                UConverter *converter,
360                                UErrorCode &status)
361{
362    if(U_FAILURE(status)) {
363        return;
364    }
365
366    // set up the conversion parameters
367    const char *mySource     = codepageData;
368    const char *mySourceEnd  = mySource + dataLength;
369    UChar *array, *myTarget;
370
371    // estimate the size needed:
372    int32_t arraySize;
373    if(dataLength <= US_STACKBUF_SIZE) {
374        // try to use the stack buffer
375        arraySize = US_STACKBUF_SIZE;
376    } else {
377        // 1.25 UChar's per source byte should cover most cases
378        arraySize = dataLength + (dataLength >> 2);
379    }
380
381    // we do not care about the current contents
382    UBool doCopyArray = FALSE;
383    for(;;) {
384        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
385            setToBogus();
386            break;
387        }
388
389        // perform the conversion
390        array = getArrayStart();
391        myTarget = array + length();
392        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
393            &mySource, mySourceEnd, 0, TRUE, &status);
394
395        // update the conversion parameters
396        setLength((int32_t)(myTarget - array));
397
398        // allocate more space and copy data, if needed
399        if(status == U_BUFFER_OVERFLOW_ERROR) {
400            // reset the error code
401            status = U_ZERO_ERROR;
402
403            // keep the previous conversion results
404            doCopyArray = TRUE;
405
406            // estimate the new size needed, larger than before
407            // try 2 UChar's per remaining source byte
408            arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
409        } else {
410            break;
411        }
412    }
413}
414
415U_NAMESPACE_END
416
417#endif
418