1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*
2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*******************************************************************************
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
427f654740f2a26ad62a5c155af9199af9e69b889claireho*   Copyright (C) 1999-2010, International Business Machines
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Corporation and others.  All Rights Reserved.
6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*******************************************************************************
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   file name:  unistr_cnv.cpp
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   encoding:   US-ASCII
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   tab size:   8 (not used)
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   indentation:2
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created on: 2004aug19
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   created by: Markus W. Scherer
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*   Character conversion functions moved here from unistr.cpp
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_CONVERSION
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/putil.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ustring.h"
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h"
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ucnv.h"
2985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#include "ucnv_imp.h"
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "putilimp.h"
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ustr_cnv.h"
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "ustr_imp.h"
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//========================================
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Constructors
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//========================================
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
4085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#if !U_CHARSET_IS_UTF8
4185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
4285bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoUnicodeString::UnicodeString(const char *codepageData)
4385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho  : fShortLength(0),
4485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    fFlags(kShortString)
4585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho{
4685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if(codepageData != 0) {
4785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
4885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
4985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho}
5085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
5185bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoUnicodeString::UnicodeString(const char *codepageData,
5285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                             int32_t dataLength)
5385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho  : fShortLength(0),
5485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    fFlags(kShortString)
5585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho{
5685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if(codepageData != 0) {
5785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        doCodepageCreate(codepageData, dataLength, 0);
5885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
5985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho}
6085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
6185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho// else see unistr.cpp
6285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#endif
6385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::UnicodeString(const char *codepageData,
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                             const char *codepage)
6685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho  : fShortLength(0),
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFlags(kShortString)
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(codepageData != 0) {
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::UnicodeString(const char *codepageData,
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                             int32_t dataLength,
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                             const char *codepage)
7785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho  : fShortLength(0),
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFlags(kShortString)
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(codepageData != 0) {
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        doCodepageCreate(codepageData, dataLength, codepage);
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::UnicodeString(const char *src, int32_t srcLength,
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                             UConverter *cnv,
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                             UErrorCode &errorCode)
8885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho  : fShortLength(0),
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fFlags(kShortString)
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_SUCCESS(errorCode)) {
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // check arguments
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(src==NULL) {
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // treat as an empty string, do nothing more
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else if(srcLength<-1) {
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // get input length
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(srcLength==-1) {
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                srcLength=(int32_t)uprv_strlen(src);
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            if(srcLength>0) {
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                if(cnv!=0) {
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // use the provided converter
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    ucnv_resetToUnicode(cnv);
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    doCodepageCreate(src, srcLength, cnv, errorCode);
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                } else {
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    // use the default converter
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    cnv=u_getDefaultConverter(&errorCode);
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    doCodepageCreate(src, srcLength, cnv, errorCode);
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                    u_releaseDefaultConverter(cnv);
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                }
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            }
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            setToBogus();
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//========================================
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Codeset conversion
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//========================================
12585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
12685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#if !U_CHARSET_IS_UTF8
12785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
12885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hoint32_t
12985bf2e2fbc60a9f938064abc8127d61da7d19882Claire HoUnicodeString::extract(int32_t start,
13085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                       int32_t length,
13185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                       char *target,
13285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                       uint32_t dstSize) const {
13385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    return extract(start, length, target, dstSize, 0);
13485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho}
13585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
13685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho// else see unistr.cpp
13785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#endif
13885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::extract(int32_t start,
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                       int32_t length,
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                       char *target,
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                       uint32_t dstSize,
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                       const char *codepage) const
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if the arguments are illegal, then do nothing
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // pin the indices to legal values
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    pinIndices(start, length);
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
15485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // We need to cast dstSize to int32_t for all subsequent code.
15585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // I don't know why the API was defined with uint32_t but we are stuck with it.
15685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
15785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // as a limit in some functions, it may wrap around and yield a pointer
15885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // that compares less-than target.
15985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t capacity;
16085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if(dstSize < 0x7fffffff) {
16185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        // Assume that the capacity is real and a limit pointer won't wrap around.
16285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        capacity = (int32_t)dstSize;
16385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    } else {
16427f654740f2a26ad62a5c155af9199af9e69b889claireho        // Pin the capacity so that a limit pointer does not wrap around.
16527f654740f2a26ad62a5c155af9199af9e69b889claireho        char *targetLimit = (char *)U_MAX_PTR(target);
16627f654740f2a26ad62a5c155af9199af9e69b889claireho        // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
16727f654740f2a26ad62a5c155af9199af9e69b889claireho        // greater than target and does not wrap around the top of the address space.
16827f654740f2a26ad62a5c155af9199af9e69b889claireho        capacity = (int32_t)(targetLimit - target);
16985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
17085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // create the converter
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UConverter *converter;
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // just write the NUL if the string length is 0
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(length == 0) {
17785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        return u_terminateChars(target, capacity, 0, &status);
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if the codepage is the default, use our cache
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if it is an empty string, then use the "invariant character" conversion
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (codepage == 0) {
18385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        const char *defaultName = ucnv_getDefaultName();
18485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        if(UCNV_FAST_IS_UTF8(defaultName)) {
18585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            return toUTF8(start, length, target, capacity);
18685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        converter = u_getDefaultConverter(&status);
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if (*codepage == 0) {
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // use the "invariant characters" conversion
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        int32_t destLength;
19185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        if(length <= capacity) {
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            destLength = length;
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
19485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            destLength = capacity;
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        u_UCharsToChars(getArrayStart() + start, target, destLength);
19785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        return u_terminateChars(target, capacity, length, &status);
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        converter = ucnv_open(codepage, &status);
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
20285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    length = doExtract(start, length, target, capacity, converter, status);
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // close the converter
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (codepage == 0) {
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        u_releaseDefaultConverter(converter);
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ucnv_close(converter);
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return length;
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::extract(char *dest, int32_t destCapacity,
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                       UConverter *cnv,
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                       UErrorCode &errorCode) const
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(errorCode)) {
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // nothing to do?
22985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if(isEmpty()) {
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return u_terminateChars(dest, destCapacity, 0, &errorCode);
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // get the converter
234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool isDefaultConverter;
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(cnv==0) {
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        isDefaultConverter=TRUE;
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        cnv=u_getDefaultConverter(&errorCode);
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(U_FAILURE(errorCode)) {
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            return 0;
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        isDefaultConverter=FALSE;
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ucnv_resetFromUnicode(cnv);
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // convert
24785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // release the converter
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(isDefaultConverter) {
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        u_releaseDefaultConverter(cnv);
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
25485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    return len;
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::doExtract(int32_t start, int32_t length,
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                         char *dest, int32_t destCapacity,
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                         UConverter *cnv,
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                         UErrorCode &errorCode) const
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(errorCode)) {
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(destCapacity!=0) {
265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            *dest=0;
266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return 0;
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
27085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    const UChar *src=getArrayStart()+start, *srcLimit=src+length;
271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    char *originalDest=dest;
272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *destLimit;
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(destCapacity==0) {
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        destLimit=dest=0;
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else if(destCapacity==-1) {
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        destLimit=(char*)U_MAX_PTR(dest);
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // for NUL-termination, translate into highest int32_t
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        destCapacity=0x7fffffff;
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        destLimit=dest+destCapacity;
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // perform the conversion
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    length=(int32_t)(dest-originalDest);
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if an overflow occurs, then get the preflighting length
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        char buffer[1024];
292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        destLimit=buffer+sizeof(buffer);
294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        do {
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            dest=buffer;
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            errorCode=U_ZERO_ERROR;
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            length+=(int32_t)(dest-buffer);
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::doCodepageCreate(const char *codepageData,
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                int32_t dataLength,
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                const char *codepage)
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if there's nothing to convert, do nothing
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(dataLength == -1) {
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        dataLength = (int32_t)uprv_strlen(codepageData);
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // create the converter
321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if the codepage is the default, use our cache
322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // if it is an empty string, then use the "invariant character" conversion
32385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UConverter *converter;
32485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if (codepage == 0) {
32585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        const char *defaultName = ucnv_getDefaultName();
32685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        if(UCNV_FAST_IS_UTF8(defaultName)) {
32785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            setToUTF8(StringPiece(codepageData, dataLength));
32885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            return;
32985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        }
33085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        converter = u_getDefaultConverter(&status);
33185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    } else if(*codepage == 0) {
332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // use the "invariant characters" conversion
333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            u_charsToUChars(codepageData, getArrayStart(), dataLength);
33585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            setLength(dataLength);
336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            setToBogus();
338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
34085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    } else {
34185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        converter = ucnv_open(codepage, &status);
34285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
34385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
34485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // if we failed, set the appropriate flags and return
34585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if(U_FAILURE(status)) {
34685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        setToBogus();
34785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        return;
348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
35085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    // perform the conversion
351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    doCodepageCreate(codepageData, dataLength, converter, status);
352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(status)) {
353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        setToBogus();
354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // close the converter
357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(codepage == 0) {
358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        u_releaseDefaultConverter(converter);
359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    } else {
360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ucnv_close(converter);
361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid
365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUnicodeString::doCodepageCreate(const char *codepageData,
366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                int32_t dataLength,
367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                UConverter *converter,
368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                UErrorCode &status)
369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(U_FAILURE(status)) {
371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // set up the conversion parameters
375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *mySource     = codepageData;
376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    const char *mySourceEnd  = mySource + dataLength;
37785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UChar *array, *myTarget;
378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // estimate the size needed:
38085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t arraySize;
38185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if(dataLength <= US_STACKBUF_SIZE) {
38285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        // try to use the stack buffer
38385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        arraySize = US_STACKBUF_SIZE;
38485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    } else {
38585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        // 1.25 UChar's per source byte should cover most cases
38685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        arraySize = dataLength + (dataLength >> 2);
38785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // we do not care about the current contents
390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool doCopyArray = FALSE;
391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for(;;) {
392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            setToBogus();
394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // perform the conversion
39885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        array = getArrayStart();
39985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        myTarget = array + length();
40085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            &mySource, mySourceEnd, 0, TRUE, &status);
402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // update the conversion parameters
40485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        setLength((int32_t)(myTarget - array));
405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        // allocate more space and copy data, if needed
407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if(status == U_BUFFER_OVERFLOW_ERROR) {
408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // reset the error code
409ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            status = U_ZERO_ERROR;
410ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
411ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // keep the previous conversion results
412ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            doCopyArray = TRUE;
413ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
414ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // estimate the new size needed, larger than before
415ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            // try 2 UChar's per remaining source byte
41685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho            arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
417ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        } else {
418ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
419ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
420ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
421ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
422ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
423ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
424ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
425ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
426