1/*
2 * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
5 * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "config.h"
30#include "TextCodecGtk.h"
31
32#include <gio/gio.h>
33#include "GOwnPtr.h"
34#include "Logging.h"
35#include "PlatformString.h"
36#include <wtf/Assertions.h>
37#include <wtf/HashMap.h>
38#include <wtf/text/CString.h>
39
40using std::min;
41
42namespace WebCore {
43
44// TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380).
45// That's why we need to avoid generating extra BOM's for the conversion result.
46// This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib.
47
48#if (G_BYTE_ORDER == G_BIG_ENDIAN)
49static const gchar* internalEncodingName = "UTF-16BE";
50#else
51static const gchar* internalEncodingName = "UTF-16LE";
52#endif
53
54
55const size_t ConversionBufferSize = 16384;
56
57
58static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*)
59{
60    return new TextCodecGtk(encoding);
61}
62
63static bool isEncodingAvailable(const gchar* encodingName)
64{
65    GIConv tester;
66    // test decoding
67    tester = g_iconv_open(internalEncodingName, encodingName);
68    if (tester == reinterpret_cast<GIConv>(-1)) {
69        return false;
70    } else {
71        g_iconv_close(tester);
72        // test encoding
73        tester = g_iconv_open(encodingName, internalEncodingName);
74        if (tester == reinterpret_cast<GIConv>(-1)) {
75            return false;
76        } else {
77            g_iconv_close(tester);
78            return true;
79        }
80    }
81}
82
83static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName)
84{
85    if (isEncodingAvailable(canonicalName)) {
86        registrar(canonicalName, canonicalName);
87        return true;
88    }
89
90    return false;
91}
92
93static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName)
94{
95    if (isEncodingAvailable(aliasName))
96        registrar(aliasName, canonicalName);
97}
98
99static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName)
100{
101    if (isEncodingAvailable(codecName))
102        registrar(codecName, newTextCodecGtk, 0);
103}
104
105void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar)
106{
107    // Unicode
108    registerEncodingNameIfAvailable(registrar, "UTF-8");
109    registerEncodingNameIfAvailable(registrar, "UTF-32");
110    registerEncodingNameIfAvailable(registrar, "UTF-32BE");
111    registerEncodingNameIfAvailable(registrar, "UTF-32LE");
112
113    // Western
114    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) {
115        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819");
116        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819");
117        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100");
118        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1");
119        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1");
120        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987");
121        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1");
122        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1");
123        registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1");
124    }
125}
126
127void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar)
128{
129    // Unicode
130    registerCodecIfAvailable(registrar, "UTF-8");
131    registerCodecIfAvailable(registrar, "UTF-32");
132    registerCodecIfAvailable(registrar, "UTF-32BE");
133    registerCodecIfAvailable(registrar, "UTF-32LE");
134
135    // Western
136    registerCodecIfAvailable(registrar, "ISO-8859-1");
137}
138
139void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
140{
141    // Western
142    if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) {
143        registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC");
144        registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH");
145        registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH");
146    }
147
148    // Japanese
149    if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) {
150        registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI");
151        registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS");
152        registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS");
153        registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS");
154    }
155    if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) {
156        registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP");
157        registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP");
158        registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE");
159        registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE");
160    }
161    registerEncodingNameIfAvailable(registrar, "ISO-2022-JP");
162
163    // Traditional Chinese
164    if (registerEncodingNameIfAvailable(registrar, "BIG5")) {
165        registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5");
166        registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE");
167        registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE");
168        registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5");
169        registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5");
170    }
171    if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) {
172        registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004");
173        registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS");
174    }
175    registerEncodingNameIfAvailable(registrar, "CP950");
176
177    // Korean
178    if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR"))
179        registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR");
180    if (registerEncodingNameIfAvailable(registrar, "CP949"))
181        registerEncodingAliasIfAvailable(registrar, "CP949", "UHC");
182    if (registerEncodingNameIfAvailable(registrar, "EUC-KR"))
183        registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR");
184
185    // Arabic
186    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) {
187        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC");
188        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708");
189        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114");
190        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127");
191        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6");
192        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6");
193        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987");
194        registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC");
195    }
196    // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
197    if (registerEncodingNameIfAvailable(registrar, "windows-1256")) {
198        registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256");
199        registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB");
200    }
201
202    // Hebrew
203    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) {
204        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW");
205        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8");
206        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138");
207        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8");
208        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8");
209        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988");
210        registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW");
211    }
212    // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
213    if (registerEncodingNameIfAvailable(registrar, "windows-1255")) {
214        registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255");
215        registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR");
216    }
217
218    // Greek
219    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) {
220        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118");
221        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928");
222        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK");
223        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8");
224        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126");
225        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7");
226        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7");
227        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987");
228        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003");
229        registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI");
230    }
231    if (registerEncodingNameIfAvailable(registrar, "CP869")) {
232        registerEncodingAliasIfAvailable(registrar, "CP869", "869");
233        registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR");
234        registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869");
235        registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869");
236    }
237    registerEncodingNameIfAvailable(registrar, "WINDOWS-1253");
238
239    // Cyrillic
240    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) {
241        registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC");
242        registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144");
243        registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5");
244        registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5");
245        registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988");
246        registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC");
247    }
248    if (registerEncodingNameIfAvailable(registrar, "KOI8-R"))
249        registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R");
250    if (registerEncodingNameIfAvailable(registrar, "CP866")) {
251        registerEncodingAliasIfAvailable(registrar, "CP866", "866");
252        registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866");
253        registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866");
254    }
255    registerEncodingNameIfAvailable(registrar, "KOI8-U");
256    // CP1251 added to pass /fast/encoding/charset-cp1251.html
257    if (registerEncodingNameIfAvailable(registrar, "windows-1251"))
258        registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251");
259    if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) {
260        registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC");
261        registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic");
262    }
263
264    // Thai
265    if (registerEncodingNameIfAvailable(registrar, "CP874"))
266        registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874");
267    registerEncodingNameIfAvailable(registrar, "TIS-620");
268
269    // Simplified Chinese
270    registerEncodingNameIfAvailable(registrar, "GBK");
271    if (registerEncodingNameIfAvailable(registrar, "HZ"))
272        registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312");
273    registerEncodingNameIfAvailable(registrar, "GB18030");
274    if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) {
275        registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN");
276        registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312");
277        registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB");
278        registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312");
279        registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN");
280    }
281    if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) {
282        registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE");
283        registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280");
284        registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0");
285        registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58");
286    }
287
288    // Central European
289    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) {
290        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101");
291        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2");
292        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2");
293        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987");
294        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2");
295        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2");
296        registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2");
297    }
298    if (registerEncodingNameIfAvailable(registrar, "CP1250")) {
299        registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE");
300        registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250");
301    }
302    registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE");
303
304    // Vietnamese
305    if (registerEncodingNameIfAvailable(registrar, "CP1258"))
306        registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258");
307
308    // Turkish
309    if (registerEncodingNameIfAvailable(registrar, "CP1254")) {
310        registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK");
311        registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254");
312    }
313    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) {
314        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148");
315        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9");
316        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9");
317        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989");
318        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5");
319        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5");
320        registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5");
321    }
322
323    // Baltic
324    if (registerEncodingNameIfAvailable(registrar, "CP1257")) {
325        registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM");
326        registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257");
327    }
328    if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) {
329        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110");
330        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4");
331        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4");
332        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988");
333        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4");
334        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4");
335        registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4");
336    }
337}
338
339void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar)
340{
341    // Western
342    registerCodecIfAvailable(registrar, "MACROMAN");
343
344    // Japanese
345    registerCodecIfAvailable(registrar, "Shift_JIS");
346    registerCodecIfAvailable(registrar, "EUC-JP");
347    registerCodecIfAvailable(registrar, "ISO-2022-JP");
348
349    // Traditional Chinese
350    registerCodecIfAvailable(registrar, "BIG5");
351    registerCodecIfAvailable(registrar, "BIG5-HKSCS");
352    registerCodecIfAvailable(registrar, "CP950");
353
354    // Korean
355    registerCodecIfAvailable(registrar, "ISO-2022-KR");
356    registerCodecIfAvailable(registrar, "CP949");
357    registerCodecIfAvailable(registrar, "EUC-KR");
358
359    // Arabic
360    registerCodecIfAvailable(registrar, "ISO-8859-6");
361    // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
362    registerCodecIfAvailable(registrar, "windows-1256");
363
364    // Hebrew
365    registerCodecIfAvailable(registrar, "ISO-8859-8");
366    // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
367    registerCodecIfAvailable(registrar, "windows-1255");
368
369    // Greek
370    registerCodecIfAvailable(registrar, "ISO-8859-7");
371    registerCodecIfAvailable(registrar, "CP869");
372    registerCodecIfAvailable(registrar, "WINDOWS-1253");
373
374    // Cyrillic
375    registerCodecIfAvailable(registrar, "ISO-8859-5");
376    registerCodecIfAvailable(registrar, "KOI8-R");
377    registerCodecIfAvailable(registrar, "CP866");
378    registerCodecIfAvailable(registrar, "KOI8-U");
379    // CP1251 added to pass /fast/encoding/charset-cp1251.html
380    registerCodecIfAvailable(registrar, "windows-1251");
381    registerCodecIfAvailable(registrar, "mac-cyrillic");
382
383    // Thai
384    registerCodecIfAvailable(registrar, "CP874");
385    registerCodecIfAvailable(registrar, "TIS-620");
386
387    // Simplified Chinese
388    registerCodecIfAvailable(registrar, "GBK");
389    registerCodecIfAvailable(registrar, "HZ");
390    registerCodecIfAvailable(registrar, "GB18030");
391    registerCodecIfAvailable(registrar, "EUC-CN");
392    registerCodecIfAvailable(registrar, "GB_2312-80");
393
394    // Central European
395    registerCodecIfAvailable(registrar, "ISO-8859-2");
396    registerCodecIfAvailable(registrar, "CP1250");
397    registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE");
398
399    // Vietnamese
400    registerCodecIfAvailable(registrar, "CP1258");
401
402    // Turkish
403    registerCodecIfAvailable(registrar, "CP1254");
404    registerCodecIfAvailable(registrar, "ISO-8859-9");
405
406    // Baltic
407    registerCodecIfAvailable(registrar, "CP1257");
408    registerCodecIfAvailable(registrar, "ISO-8859-4");
409}
410
411TextCodecGtk::TextCodecGtk(const TextEncoding& encoding)
412    : m_encoding(encoding)
413    , m_numBufferedBytes(0)
414{
415}
416
417TextCodecGtk::~TextCodecGtk()
418{
419}
420
421void TextCodecGtk::createIConvDecoder() const
422{
423    ASSERT(!m_iconvDecoder);
424
425    m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0));
426}
427
428void TextCodecGtk::createIConvEncoder() const
429{
430    ASSERT(!m_iconvEncoder);
431
432    m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0));
433}
434
435String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
436{
437    // Get a converter for the passed-in encoding.
438    if (!m_iconvDecoder)
439        createIConvDecoder();
440    if (!m_iconvDecoder) {
441        LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
442        return String();
443    }
444
445    Vector<UChar> result;
446
447    gsize bytesRead = 0;
448    gsize bytesWritten = 0;
449    const gchar* input = bytes;
450    gsize inputLength = length;
451    gchar buffer[ConversionBufferSize];
452    int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS;
453    if (flush)
454        flags |= G_CONVERTER_FLUSH;
455
456    bool bufferWasFull = false;
457    char* prefixedBytes = 0;
458
459    if (m_numBufferedBytes) {
460        inputLength = length + m_numBufferedBytes;
461        prefixedBytes = static_cast<char*>(fastMalloc(inputLength));
462        memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes);
463        memcpy(prefixedBytes + m_numBufferedBytes, bytes, length);
464
465        input = prefixedBytes;
466
467        // all buffered bytes are consumed now
468        m_numBufferedBytes = 0;
469    }
470
471    do {
472        GOwnPtr<GError> error;
473        GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()),
474                                                   input, inputLength,
475                                                   buffer, sizeof(buffer),
476                                                   static_cast<GConverterFlags>(flags),
477                                                   &bytesRead, &bytesWritten,
478                                                   &error.outPtr());
479        input += bytesRead;
480        inputLength -= bytesRead;
481
482        if (res == G_CONVERTER_ERROR) {
483            if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
484                // There is not enough input to fully determine what the conversion should produce,
485                // save it to a buffer to prepend it to the next input.
486                memcpy(m_bufferedBytes, input, inputLength);
487                m_numBufferedBytes = inputLength;
488                inputLength = 0;
489            } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE))
490                bufferWasFull = true;
491            else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
492                if (stopOnError)
493                    sawError = true;
494                if (inputLength) {
495                    // Ignore invalid character.
496                    input += 1;
497                    inputLength -= 1;
498                }
499            } else {
500                sawError = true;
501                LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
502                m_numBufferedBytes = 0; // Reset state for subsequent calls to decode.
503                fastFree(prefixedBytes);
504                return String();
505            }
506        }
507
508        result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar));
509    } while ((inputLength || bufferWasFull) && !sawError);
510
511    fastFree(prefixedBytes);
512
513    return String::adopt(result);
514}
515
516CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling)
517{
518    if (!length)
519        return "";
520
521    if (!m_iconvEncoder)
522        createIConvEncoder();
523    if (!m_iconvEncoder) {
524        LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
525        return CString();
526    }
527
528    gsize bytesRead = 0;
529    gsize bytesWritten = 0;
530    const gchar* input = reinterpret_cast<const char*>(characters);
531    gsize inputLength = length * sizeof(UChar);
532    gchar buffer[ConversionBufferSize];
533    Vector<char> result;
534    GOwnPtr<GError> error;
535
536    size_t size = 0;
537    do {
538        g_converter_convert(G_CONVERTER(m_iconvEncoder.get()),
539                            input, inputLength,
540                            buffer, sizeof(buffer),
541                            G_CONVERTER_INPUT_AT_END,
542                            &bytesRead, &bytesWritten,
543                            &error.outPtr());
544        input += bytesRead;
545        inputLength -= bytesRead;
546        if (bytesWritten > 0) {
547            result.grow(size + bytesWritten);
548            memcpy(result.data() + size, buffer, bytesWritten);
549            size += bytesWritten;
550        }
551
552        if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
553            UChar codePoint = reinterpret_cast<const UChar*>(input)[0];
554            UnencodableReplacementArray replacement;
555            int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement);
556
557            // Consume the invalid character.
558            input += sizeof(UChar);
559            inputLength -= sizeof(UChar);
560
561            // Append replacement string to result buffer.
562            result.grow(size + replacementLength);
563            memcpy(result.data() + size, replacement, replacementLength);
564            size += replacementLength;
565
566            error.clear();
567        }
568    } while (inputLength && !error.get());
569
570    if (error) {
571        LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
572        return CString();
573    }
574
575    return CString(result.data(), size);
576}
577
578} // namespace WebCore
579