1// Locale support (codecvt) -*- C++ -*-
2
3// Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
4// 2008, 2009, 2010
5// Free Software Foundation, Inc.
6//
7// This file is part of the GNU ISO C++ Library.  This library is free
8// software; you can redistribute it and/or modify it under the
9// terms of the GNU General Public License as published by the
10// Free Software Foundation; either version 3, or (at your option)
11// any later version.
12
13// This library is distributed in the hope that it will be useful,
14// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16// GNU General Public License for more details.
17
18// Under Section 7 of GPL version 3, you are granted additional
19// permissions described in the GCC Runtime Library Exception, version
20// 3.1, as published by the Free Software Foundation.
21
22// You should have received a copy of the GNU General Public License and
23// a copy of the GCC Runtime Library Exception along with this program;
24// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
25// <http://www.gnu.org/licenses/>.
26
27//
28// ISO C++ 14882: 22.2.1.5 Template class codecvt
29//
30
31// Written by Benjamin Kosnik <bkoz@redhat.com>
32
33/** @file ext/codecvt_specializations.h
34 *  This file is a GNU extension to the Standard C++ Library.
35 */
36
37#ifndef _EXT_CODECVT_SPECIALIZATIONS_H
38#define _EXT_CODECVT_SPECIALIZATIONS_H 1
39
40#include <bits/c++config.h>
41#include <locale>
42#include <iconv.h>
43
44namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
45{
46_GLIBCXX_BEGIN_NAMESPACE_VERSION
47
48  /// Extension to use iconv for dealing with character encodings.
49  // This includes conversions and comparisons between various character
50  // sets.  This object encapsulates data that may need to be shared between
51  // char_traits, codecvt and ctype.
52  class encoding_state
53  {
54  public:
55    // Types:
56    // NB: A conversion descriptor subsumes and enhances the
57    // functionality of a simple state type such as mbstate_t.
58    typedef iconv_t	descriptor_type;
59
60  protected:
61    // Name of internal character set encoding.
62    std::string	       	_M_int_enc;
63
64    // Name of external character set encoding.
65    std::string  	_M_ext_enc;
66
67    // Conversion descriptor between external encoding to internal encoding.
68    descriptor_type	_M_in_desc;
69
70    // Conversion descriptor between internal encoding to external encoding.
71    descriptor_type	_M_out_desc;
72
73    // The byte-order marker for the external encoding, if necessary.
74    int			_M_ext_bom;
75
76    // The byte-order marker for the internal encoding, if necessary.
77    int			_M_int_bom;
78
79    // Number of external bytes needed to construct one complete
80    // character in the internal encoding.
81    // NB: -1 indicates variable, or stateful, encodings.
82    int 		_M_bytes;
83
84  public:
85    explicit
86    encoding_state()
87    : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
88    { }
89
90    explicit
91    encoding_state(const char* __int, const char* __ext,
92		   int __ibom = 0, int __ebom = 0, int __bytes = 1)
93    : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
94      _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
95    { init(); }
96
97    // 21.1.2 traits typedefs
98    // p4
99    // typedef STATE_T state_type
100    // requires: state_type shall meet the requirements of
101    // CopyConstructible types (20.1.3)
102    // NB: This does not preserve the actual state of the conversion
103    // descriptor member, but it does duplicate the encoding
104    // information.
105    encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
106    { construct(__obj); }
107
108    // Need assignment operator as well.
109    encoding_state&
110    operator=(const encoding_state& __obj)
111    {
112      construct(__obj);
113      return *this;
114    }
115
116    ~encoding_state()
117    { destroy(); }
118
119    bool
120    good() const throw()
121    {
122      const descriptor_type __err = (iconv_t)(-1);
123      bool __test = _M_in_desc && _M_in_desc != __err;
124      __test &=  _M_out_desc && _M_out_desc != __err;
125      return __test;
126    }
127
128    int
129    character_ratio() const
130    { return _M_bytes; }
131
132    const std::string
133    internal_encoding() const
134    { return _M_int_enc; }
135
136    int
137    internal_bom() const
138    { return _M_int_bom; }
139
140    const std::string
141    external_encoding() const
142    { return _M_ext_enc; }
143
144    int
145    external_bom() const
146    { return _M_ext_bom; }
147
148    const descriptor_type&
149    in_descriptor() const
150    { return _M_in_desc; }
151
152    const descriptor_type&
153    out_descriptor() const
154    { return _M_out_desc; }
155
156  protected:
157    void
158    init()
159    {
160      const descriptor_type __err = (iconv_t)(-1);
161      const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
162      if (!_M_in_desc && __have_encodings)
163	{
164	  _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
165	  if (_M_in_desc == __err)
166	    std::__throw_runtime_error(__N("encoding_state::_M_init "
167				    "creating iconv input descriptor failed"));
168	}
169      if (!_M_out_desc && __have_encodings)
170	{
171	  _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
172	  if (_M_out_desc == __err)
173	    std::__throw_runtime_error(__N("encoding_state::_M_init "
174				  "creating iconv output descriptor failed"));
175	}
176    }
177
178    void
179    construct(const encoding_state& __obj)
180    {
181      destroy();
182      _M_int_enc = __obj._M_int_enc;
183      _M_ext_enc = __obj._M_ext_enc;
184      _M_ext_bom = __obj._M_ext_bom;
185      _M_int_bom = __obj._M_int_bom;
186      _M_bytes = __obj._M_bytes;
187      init();
188    }
189
190    void
191    destroy() throw()
192    {
193      const descriptor_type __err = (iconv_t)(-1);
194      if (_M_in_desc && _M_in_desc != __err)
195	{
196	  iconv_close(_M_in_desc);
197	  _M_in_desc = 0;
198	}
199      if (_M_out_desc && _M_out_desc != __err)
200	{
201	  iconv_close(_M_out_desc);
202	  _M_out_desc = 0;
203	}
204    }
205  };
206
207  /// encoding_char_traits
208  // Custom traits type with encoding_state for the state type, and the
209  // associated fpos<encoding_state> for the position type, all other
210  // bits equivalent to the required char_traits instantiations.
211  template<typename _CharT>
212    struct encoding_char_traits : public std::char_traits<_CharT>
213    {
214      typedef encoding_state				state_type;
215      typedef typename std::fpos<state_type>		pos_type;
216    };
217
218_GLIBCXX_END_NAMESPACE_VERSION
219} // namespace
220
221
222namespace std _GLIBCXX_VISIBILITY(default)
223{
224_GLIBCXX_BEGIN_NAMESPACE_VERSION
225
226  using __gnu_cxx::encoding_state;
227
228  /// codecvt<InternT, _ExternT, encoding_state> specialization.
229  // This partial specialization takes advantage of iconv to provide
230  // code conversions between a large number of character encodings.
231  template<typename _InternT, typename _ExternT>
232    class codecvt<_InternT, _ExternT, encoding_state>
233    : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
234    {
235    public:
236      // Types:
237      typedef codecvt_base::result			result;
238      typedef _InternT 					intern_type;
239      typedef _ExternT 					extern_type;
240      typedef __gnu_cxx::encoding_state 		state_type;
241      typedef state_type::descriptor_type 		descriptor_type;
242
243      // Data Members:
244      static locale::id 		id;
245
246      explicit
247      codecvt(size_t __refs = 0)
248      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
249      { }
250
251      explicit
252      codecvt(state_type& __enc, size_t __refs = 0)
253      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
254      { }
255
256     protected:
257      virtual
258      ~codecvt() { }
259
260      virtual result
261      do_out(state_type& __state, const intern_type* __from,
262	     const intern_type* __from_end, const intern_type*& __from_next,
263	     extern_type* __to, extern_type* __to_end,
264	     extern_type*& __to_next) const;
265
266      virtual result
267      do_unshift(state_type& __state, extern_type* __to,
268		 extern_type* __to_end, extern_type*& __to_next) const;
269
270      virtual result
271      do_in(state_type& __state, const extern_type* __from,
272	    const extern_type* __from_end, const extern_type*& __from_next,
273	    intern_type* __to, intern_type* __to_end,
274	    intern_type*& __to_next) const;
275
276      virtual int
277      do_encoding() const throw();
278
279      virtual bool
280      do_always_noconv() const throw();
281
282      virtual int
283      do_length(state_type&, const extern_type* __from,
284		const extern_type* __end, size_t __max) const;
285
286      virtual int
287      do_max_length() const throw();
288    };
289
290  template<typename _InternT, typename _ExternT>
291    locale::id
292    codecvt<_InternT, _ExternT, encoding_state>::id;
293
294  // This adaptor works around the signature problems of the second
295  // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
296  // uses 'char**', which matches the POSIX 1003.1-2001 standard.
297  // Using this adaptor, g++ will do the work for us.
298  template<typename _Tp>
299    inline size_t
300    __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
301                    iconv_t __cd, char** __inbuf, size_t* __inbytes,
302                    char** __outbuf, size_t* __outbytes)
303    { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
304
305  template<typename _InternT, typename _ExternT>
306    codecvt_base::result
307    codecvt<_InternT, _ExternT, encoding_state>::
308    do_out(state_type& __state, const intern_type* __from,
309	   const intern_type* __from_end, const intern_type*& __from_next,
310	   extern_type* __to, extern_type* __to_end,
311	   extern_type*& __to_next) const
312    {
313      result __ret = codecvt_base::error;
314      if (__state.good())
315	{
316	  const descriptor_type& __desc = __state.out_descriptor();
317	  const size_t __fmultiple = sizeof(intern_type);
318	  size_t __fbytes = __fmultiple * (__from_end - __from);
319	  const size_t __tmultiple = sizeof(extern_type);
320	  size_t __tbytes = __tmultiple * (__to_end - __to);
321
322	  // Argument list for iconv specifies a byte sequence. Thus,
323	  // all to/from arrays must be brutally casted to char*.
324	  char* __cto = reinterpret_cast<char*>(__to);
325	  char* __cfrom;
326	  size_t __conv;
327
328	  // Some encodings need a byte order marker as the first item
329	  // in the byte stream, to designate endian-ness. The default
330	  // value for the byte order marker is NULL, so if this is
331	  // the case, it's not necessary and we can just go on our
332	  // merry way.
333	  int __int_bom = __state.internal_bom();
334	  if (__int_bom)
335	    {
336	      size_t __size = __from_end - __from;
337	      intern_type* __cfixed = static_cast<intern_type*>
338		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
339	      __cfixed[0] = static_cast<intern_type>(__int_bom);
340	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
341	      __cfrom = reinterpret_cast<char*>(__cfixed);
342	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
343                                        &__fbytes, &__cto, &__tbytes);
344	    }
345	  else
346	    {
347	      intern_type* __cfixed = const_cast<intern_type*>(__from);
348	      __cfrom = reinterpret_cast<char*>(__cfixed);
349	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
350				       &__cto, &__tbytes);
351	    }
352
353	  if (__conv != size_t(-1))
354	    {
355	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
356	      __to_next = reinterpret_cast<extern_type*>(__cto);
357	      __ret = codecvt_base::ok;
358	    }
359	  else
360	    {
361	      if (__fbytes < __fmultiple * (__from_end - __from))
362		{
363		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
364		  __to_next = reinterpret_cast<extern_type*>(__cto);
365		  __ret = codecvt_base::partial;
366		}
367	      else
368		__ret = codecvt_base::error;
369	    }
370	}
371      return __ret;
372    }
373
374  template<typename _InternT, typename _ExternT>
375    codecvt_base::result
376    codecvt<_InternT, _ExternT, encoding_state>::
377    do_unshift(state_type& __state, extern_type* __to,
378	       extern_type* __to_end, extern_type*& __to_next) const
379    {
380      result __ret = codecvt_base::error;
381      if (__state.good())
382	{
383	  const descriptor_type& __desc = __state.in_descriptor();
384	  const size_t __tmultiple = sizeof(intern_type);
385	  size_t __tlen = __tmultiple * (__to_end - __to);
386
387	  // Argument list for iconv specifies a byte sequence. Thus,
388	  // all to/from arrays must be brutally casted to char*.
389	  char* __cto = reinterpret_cast<char*>(__to);
390	  size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
391                                          &__cto, &__tlen);
392
393	  if (__conv != size_t(-1))
394	    {
395	      __to_next = reinterpret_cast<extern_type*>(__cto);
396	      if (__tlen == __tmultiple * (__to_end - __to))
397		__ret = codecvt_base::noconv;
398	      else if (__tlen == 0)
399		__ret = codecvt_base::ok;
400	      else
401		__ret = codecvt_base::partial;
402	    }
403	  else
404	    __ret = codecvt_base::error;
405	}
406      return __ret;
407    }
408
409  template<typename _InternT, typename _ExternT>
410    codecvt_base::result
411    codecvt<_InternT, _ExternT, encoding_state>::
412    do_in(state_type& __state, const extern_type* __from,
413	  const extern_type* __from_end, const extern_type*& __from_next,
414	  intern_type* __to, intern_type* __to_end,
415	  intern_type*& __to_next) const
416    {
417      result __ret = codecvt_base::error;
418      if (__state.good())
419	{
420	  const descriptor_type& __desc = __state.in_descriptor();
421	  const size_t __fmultiple = sizeof(extern_type);
422	  size_t __flen = __fmultiple * (__from_end - __from);
423	  const size_t __tmultiple = sizeof(intern_type);
424	  size_t __tlen = __tmultiple * (__to_end - __to);
425
426	  // Argument list for iconv specifies a byte sequence. Thus,
427	  // all to/from arrays must be brutally casted to char*.
428	  char* __cto = reinterpret_cast<char*>(__to);
429	  char* __cfrom;
430	  size_t __conv;
431
432	  // Some encodings need a byte order marker as the first item
433	  // in the byte stream, to designate endian-ness. The default
434	  // value for the byte order marker is NULL, so if this is
435	  // the case, it's not necessary and we can just go on our
436	  // merry way.
437	  int __ext_bom = __state.external_bom();
438	  if (__ext_bom)
439	    {
440	      size_t __size = __from_end - __from;
441	      extern_type* __cfixed =  static_cast<extern_type*>
442		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
443	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
444	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
445	      __cfrom = reinterpret_cast<char*>(__cfixed);
446	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
447                                       &__flen, &__cto, &__tlen);
448	    }
449	  else
450	    {
451	      extern_type* __cfixed = const_cast<extern_type*>(__from);
452	      __cfrom = reinterpret_cast<char*>(__cfixed);
453	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
454                                       &__flen, &__cto, &__tlen);
455	    }
456
457
458	  if (__conv != size_t(-1))
459	    {
460	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
461	      __to_next = reinterpret_cast<intern_type*>(__cto);
462	      __ret = codecvt_base::ok;
463	    }
464	  else
465	    {
466	      if (__flen < static_cast<size_t>(__from_end - __from))
467		{
468		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
469		  __to_next = reinterpret_cast<intern_type*>(__cto);
470		  __ret = codecvt_base::partial;
471		}
472	      else
473		__ret = codecvt_base::error;
474	    }
475	}
476      return __ret;
477    }
478
479  template<typename _InternT, typename _ExternT>
480    int
481    codecvt<_InternT, _ExternT, encoding_state>::
482    do_encoding() const throw()
483    {
484      int __ret = 0;
485      if (sizeof(_ExternT) <= sizeof(_InternT))
486	__ret = sizeof(_InternT) / sizeof(_ExternT);
487      return __ret;
488    }
489
490  template<typename _InternT, typename _ExternT>
491    bool
492    codecvt<_InternT, _ExternT, encoding_state>::
493    do_always_noconv() const throw()
494    { return false; }
495
496  template<typename _InternT, typename _ExternT>
497    int
498    codecvt<_InternT, _ExternT, encoding_state>::
499    do_length(state_type&, const extern_type* __from,
500	      const extern_type* __end, size_t __max) const
501    { return std::min(__max, static_cast<size_t>(__end - __from)); }
502
503  // _GLIBCXX_RESOLVE_LIB_DEFECTS
504  // 74.  Garbled text for codecvt::do_max_length
505  template<typename _InternT, typename _ExternT>
506    int
507    codecvt<_InternT, _ExternT, encoding_state>::
508    do_max_length() const throw()
509    { return 1; }
510
511_GLIBCXX_END_NAMESPACE_VERSION
512} // namespace
513
514#endif
515