1// Locale support (codecvt) -*- C++ -*-
2
3// Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
4//  Free Software Foundation, Inc.
5//
6// This file is part of the GNU ISO C++ Library.  This library is free
7// software; you can redistribute it and/or modify it under the
8// terms of the GNU General Public License as published by the
9// Free Software Foundation; either version 3, or (at your option)
10// any later version.
11
12// This library is distributed in the hope that it will be useful,
13// but WITHOUT ANY WARRANTY; without even the implied warranty of
14// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15// GNU General Public License for more details.
16
17// Under Section 7 of GPL version 3, you are granted additional
18// permissions described in the GCC Runtime Library Exception, version
19// 3.1, as published by the Free Software Foundation.
20
21// You should have received a copy of the GNU General Public License and
22// a copy of the GCC Runtime Library Exception along with this program;
23// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24// <http://www.gnu.org/licenses/>.
25
26//
27// ISO C++ 14882: 22.2.1.5 Template class codecvt
28//
29
30// Written by Benjamin Kosnik <bkoz@redhat.com>
31
32/** @file ext/codecvt_specializations.h
33 *  This file is a GNU extension to the Standard C++ Library.
34 */
35
36#ifndef _EXT_CODECVT_SPECIALIZATIONS_H
37#define _EXT_CODECVT_SPECIALIZATIONS_H 1
38
39#include <bits/c++config.h>
40#include <locale>
41#include <iconv.h>
42
43_GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx)
44
/// Extension to use iconv for dealing with character encodings.
// This includes conversions and comparisons between various character
// sets.  This object encapsulates data that may need to be shared between
// char_traits, codecvt and ctype.
//
// An encoding_state records the names of an internal and an external
// character set and owns two iconv conversion descriptors, one for each
// direction.  Invariant maintained by init()/destroy(): each descriptor
// member is either 0 (not open) or a descriptor returned by a successful
// iconv_open() call; the iconv error value (iconv_t)(-1) is never stored.
class encoding_state
{
public:
  // Types:
  // NB: A conversion descriptor subsumes and enhances the
  // functionality of a simple state type such as mbstate_t.
  typedef iconv_t	descriptor_type;

protected:
  // Name of internal character set encoding.
  std::string		_M_int_enc;

  // Name of external character set encoding.
  std::string		_M_ext_enc;

  // Conversion descriptor from external encoding to internal encoding.
  descriptor_type	_M_in_desc;

  // Conversion descriptor from internal encoding to external encoding.
  descriptor_type	_M_out_desc;

  // The byte-order marker for the external encoding, if necessary.
  int			_M_ext_bom;

  // The byte-order marker for the internal encoding, if necessary.
  int			_M_int_bom;

  // Number of external bytes needed to construct one complete
  // character in the internal encoding.
  // NB: -1 indicates variable, or stateful, encodings.
  int			_M_bytes;

public:
  // Creates a state with no encodings and no open descriptors;
  // good() is false for such an object.
  explicit
  encoding_state()
  : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
  { }

  // Creates a state converting between the internal encoding __int and
  // the external encoding __ext, and opens both descriptors.
  // __ibom / __ebom are the internal / external byte-order markers
  // (0 means none); __bytes is stored as the character ratio.
  // A failure to open either descriptor is reported through
  // std::__throw_runtime_error.
  explicit
  encoding_state(const char* __int, const char* __ext,
                 int __ibom = 0, int __ebom = 0, int __bytes = 1)
  : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
    _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
  { init(); }

  // 21.1.2 traits typedefs
  // p4
  // typedef STATE_T state_type
  // requires: state_type shall meet the requirements of
  // CopyConstructible types (20.1.3)
  // NB: This does not preserve the actual state of the conversion
  // descriptor member, but it does duplicate the encoding
  // information.
  encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
  { construct(__obj); }

  // Need assignment operator as well.
  // Like the copy constructor, this duplicates the encoding information
  // and opens fresh descriptors.  Self-assignment is a no-op so that it
  // does not needlessly close and reopen the descriptors (which would
  // discard their conversion state).
  encoding_state&
  operator=(const encoding_state& __obj)
  {
    if (this != &__obj)
      construct(__obj);
    return *this;
  }

  ~encoding_state()
  { destroy(); }

  // True when both conversion descriptors are open and usable.
  bool
  good() const throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    bool __test = _M_in_desc && _M_in_desc != __err;
    __test &=  _M_out_desc && _M_out_desc != __err;
    return __test;
  }

  // Accessors for the stored configuration.
  int
  character_ratio() const
  { return _M_bytes; }

  const std::string
  internal_encoding() const
  { return _M_int_enc; }

  int
  internal_bom() const
  { return _M_int_bom; }

  const std::string
  external_encoding() const
  { return _M_ext_enc; }

  int
  external_bom() const
  { return _M_ext_bom; }

  // Descriptor converting external -> internal.
  const descriptor_type&
  in_descriptor() const
  { return _M_in_desc; }

  // Descriptor converting internal -> external.
  const descriptor_type&
  out_descriptor() const
  { return _M_out_desc; }

protected:
  // Opens whichever descriptor is not yet open, provided both encoding
  // names are non-empty.  On failure every descriptor held by this
  // object is released and reset to 0 before the error is reported:
  // when init() runs from a constructor the destructor will not be
  // called, so an already opened descriptor would otherwise leak, and
  // a stored error value would prevent any later reopen.
  void
  init()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    if (!_M_in_desc && __have_encodings)
      {
        _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
        if (_M_in_desc == __err)
          {
            _M_in_desc = 0;
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                    "creating iconv input descriptor failed"));
          }
      }
    if (!_M_out_desc && __have_encodings)
      {
        _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
        if (_M_out_desc == __err)
          {
            _M_out_desc = 0;
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                  "creating iconv output descriptor failed"));
          }
      }
  }

  // Replaces this object's configuration with a copy of __obj's and
  // opens new descriptors for it.  Any descriptors currently held are
  // closed first.
  void
  construct(const encoding_state& __obj)
  {
    destroy();
    _M_int_enc = __obj._M_int_enc;
    _M_ext_enc = __obj._M_ext_enc;
    _M_ext_bom = __obj._M_ext_bom;
    _M_int_bom = __obj._M_int_bom;
    _M_bytes = __obj._M_bytes;
    init();
  }

  // Closes any open descriptor and unconditionally resets both members
  // to 0, so that a subsequent init() always attempts to reopen them.
  void
  destroy() throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    if (_M_in_desc && _M_in_desc != __err)
      iconv_close(_M_in_desc);
    _M_in_desc = 0;
    if (_M_out_desc && _M_out_desc != __err)
      iconv_close(_M_out_desc);
    _M_out_desc = 0;
  }
};
203
204  /// encoding_char_traits
205  // Custom traits type with encoding_state for the state type, and the
206  // associated fpos<encoding_state> for the position type, all other
207  // bits equivalent to the required char_traits instantiations.
208  template<typename _CharT>
209    struct encoding_char_traits : public std::char_traits<_CharT>
210    {
211      typedef encoding_state				state_type;
212      typedef typename std::fpos<state_type>		pos_type;
213    };
214
215_GLIBCXX_END_NAMESPACE
216
217
218_GLIBCXX_BEGIN_NAMESPACE(std)
219
220  using __gnu_cxx::encoding_state;
221
222  /// codecvt<InternT, _ExternT, encoding_state> specialization.
223  // This partial specialization takes advantage of iconv to provide
224  // code conversions between a large number of character encodings.
225  template<typename _InternT, typename _ExternT>
226    class codecvt<_InternT, _ExternT, encoding_state>
227    : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
228    {
229    public:
230      // Types:
231      typedef codecvt_base::result			result;
232      typedef _InternT 					intern_type;
233      typedef _ExternT 					extern_type;
234      typedef __gnu_cxx::encoding_state 		state_type;
235      typedef state_type::descriptor_type 		descriptor_type;
236
237      // Data Members:
238      static locale::id 		id;
239
240      explicit
241      codecvt(size_t __refs = 0)
242      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
243      { }
244
245      explicit
246      codecvt(state_type& __enc, size_t __refs = 0)
247      : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
248      { }
249
250     protected:
251      virtual
252      ~codecvt() { }
253
254      virtual result
255      do_out(state_type& __state, const intern_type* __from,
256	     const intern_type* __from_end, const intern_type*& __from_next,
257	     extern_type* __to, extern_type* __to_end,
258	     extern_type*& __to_next) const;
259
260      virtual result
261      do_unshift(state_type& __state, extern_type* __to,
262		 extern_type* __to_end, extern_type*& __to_next) const;
263
264      virtual result
265      do_in(state_type& __state, const extern_type* __from,
266	    const extern_type* __from_end, const extern_type*& __from_next,
267	    intern_type* __to, intern_type* __to_end,
268	    intern_type*& __to_next) const;
269
270      virtual int
271      do_encoding() const throw();
272
273      virtual bool
274      do_always_noconv() const throw();
275
276      virtual int
277      do_length(state_type&, const extern_type* __from,
278		const extern_type* __end, size_t __max) const;
279
280      virtual int
281      do_max_length() const throw();
282    };
283
284  template<typename _InternT, typename _ExternT>
285    locale::id
286    codecvt<_InternT, _ExternT, encoding_state>::id;
287
// Calls an iconv()-like function regardless of how it declares its
// second parameter.  SUSv2 and others declare it as 'const char**',
// while glibc 2.2 uses 'char**', which matches POSIX 1003.1-2001.
// _Tp is deduced from the function pointer that is passed in, and the
// input buffer argument is converted to that type, so callers can
// always pass a plain 'char**'.
template<typename _Tp>
  inline size_t
  __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                  iconv_t __cd, char** __inbuf, size_t* __inbytes,
                  char** __outbuf, size_t* __outbytes)
  {
    // A C-style cast is used on purpose: depending on _Tp it is either
    // an identity conversion or adds const to the pointed-to pointer.
    _Tp __src = (_Tp)__inbuf;
    return (*__func)(__cd, __src, __inbytes, __outbuf, __outbytes);
  }
298
299  template<typename _InternT, typename _ExternT>
300    codecvt_base::result
301    codecvt<_InternT, _ExternT, encoding_state>::
302    do_out(state_type& __state, const intern_type* __from,
303	   const intern_type* __from_end, const intern_type*& __from_next,
304	   extern_type* __to, extern_type* __to_end,
305	   extern_type*& __to_next) const
306    {
307      result __ret = codecvt_base::error;
308      if (__state.good())
309	{
310	  const descriptor_type& __desc = __state.out_descriptor();
311	  const size_t __fmultiple = sizeof(intern_type);
312	  size_t __fbytes = __fmultiple * (__from_end - __from);
313	  const size_t __tmultiple = sizeof(extern_type);
314	  size_t __tbytes = __tmultiple * (__to_end - __to);
315
316	  // Argument list for iconv specifies a byte sequence. Thus,
317	  // all to/from arrays must be brutally casted to char*.
318	  char* __cto = reinterpret_cast<char*>(__to);
319	  char* __cfrom;
320	  size_t __conv;
321
322	  // Some encodings need a byte order marker as the first item
323	  // in the byte stream, to designate endian-ness. The default
324	  // value for the byte order marker is NULL, so if this is
325	  // the case, it's not necessary and we can just go on our
326	  // merry way.
327	  int __int_bom = __state.internal_bom();
328	  if (__int_bom)
329	    {
330	      size_t __size = __from_end - __from;
331	      intern_type* __cfixed = static_cast<intern_type*>
332		(__builtin_alloca(sizeof(intern_type) * (__size + 1)));
333	      __cfixed[0] = static_cast<intern_type>(__int_bom);
334	      char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
335	      __cfrom = reinterpret_cast<char*>(__cfixed);
336	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
337                                        &__fbytes, &__cto, &__tbytes);
338	    }
339	  else
340	    {
341	      intern_type* __cfixed = const_cast<intern_type*>(__from);
342	      __cfrom = reinterpret_cast<char*>(__cfixed);
343	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
344				       &__cto, &__tbytes);
345	    }
346
347	  if (__conv != size_t(-1))
348	    {
349	      __from_next = reinterpret_cast<const intern_type*>(__cfrom);
350	      __to_next = reinterpret_cast<extern_type*>(__cto);
351	      __ret = codecvt_base::ok;
352	    }
353	  else
354	    {
355	      if (__fbytes < __fmultiple * (__from_end - __from))
356		{
357		  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
358		  __to_next = reinterpret_cast<extern_type*>(__cto);
359		  __ret = codecvt_base::partial;
360		}
361	      else
362		__ret = codecvt_base::error;
363	    }
364	}
365      return __ret;
366    }
367
368  template<typename _InternT, typename _ExternT>
369    codecvt_base::result
370    codecvt<_InternT, _ExternT, encoding_state>::
371    do_unshift(state_type& __state, extern_type* __to,
372	       extern_type* __to_end, extern_type*& __to_next) const
373    {
374      result __ret = codecvt_base::error;
375      if (__state.good())
376	{
377	  const descriptor_type& __desc = __state.in_descriptor();
378	  const size_t __tmultiple = sizeof(intern_type);
379	  size_t __tlen = __tmultiple * (__to_end - __to);
380
381	  // Argument list for iconv specifies a byte sequence. Thus,
382	  // all to/from arrays must be brutally casted to char*.
383	  char* __cto = reinterpret_cast<char*>(__to);
384	  size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL,
385                                          &__cto, &__tlen);
386
387	  if (__conv != size_t(-1))
388	    {
389	      __to_next = reinterpret_cast<extern_type*>(__cto);
390	      if (__tlen == __tmultiple * (__to_end - __to))
391		__ret = codecvt_base::noconv;
392	      else if (__tlen == 0)
393		__ret = codecvt_base::ok;
394	      else
395		__ret = codecvt_base::partial;
396	    }
397	  else
398	    __ret = codecvt_base::error;
399	}
400      return __ret;
401    }
402
403  template<typename _InternT, typename _ExternT>
404    codecvt_base::result
405    codecvt<_InternT, _ExternT, encoding_state>::
406    do_in(state_type& __state, const extern_type* __from,
407	  const extern_type* __from_end, const extern_type*& __from_next,
408	  intern_type* __to, intern_type* __to_end,
409	  intern_type*& __to_next) const
410    {
411      result __ret = codecvt_base::error;
412      if (__state.good())
413	{
414	  const descriptor_type& __desc = __state.in_descriptor();
415	  const size_t __fmultiple = sizeof(extern_type);
416	  size_t __flen = __fmultiple * (__from_end - __from);
417	  const size_t __tmultiple = sizeof(intern_type);
418	  size_t __tlen = __tmultiple * (__to_end - __to);
419
420	  // Argument list for iconv specifies a byte sequence. Thus,
421	  // all to/from arrays must be brutally casted to char*.
422	  char* __cto = reinterpret_cast<char*>(__to);
423	  char* __cfrom;
424	  size_t __conv;
425
426	  // Some encodings need a byte order marker as the first item
427	  // in the byte stream, to designate endian-ness. The default
428	  // value for the byte order marker is NULL, so if this is
429	  // the case, it's not necessary and we can just go on our
430	  // merry way.
431	  int __ext_bom = __state.external_bom();
432	  if (__ext_bom)
433	    {
434	      size_t __size = __from_end - __from;
435	      extern_type* __cfixed =  static_cast<extern_type*>
436		(__builtin_alloca(sizeof(extern_type) * (__size + 1)));
437	      __cfixed[0] = static_cast<extern_type>(__ext_bom);
438	      char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
439	      __cfrom = reinterpret_cast<char*>(__cfixed);
440	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
441                                       &__flen, &__cto, &__tlen);
442	    }
443	  else
444	    {
445	      extern_type* __cfixed = const_cast<extern_type*>(__from);
446	      __cfrom = reinterpret_cast<char*>(__cfixed);
447	      __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
448                                       &__flen, &__cto, &__tlen);
449	    }
450
451
452	  if (__conv != size_t(-1))
453	    {
454	      __from_next = reinterpret_cast<const extern_type*>(__cfrom);
455	      __to_next = reinterpret_cast<intern_type*>(__cto);
456	      __ret = codecvt_base::ok;
457	    }
458	  else
459	    {
460	      if (__flen < static_cast<size_t>(__from_end - __from))
461		{
462		  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
463		  __to_next = reinterpret_cast<intern_type*>(__cto);
464		  __ret = codecvt_base::partial;
465		}
466	      else
467		__ret = codecvt_base::error;
468	    }
469	}
470      return __ret;
471    }
472
473  template<typename _InternT, typename _ExternT>
474    int
475    codecvt<_InternT, _ExternT, encoding_state>::
476    do_encoding() const throw()
477    {
478      int __ret = 0;
479      if (sizeof(_ExternT) <= sizeof(_InternT))
480	__ret = sizeof(_InternT) / sizeof(_ExternT);
481      return __ret;
482    }
483
484  template<typename _InternT, typename _ExternT>
485    bool
486    codecvt<_InternT, _ExternT, encoding_state>::
487    do_always_noconv() const throw()
488    { return false; }
489
490  template<typename _InternT, typename _ExternT>
491    int
492    codecvt<_InternT, _ExternT, encoding_state>::
493    do_length(state_type&, const extern_type* __from,
494	      const extern_type* __end, size_t __max) const
495    { return std::min(__max, static_cast<size_t>(__end - __from)); }
496
497  // _GLIBCXX_RESOLVE_LIB_DEFECTS
498  // 74.  Garbled text for codecvt::do_max_length
499  template<typename _InternT, typename _ExternT>
500    int
501    codecvt<_InternT, _ExternT, encoding_state>::
502    do_max_length() const throw()
503    { return 1; }
504
505_GLIBCXX_END_NAMESPACE
506
507#endif
508