1#include <string>
2
3#if !defined (STLPORT) || !defined (_STLP_USE_NO_IOSTREAMS)
4#  include <fstream>
5#  include <locale>
6#  include <stdexcept>
7#  include <cstdio> // for WEOF
8
9#  include "cppunit/cppunit_proxy.h"
10
11#  if !defined (STLPORT) || defined(_STLP_USE_NAMESPACES)
12using namespace std;
13#  endif
14
15//
16// TestCase class
17//
// Test fixture for the codecvt facet tests in this file. The suite macros
// below register five test methods; the preprocessor conditions mirror the
// guards used inside the corresponding test bodies.
class CodecvtTest : public CPPUNIT_NS::TestCase
{
  CPPUNIT_TEST_SUITE(CodecvtTest);
  // variable_encoding is placed in an ignore section when STLport is built
  // without member templates (its body is compiled out under that condition).
#if defined (STLPORT) && defined (_STLP_NO_MEMBER_TEMPLATES)
  CPPUNIT_IGNORE;
#endif
  CPPUNIT_TEST(variable_encoding);
  CPPUNIT_STOP_IGNORE;
  // The remaining tests are placed in an ignore section when STLport has no
  // wchar_t support or is built without exceptions.
#if defined (STLPORT) && (defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
  CPPUNIT_IGNORE;
#endif
  CPPUNIT_TEST(in_out_test);
  CPPUNIT_TEST(length_test);
  CPPUNIT_TEST(imbue_while_reading);
  CPPUNIT_TEST(special_encodings);
  CPPUNIT_TEST_SUITE_END();

protected:
  // Reads a generated file through eater_codecvt and checks tellg() after
  // every character.
  void variable_encoding();
  // Single-character in()/out() conversions with the user-preferred locale.
  void in_out_test();
  // codecvt::length() on a plain ASCII string with the user-preferred locale.
  void length_test();
  // Switches the locale of a wifstream after reading and seeking.
  void imbue_while_reading();
  // "C", code page 936 and UTF-8 conversions through codecvt / codecvt_byname.
  void special_encodings();
};

// Makes the suite known to the test runner.
CPPUNIT_TEST_SUITE_REGISTRATION(CodecvtTest);
44
// Exception specification used on the facet overrides below: STLport's own
// macro when building against STLport, an empty throw() list otherwise.
#if defined (STLPORT)
#  define __NO_THROW _STLP_NOTHROW
#else
#  define __NO_THROW throw()
#endif


/* Codecvt facet eating some characters from the external buffer.
 * Transforms each external "01" pair into a single internal 'a'; every other
 * character (including a '0' that is not followed by '1') is copied through.
 *
 * The first byte of the mbstate_t is used as the shift state:
 *   0 - nothing pending
 *   1 - a '0' has been consumed from the external buffer but the character
 *       that follows it has not been seen yet
 */
struct eater_codecvt : public std::codecvt<char, char, std::mbstate_t> {
  typedef std::codecvt<char, char, std::mbstate_t> base;

  explicit eater_codecvt(std::size_t refs = 0) : base(refs) {}

  // Primitive conversion, external -> internal.
  //
  // On return ecur/icur point one past the last external character consumed
  // and the last internal character produced. Returns partial when the
  // internal buffer is full while external characters remain, ok otherwise
  // (including when the input ends on a '0', which is then kept as pending
  // in the shift state).
  //
  // Every internal character written corresponds to external characters that
  // are reported as consumed, so a caller resuming from ecur never receives
  // the same character twice, whatever the size of the internal buffer.
  virtual base::result
  do_in(std::mbstate_t& mb,
        const char* ebegin, const char* eend, const char*& ecur,
        char* ibegin, char* iend, char*& icur) const __NO_THROW {
      char *state = reinterpret_cast<char*>(&mb);
      ecur = ebegin;
      icur = ibegin;

      while (ecur != eend) {
          if (icur == iend)
              return partial;

          if (*state == 1 || *ecur == '0') {
            if (*state != 1) {
              // Consume the '0'; what it becomes depends on the next character.
              ++ecur;
            }
            if (ecur == eend) {
              // Remember the pending '0' for the next call.
              *state = 1;
              return ok;
            }

            if (*ecur == '1') {
              // "01" -> 'a'
              *icur = 'a';
              ++ecur;
            }
            else {
              // Lone '0': emit it and leave *ecur unconsumed so that the next
              // iteration examines it (it may itself start a "01" pair, or
              // the internal buffer may now be full).
              *icur = '0';
            }
          }
          else {
            *icur = *ecur;
            ++ecur;
          }

          *state = 0;
          ++icur;
      }

      return ok;
  }

  // claim it's not a null-conversion
  virtual bool do_always_noconv() const __NO_THROW
  { return false; }

  // claim it doesn't have a fixed-length encoding
  virtual int do_encoding() const __NO_THROW
  { return 0; }

  // Implemented for consistency with the do_in overload: runs do_in into a
  // scratch buffer of m internal characters and returns the number of
  // external characters it consumed. The conversion works on a copy of the
  // shift state, so the caller's state is left untouched.
  virtual int do_length(std::mbstate_t &state,
                        const char *efrom, const char *eend, std::size_t m) const {
    char *ibegin = new char[m];
    const char *ecur = efrom;
    char *icur = ibegin;
    std::mbstate_t tmp = state;
    // do_in is declared non-throwing, so the scratch buffer cannot leak here.
    do_in(tmp, efrom, eend, ecur, ibegin, ibegin + m, icur);
    delete[] ibegin;
    return static_cast<int>(ecur - efrom);
  }

  // At most two external characters ("01") are needed for one internal one.
  virtual int do_max_length() const __NO_THROW
  { return 2; }

#ifdef __DMC__
  static std::locale::id id;
#endif
};

#ifdef __DMC__
std::locale::id eater_codecvt::id;

std::locale::id& _GetFacetId(const eater_codecvt*)
{ return eater_codecvt::id; }
#endif
141
142/* Codecvt facet generating more characters than the ones read from the
143 * external buffer, transform '01' in 'abc'
144 * This kind of facet do not allow systematical positionning in the external
145 * buffer (tellg -> -1), when you just read a 'a' you are at an undefined
146 * external buffer position.
147 */
148struct generator_codecvt : public codecvt<char, char, mbstate_t> {
149  typedef codecvt<char,char,mbstate_t> base;
150
151  explicit generator_codecvt(size_t refs = 0) : base(refs) {}
152
153  // primitive conversion
154  virtual base::result
155  do_in(mbstate_t& mb,
156        const char* ebegin, const char* eend, const char*& ecur,
157        char* ibegin, char* iend, char*& icur) const __NO_THROW {
158      //Access the mbstate information in a portable way:
159      char *state = (char*)&mb;
160      ecur = ebegin;
161      icur = ibegin;
162
163      if (icur == iend) return ok;
164
165      if (*state == 2) {
166        *(icur++) = 'b';
167        if (icur == iend) {
168          *state = 3;
169          return ok;
170        }
171        *(icur++) = 'c';
172        *state = 0;
173      }
174      else if (*state == 3) {
175        *(icur++) = 'c';
176        *state = 0;
177      }
178
179      while (ecur != eend) {
180          if (icur == iend)
181              return ok;
182          if (*ecur == '0' || *state == 1) {
183            if (*state != 1) {
184              ++ecur;
185            }
186            if (ecur == eend) {
187              *state = 1;
188              return partial;
189            }
190
191            if (*ecur == '1') {
192              *(icur++) = 'a';
193              if (icur == iend) {
194                *state = 2;
195                return ok;
196              }
197              *(icur++) = 'b';
198              if (icur == iend) {
199                *state = 3;
200                return ok;
201              }
202              *icur = 'c';
203            }
204            else {
205              *(icur++) = '0';
206              if (icur == iend) {
207                if (*state != 1) {
208                  --ecur;
209                }
210                return ok;
211              }
212              *icur = *ecur;
213            }
214          }
215          else {
216            *icur = *ecur;
217          }
218
219          *state = 0;
220          ++icur;
221          ++ecur;
222      }
223
224      return ok;
225  }
226
227  // claim it's not a null-conversion
228  virtual bool do_always_noconv() const __NO_THROW
229  { return false; }
230
231  // claim it doesn't have a fixed-length encoding
232  virtual int do_encoding() const __NO_THROW
233  { return 0; }
234
235  // implemented for consistency with do_in overload
236  virtual int do_length(mbstate_t &mb,
237                        const char *efrom, const char *eend, size_t m) const {
238    const char *state = (const char*)&mb;
239    int offset = 0;
240    if (*state == 2)
241      offset = 2;
242    else if (*state == 3)
243      offset = 1;
244
245    char *ibegin = new char[m + offset];
246    const char *ecur = efrom;
247    char *icur = ibegin;
248    mbstate_t tmpState = mb;
249    do_in(tmpState, efrom, eend, ecur, ibegin, ibegin + m + offset, icur);
250    /*
251    char *state = (char*)&tmpState;
252    if (*state != 0) {
253      if (*state == 1)
254        --ecur;
255      else if (*state == 2 || *state == 3) {
256        //Undefined position, we return -1:
257        ecur = efrom - 1;
258      }
259    }
260    else {
261      if (*((char*)&mb) != 0) {
262        //We take into account the character that hasn't been counted yet in
263        //the previous decoding step:
264        ecur++;
265      }
266    }
267    */
268    delete[] ibegin;
269    return (int)min((size_t)(ecur - efrom), m);
270  }
271
272  virtual int do_max_length() const __NO_THROW
273  { return 0; }
274#ifdef __DMC__
275  static locale::id id;
276#endif
277};
278
279#ifdef __DMC__
280locale::id generator_codecvt::id;
281
282locale::id& _GetFacetId(const generator_codecvt*)
283{ return generator_codecvt::id; }
284#endif
285
286//
287// tests implementation
288//
289#include <iostream>
290void CodecvtTest::variable_encoding()
291{
292#if !defined (STLPORT) || !defined (_STLP_NO_MEMBER_TEMPLATES)
293  //We first generate the file used for test:
294  const char* fileName = "test_file.txt";
295  {
296    ofstream ostr(fileName);
297    //Maybe we simply do not have write access to repository
298    CPPUNIT_ASSERT( ostr.good() );
299    for (int i = 0; i < 2048; ++i) {
300      ostr << "0123456789";
301    }
302    CPPUNIT_ASSERT( ostr.good() );
303  }
304
305  {
306    ifstream istr(fileName);
307    CPPUNIT_ASSERT( istr.good() );
308    CPPUNIT_ASSERT( !istr.eof() );
309
310    eater_codecvt codec(1);
311    locale loc(locale::classic(), &codec);
312
313    istr.imbue(loc);
314    CPPUNIT_ASSERT( istr.good() );
315    CPPUNIT_ASSERT( (int)istr.tellg() == 0 );
316
317    int theoricalPos = 0;
318    do {
319      int c = istr.get();
320      if (char_traits<char>::eq_int_type(c, char_traits<char>::eof())) {
321        break;
322      }
323      ++theoricalPos;
324      if (c == 'a') {
325        ++theoricalPos;
326      }
327
328      CPPUNIT_ASSERT( (int)istr.tellg() == theoricalPos );
329    }
330    while (!istr.eof());
331    cerr << "out!\n";
332    CPPUNIT_ASSERT( istr.eof() );
333    cerr << "fin!\n";
334  }
335
336#  if 0
337  /* This test is broken, not sure if it is really possible to get a position in
338   * a locale having a codecvt such as generator_codecvt. Maybe generator_codecvt
339   * is not a valid theorical example of codecvt implementation. */
340  {
341    ifstream istr(fileName);
342    CPPUNIT_ASSERT( istr.good() );
343    CPPUNIT_ASSERT( !istr.eof() );
344
345    generator_codecvt codec(1);
346    locale loc(locale::classic(), &codec);
347
348    istr.imbue(loc);
349    CPPUNIT_ASSERT( istr.good() );
350    CPPUNIT_ASSERT( (int)istr.tellg() == 0 );
351
352    int theoricalPos = 0;
353    int theoricalTellg;
354    do {
355      char c = istr.get();
356      if (c == char_traits<char>::eof()) {
357        break;
358      }
359      switch (c) {
360        case 'a':
361        case 'b':
362          theoricalTellg = -1;
363          break;
364        case 'c':
365          ++theoricalPos;
366        default:
367          ++theoricalPos;
368          theoricalTellg = theoricalPos;
369          break;
370      }
371
372      if ((int)istr.tellg() != theoricalTellg) {
373        CPPUNIT_ASSERT( (int)istr.tellg() == theoricalTellg );
374      }
375    }
376    while (!istr.eof());
377
378    CPPUNIT_ASSERT( istr.eof() );
379  }
380#  endif
381#endif
382}
383
384void CodecvtTest::in_out_test()
385{
386#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
387  try {
388    locale loc("");
389
390    typedef codecvt<wchar_t, char, mbstate_t> cdecvt_type;
391    if (has_facet<cdecvt_type>(loc)) {
392      cdecvt_type const& cdect = use_facet<cdecvt_type>(loc);
393      {
394        cdecvt_type::state_type state;
395        memset(&state, 0, sizeof(cdecvt_type::state_type));
396        string from("abcdef");
397        const char* next_from;
398        wchar_t to[1];
399        wchar_t *next_to;
400        cdecvt_type::result res = cdect.in(state, from.data(), from.data() + from.size(), next_from,
401                                           to, to + sizeof(to) / sizeof(wchar_t), next_to);
402        CPPUNIT_ASSERT( res == cdecvt_type::ok );
403        CPPUNIT_ASSERT( next_from == from.data() + 1 );
404        CPPUNIT_ASSERT( next_to == &to[0] + 1 );
405        CPPUNIT_ASSERT( to[0] == L'a');
406      }
407      {
408        cdecvt_type::state_type state;
409        memset(&state, 0, sizeof(cdecvt_type::state_type));
410        wstring from(L"abcdef");
411        const wchar_t* next_from;
412        char to[1];
413        char *next_to;
414        cdecvt_type::result res = cdect.out(state, from.data(), from.data() + from.size(), next_from,
415                                            to, to + sizeof(to) / sizeof(char), next_to);
416        CPPUNIT_ASSERT( res == cdecvt_type::ok );
417        CPPUNIT_ASSERT( next_from == from.data() + 1 );
418        CPPUNIT_ASSERT( next_to == &to[0] + 1 );
419        CPPUNIT_ASSERT( to[0] == 'a');
420      }
421    }
422  }
423  catch (runtime_error const&) {
424  }
425  catch (...) {
426    CPPUNIT_FAIL;
427  }
428#endif
429}
430
431void CodecvtTest::length_test()
432{
433#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
434  try {
435    locale loc("");
436
437    typedef codecvt<wchar_t, char, mbstate_t> cdecvt_type;
438    if (has_facet<cdecvt_type>(loc)) {
439      cdecvt_type const& cdect = use_facet<cdecvt_type>(loc);
440      {
441        cdecvt_type::state_type state;
442        memset(&state, 0, sizeof(cdecvt_type::state_type));
443        string from("abcdef");
444        int res = cdect.length(state, from.data(), from.data() + from.size(), from.size());
445        CPPUNIT_ASSERT( (size_t)res == from.size() );
446      }
447    }
448  }
449  catch (runtime_error const&) {
450  }
451  catch (...) {
452    CPPUNIT_FAIL;
453  }
454#endif
455}
456
#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
typedef std::codecvt<wchar_t, char, std::mbstate_t> my_codecvt_base;

// Fixed-width facet used by imbue_while_reading: every pair of external bytes
// is combined, low byte first, into one wide character.
class my_codecvt : public my_codecvt_base {
public:
  explicit my_codecvt(std::size_t r = 0)
   : my_codecvt_base(r) {}

protected:
  // Converts as many complete byte pairs as fit into [first2, last2).
  // Returns partial when fewer than two external bytes remain or when the
  // internal buffer is full while input is left, ok once all input has been
  // consumed. next1/next2 are left one past the last byte consumed and the
  // last wide character produced. The shift state is not used.
  virtual result do_in(state_type& /*state*/, const extern_type* first1,
                       const extern_type* last1, const extern_type*& next1,
                       intern_type* first2, intern_type* last2,
                       intern_type*& next2) const {
    for ( next1 = first1, next2 = first2; next1 < last1; next1 += 2 ) {
      if ( (last1 - next1) < 2 || (last2 - next2) < 1 )
        return partial;
      // Combine the bytes as unsigned values: plain char may be signed, and
      // a byte >= 0x80 would otherwise be sign-extended (and a negative value
      // shifted left) before being stored in the wide character.
      const unsigned int lo = static_cast<unsigned char>(next1[0]);
      const unsigned int hi = static_cast<unsigned char>(next1[1]);
      *next2++ = static_cast<intern_type>((hi << 8) | lo);
    }
    return ok;
  }
  // claim it's not a null-conversion
  virtual bool do_always_noconv() const __NO_THROW
  { return false; }
  // two external bytes at most per wide character
  virtual int do_max_length() const __NO_THROW
  { return 2; }
  // fixed-width encoding: exactly two external bytes per wide character
  virtual int do_encoding() const __NO_THROW
  { return 2; }
};
#endif
485
486void CodecvtTest::imbue_while_reading()
487{
488#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
489  {
490    wofstream ofs( "test.txt" );
491    const wchar_t buf[] = L" ";
492    for ( int i = 0; i < 4098; ++i ) {
493      ofs << buf[0];
494    }
495  }
496
497  wifstream ifs("test.txt"); // a file containing 4098 wchars
498
499  ifs.imbue( locale(locale(), new my_codecvt) );
500  ifs.get();
501  ifs.seekg(0);
502  ifs.imbue( locale() );
503  ifs.ignore(4096);
504  int ch = ifs.get();
505  CPPUNIT_CHECK( ch != (int)WEOF );
506#endif
507}
508
509void CodecvtTest::special_encodings()
510{
511#if !defined (STLPORT) || (!defined (_STLP_NO_WCHAR_T) && defined (_STLP_USE_EXCEPTIONS))
512  {
513    locale loc(locale::classic(), new codecvt_byname<wchar_t, char, mbstate_t>("C"));
514    codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc);
515    mbstate_t state;
516    memset(&state, 0, sizeof(mbstate_t));
517    char c = '0';
518    const char *from_next;
519    wchar_t wc;
520    wchar_t *to_next;
521    CPPUNIT_ASSERT( cvt.in(state, &c, &c + 1, from_next, &wc, &wc, to_next) == codecvt_base::ok );
522    CPPUNIT_ASSERT( to_next == &wc );
523    CPPUNIT_ASSERT( cvt.in(state, &c, &c + 1, from_next, &wc, &wc + 1, to_next) == codecvt_base::ok );
524    CPPUNIT_ASSERT( wc == L'0' );
525    CPPUNIT_ASSERT( to_next == &wc + 1 );
526  }
527  try
528  {
529    wstring cp936_wstr;
530    const string cp936_str = "\xd6\xd0\xb9\xfa\xc9\xe7\xbb\xe1\xbf\xc6\xd1\xa7\xd4\xba\xb7\xa2\xb2\xbc\x32\x30\x30\x38\xc4\xea\xa1\xb6\xbe\xad\xbc\xc3\xc0\xb6\xc6\xa4\xca\xe9\xa1\xb7\xd6\xb8\xb3\xf6\xa3\xac\x32\x30\x30\x37\xc4\xea\xd6\xd0\xb9\xfa\xbe\xad\xbc\xc3\xd4\xf6\xb3\xa4\xd3\xc9\xc6\xab\xbf\xec\xd7\xaa\xcf\xf2\xb9\xfd\xc8\xc8\xb5\xc4\xc7\xf7\xca\xc6\xc3\xf7\xcf\xd4\xd4\xa4\xbc\xc6\xc8\xab\xc4\xea\x47\x44\x50\xd4\xf6\xcb\xd9\xbd\xab\xb4\xef\x31\x31\x2e\x36\x25\xa1\xa3";
531    locale loc(locale::classic(), ".936", locale::ctype);
532    codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc);
533    mbstate_t state;
534    memset(&state, 0, sizeof(mbstate_t));
535
536    codecvt_base::result res;
537
538    {
539      wchar_t wbuf[4096];
540      // Check we will have enough room for the generated wide string generated from the whole char buffer:
541      int len = cvt.length(state, cp936_str.data(), cp936_str.data() + cp936_str.size(), sizeof(wbuf) / sizeof(wchar_t));
542      CPPUNIT_ASSERT( cp936_str.size() == (size_t)len );
543
544      const char *from_next;
545      wchar_t *to_next;
546      res = cvt.in(state, cp936_str.data(), cp936_str.data() + cp936_str.size(), from_next,
547                          wbuf, wbuf + sizeof(wbuf) / sizeof(wchar_t), to_next);
548      CPPUNIT_ASSERT( res == codecvt_base::ok );
549      CPPUNIT_ASSERT( from_next == cp936_str.data() + cp936_str.size() );
550      cp936_wstr.assign(wbuf, to_next);
551    }
552
553    {
554      const wchar_t *from_next;
555      char buf[4096];
556      char *to_next;
557      res = cvt.out(state, cp936_wstr.data(), cp936_wstr.data() + cp936_wstr.size(), from_next,
558                           buf, buf + sizeof(buf), to_next);
559      CPPUNIT_ASSERT( res == codecvt_base::ok );
560      CPPUNIT_CHECK( string(buf, to_next) == cp936_str );
561    }
562  }
563  catch (const runtime_error&)
564  {
565    CPPUNIT_MESSAGE("Not enough platform localization support to check 936 code page encoding.");
566  }
567  try
568  {
569    const string utf8_str = "\xe4\xb8\xad\xe5\x9b\xbd\xe7\xa4\xbe\xe4\xbc\x9a\xe7\xa7\x91\xe5\xad\xa6\xe9\x99\xa2\xe5\x8f\x91\xe5\xb8\x83\x32\x30\x30\x38\xe5\xb9\xb4\xe3\x80\x8a\xe7\xbb\x8f\xe6\xb5\x8e\xe8\x93\x9d\xe7\x9a\xae\xe4\xb9\xa6\xe3\x80\x8b\xe6\x8c\x87\xe5\x87\xba\xef\xbc\x8c\x32\x30\x30\x37\xe5\xb9\xb4\xe4\xb8\xad\xe5\x9b\xbd\xe7\xbb\x8f\xe6\xb5\x8e\xe5\xa2\x9e\xe9\x95\xbf\xe7\x94\xb1\xe5\x81\x8f\xe5\xbf\xab\xe8\xbd\xac\xe5\x90\x91\xe8\xbf\x87\xe7\x83\xad\xe7\x9a\x84\xe8\xb6\x8b\xe5\x8a\xbf\xe6\x98\x8e\xe6\x98\xbe\xe9\xa2\x84\xe8\xae\xa1\xe5\x85\xa8\xe5\xb9\xb4\x47\x44\x50\xe5\xa2\x9e\xe9\x80\x9f\xe5\xb0\x86\xe8\xbe\xbe\x31\x31\x2e\x36\x25\xe3\x80\x82";
570    wstring utf8_wstr;
571    locale loc(locale::classic(), new codecvt_byname<wchar_t, char, mbstate_t>(".utf8"));
572    codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc);
573    mbstate_t state;
574    memset(&state, 0, sizeof(mbstate_t));
575
576    codecvt_base::result res;
577
578    {
579      wchar_t wbuf[4096];
580      // Check we will have enough room for the wide string generated from the whole char buffer:
581      int len = cvt.length(state, utf8_str.data(), utf8_str.data() + utf8_str.size(), sizeof(wbuf) / sizeof(wchar_t));
582      CPPUNIT_ASSERT( utf8_str.size() == (size_t)len );
583
584      const char *from_next;
585      wchar_t *to_next;
586      res = cvt.in(state, utf8_str.data(), utf8_str.data() + utf8_str.size(), from_next,
587                          wbuf, wbuf + sizeof(wbuf) / sizeof(wchar_t), to_next);
588      CPPUNIT_ASSERT( res == codecvt_base::ok );
589      CPPUNIT_ASSERT( from_next == utf8_str.data() + utf8_str.size() );
590      utf8_wstr.assign(wbuf, to_next);
591
592      // Try to read one char after the other:
593      wchar_t wc;
594      const char* from = utf8_str.data();
595      const char* from_end = from + utf8_str.size();
596      from_next = utf8_str.data();
597      size_t length = 1;
598      size_t windex = 0;
599      while (from + length <= from_end) {
600        res = cvt.in(state, from, from + length, from_next,
601                            &wc, &wc + 1, to_next);
602        switch (res) {
603          case codecvt_base::ok:
604            // reset length:
605            from = from_next;
606            length = 1;
607            CPPUNIT_ASSERT( wc == utf8_wstr[windex++] );
608            wc = 0;
609            break;
610          case codecvt_base::partial:
611            if (from_next == from)
612              // from_next hasn't move so we have to pass more chars
613              ++length;
614            else
615              // char between from and from_next has been eaten, we simply restart
616              // conversion from from_next:
617              from = from_next;
618            continue;
619          case codecvt_base::error:
620          case codecvt_base::noconv:
621            CPPUNIT_FAIL;
622            //break;
623        }
624      }
625      CPPUNIT_ASSERT( windex == utf8_wstr.size() );
626    }
627
628    {
629      const wchar_t *from_next;
630      char buf[4096];
631      char *to_next;
632      res = cvt.out(state, utf8_wstr.data(), utf8_wstr.data() + utf8_wstr.size(), from_next,
633                           buf, buf + sizeof(buf), to_next);
634      CPPUNIT_ASSERT( res == codecvt_base::ok );
635      CPPUNIT_CHECK( string(buf, to_next) == utf8_str );
636    }
637
638    {
639      // Check that an obviously wrong UTF8 encoded string is correctly detected:
640      const string bad_utf8_str("\xdf\xdf\xdf\xdf\xdf");
641      wchar_t wc;
642      const char *from_next;
643      wchar_t *to_next;
644      res = cvt.in(state, bad_utf8_str.data(), bad_utf8_str.data() + bad_utf8_str.size(), from_next,
645                          &wc, &wc + 1, to_next);
646      CPPUNIT_ASSERT( res == codecvt_base::error );
647    }
648  }
649  catch (const runtime_error&)
650  {
651    CPPUNIT_MESSAGE("Not enough platform localization support to check UTF8 encoding.");
652  }
653#endif
654}
655
656#endif
657