test-stlport/unit/codecvt_test.cpp

#include <string>

#if !defined (STLPORT) || !defined (_STLP_USE_NO_IOSTREAMS)
#  include <fstream>
#  include <locale>
#  include <stdexcept>
#  include <cstdio> // for WEOF

#  include "cppunit/cppunit_proxy.h"

#  if !defined (STLPORT) || defined(_STLP_USE_NAMESPACES)
using namespace std;
#  endif

//
// TestCase class
//
// Test fixture grouping the codecvt related checks of this file. The
// CPPUNIT_* macros come from cppunit/cppunit_proxy.h; the notes about them
// below are what the usage suggests, not something this file defines.
class CodecvtTest : public CPPUNIT_NS::TestCase
{
  CPPUNIT_TEST_SUITE(CodecvtTest);
  // variable_encoding is bracketed by IGNORE/STOP_IGNORE when STLport is
  // built without member template support (presumably so it is reported as
  // ignored rather than run - confirm against the cppunit proxy header).
#if defined (STLPORT) && defined (_STLP_NO_MEMBER_TEMPLATES)
  CPPUNIT_IGNORE;
#endif
  CPPUNIT_TEST(variable_encoding);
  CPPUNIT_STOP_IGNORE;
  // The remaining tests need both wchar_t and exception support; the same
  // condition guards their bodies further down in this file.
#if defined (STLPORT) && (defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
  CPPUNIT_IGNORE;
#endif
  CPPUNIT_TEST(in_out_test);
  CPPUNIT_TEST(length_test);
  CPPUNIT_TEST(imbue_while_reading);
  CPPUNIT_TEST(special_encodings);
  CPPUNIT_TEST_SUITE_END();

protected:
  // tellg() positions while reading through a variable length codecvt.
  void variable_encoding();
  // codecvt<wchar_t, char, mbstate_t>::in / out with a one element output.
  void in_out_test();
  // codecvt<wchar_t, char, mbstate_t>::length on a plain ASCII string.
  void length_test();
  // Changing the imbued locale of a wifstream after reading has started.
  void imbue_while_reading();
  // "C", code page 936 and UTF-8 conversions.
  void special_encodings();
};

// Registers the suite through the cppunit proxy macro.
CPPUNIT_TEST_SUITE_REGISTRATION(CodecvtTest);

// Exception specification appended to the facet virtual overrides below:
// STLport's own _STLP_NOTHROW when building against STLport, a plain empty
// throw() specification otherwise.
#if defined (STLPORT)
#  define __NO_THROW _STLP_NOTHROW
#else
#  define __NO_THROW throw()
#endif


/* Codecvt facet eating some characters from the external buffer:
 * the external sequence '01' is transformed into the single internal
 * character 'a', any other character is copied as is.
 * The first byte of the mbstate_t is used as a flag: 1 means that a '0'
 * ending the previous external buffer has been consumed and is still pending.
 */
struct eater_codecvt : public codecvt<char, char, mbstate_t> {
  typedef codecvt<char,char,mbstate_t> base;

  // refs is forwarded unchanged to the codecvt base class constructor.
  explicit eater_codecvt(size_t refs = 0) : base(refs) {}

  // primitive conversion
  // Converts external chars from [ebegin, eend) into [ibegin, iend). ecur and
  // icur are reset to the start of their buffer and then advanced as
  // characters are read and written.
  virtual base::result
  do_in(mbstate_t& mb,
        const char* ebegin, const char* eend, const char*& ecur,
        char* ibegin, char* iend, char*& icur) const __NO_THROW {
      // Only the first byte of the state object is used (pending '0' flag).
      char *state = (char*)&mb;
      ecur = ebegin;
      icur = ibegin;

      while (ecur != eend) {
          // No room left in the internal buffer.
          if (icur == iend)
              return partial;
          if (*ecur == '0' || *state == 1) {
            // Step over the '0' unless a previous call already consumed it.
            if (*state != 1) {
              ++ecur;
            }
            // The '0' was the last external char available: record it in the
            // state and report ok, the decision is taken on the next call.
            if (ecur == eend) {
              *state = 1;
              return ok;
            }

            if (*ecur == '1') {
              // '01' -> 'a'
              *icur = 'a';
            }
            else {
              // Not a '01' sequence: output the '0' then the current char.
              *(icur++) = '0';
              if (icur == iend) {
                if (*state != 1) {
                  --ecur;
                }
                // NOTE(review): icur already counts the '0' written above
                // while ecur is moved back onto that same '0' (or the state
                // stays 1); looks like a resumed conversion would output the
                // '0' a second time - confirm.
                return partial;
              }
              *icur = *ecur;
            }
          }
          else {
            // Plain character, copied unchanged.
            *icur = *ecur;
          }

          // One internal char completed: clear the pending flag and advance.
          *state = 0;
          ++icur;
          ++ecur;
      }

      return ok;
  }

  // claim it's not a null-conversion
  virtual bool do_always_noconv() const __NO_THROW
  { return false; }

  // claim it doesn't have a fixed-length encoding
  virtual int do_encoding() const __NO_THROW
  { return 0; }

  // implemented for consistency with do_in overload
  // Runs do_in on a copy of state with a scratch output buffer of m chars and
  // returns the number of external chars do_in consumed; state itself is not
  // modified.
  virtual int do_length(mbstate_t &state,
                        const char *efrom, const char *eend, size_t m) const {
    char *ibegin = new char[m];
    const char *ecur = efrom;
    char *icur = ibegin;
    mbstate_t tmp = state;
    do_in(tmp, efrom, eend, ecur, ibegin, ibegin + m, icur);
    delete[] ibegin;
    return ecur - efrom;
  }

  // Two external chars ('01') can be needed for one internal char ('a').
  virtual int do_max_length() const __NO_THROW
  { return 2; }

#ifdef __DMC__
  // Facet id declared explicitly for the __DMC__ build only.
  static locale::id id;
#endif
};

#ifdef __DMC__
// Definition of the static facet id declared in eater_codecvt (__DMC__ only).
locale::id eater_codecvt::id;

// Gives access to the id of eater_codecvt; presumably picked up by the
// library's locale code under __DMC__ - TODO confirm against the library.
locale::id& _GetFacetId(const eater_codecvt*)
{ return eater_codecvt::id; }
#endif

/* Codecvt facet generating more characters than the ones read from the
 * external buffer: it transforms '01' into 'abc'.
 * This kind of facet does not allow systematic positioning in the external
 * buffer (tellg -> -1): when you have just read an 'a' you are at an
 * undefined external buffer position.
 * The first byte of the mbstate_t holds the conversion state:
 *   1: a '0' ending the previous external buffer is pending,
 *   2: 'a' has been generated, 'b' and 'c' are still to be output,
 *   3: 'a' and 'b' have been generated, 'c' is still to be output.
 */
struct generator_codecvt : public codecvt<char, char, mbstate_t> {
  typedef codecvt<char,char,mbstate_t> base;

  // refs is forwarded unchanged to the codecvt base class constructor.
  explicit generator_codecvt(size_t refs = 0) : base(refs) {}

  // primitive conversion
  // Converts external chars from [ebegin, eend) into [ibegin, iend). ecur and
  // icur are reset to the start of their buffer and then advanced as
  // characters are read and written.
  virtual base::result
  do_in(mbstate_t& mb,
        const char* ebegin, const char* eend, const char*& ecur,
        char* ibegin, char* iend, char*& icur) const __NO_THROW {
      //Access the mbstate information in a portable way:
      char *state = (char*)&mb;
      ecur = ebegin;
      icur = ibegin;

      // Empty internal buffer: nothing can be produced.
      if (icur == iend) return ok;

      // First flush what is left of an 'abc' sequence started by a previous
      // call.
      if (*state == 2) {
        *(icur++) = 'b';
        if (icur == iend) {
          *state = 3;
          return ok;
        }
        *(icur++) = 'c';
        *state = 0;
      }
      else if (*state == 3) {
        *(icur++) = 'c';
        *state = 0;
      }

      while (ecur != eend) {
          // Unlike eater_codecvt, a full internal buffer is reported as ok.
          if (icur == iend)
              return ok;
          if (*ecur == '0' || *state == 1) {
            // Step over the '0' unless a previous call already consumed it.
            if (*state != 1) {
              ++ecur;
            }
            // The '0' was the last external char available: record it in the
            // state and ask for more input.
            if (ecur == eend) {
              *state = 1;
              return partial;
            }

            if (*ecur == '1') {
              // '01' -> 'abc', stopping as soon as the internal buffer is
              // full and recording in the state what remains to be output.
              // NOTE(review): on the two early returns below ecur still
              // points at the '1'; looks like the resumed call would copy
              // that '1' to the output after flushing 'b'/'c' - confirm.
              *(icur++) = 'a';
              if (icur == iend) {
                *state = 2;
                return ok;
              }
              *(icur++) = 'b';
              if (icur == iend) {
                *state = 3;
                return ok;
              }
              *icur = 'c';
            }
            else {
              // Not a '01' sequence: output the '0' then the current char.
              *(icur++) = '0';
              if (icur == iend) {
                if (*state != 1) {
                  --ecur;
                }
                return ok;
              }
              *icur = *ecur;
            }
          }
          else {
            // Plain character, copied unchanged.
            *icur = *ecur;
          }

          // Sequence completed: clear the state and advance both cursors.
          *state = 0;
          ++icur;
          ++ecur;
      }

      return ok;
  }

  // claim it's not a null-conversion
  virtual bool do_always_noconv() const __NO_THROW
  { return false; }

  // claim it doesn't have a fixed-length encoding
  virtual int do_encoding() const __NO_THROW
  { return 0; }

  // implemented for consistency with do_in overload
  // Runs do_in on a copy of mb with a scratch output buffer and returns the
  // number of external chars consumed, capped to m. The scratch buffer gets
  // extra room for the 'b'/'c' chars still pending in the incoming state.
  virtual int do_length(mbstate_t &mb,
                        const char *efrom, const char *eend, size_t m) const {
    const char *state = (const char*)&mb;
    int offset = 0;
    if (*state == 2)
      offset = 2;
    else if (*state == 3)
      offset = 1;

    char *ibegin = new char[m + offset];
    const char *ecur = efrom;
    char *icur = ibegin;
    mbstate_t tmpState = mb;
    do_in(tmpState, efrom, eend, ecur, ibegin, ibegin + m + offset, icur);
    /*
    char *state = (char*)&tmpState;
    if (*state != 0) {
      if (*state == 1)
        --ecur;
      else if (*state == 2 || *state == 3) {
        //Undefined position, we return -1:
        ecur = efrom - 1;
      }
    }
    else {
      if (*((char*)&mb) != 0) {
        //We take into account the character that hasn't been counted yet in
        //the previous decoding step:
        ecur++;
      }
    }
    */
    delete[] ibegin;
    return (int)min((size_t)(ecur - efrom), m);
  }

  // NOTE(review): returns 0 although do_in may read two external chars
  // ('01') before producing an internal one - confirm this is intended.
  virtual int do_max_length() const __NO_THROW
  { return 0; }
#ifdef __DMC__
  // Facet id declared explicitly for the __DMC__ build only.
  static locale::id id;
#endif
};

#ifdef __DMC__
// Definition of the static facet id declared in generator_codecvt (__DMC__
// only).
locale::id generator_codecvt::id;

// Gives access to the id of generator_codecvt; presumably picked up by the
// library's locale code under __DMC__ - TODO confirm against the library.
locale::id& _GetFacetId(const generator_codecvt*)
{ return generator_codecvt::id; }
#endif

//
// tests implementation
//
#include <iostream>
/* Checks the positions reported by tellg() while a file is read through
 * eater_codecvt, a facet consuming two external chars ('01') to produce one
 * internal char ('a').
 * The test writes "0123456789" 2048 times to test_file.txt, reads it back one
 * char at a time and, after each char, expects tellg() to be the number of
 * external chars consumed so far (2 for an 'a', 1 for anything else).
 * Compiled out when STLport lacks member template support.
 */
void CodecvtTest::variable_encoding()
{
#if !defined (STLPORT) || !defined (_STLP_NO_MEMBER_TEMPLATES)
  //We first generate the file used for test:
  const char* fileName = "test_file.txt";
  {
    ofstream ostr(fileName);
    //Maybe we simply do not have write access to repository
    CPPUNIT_ASSERT( ostr.good() );
    for (int i = 0; i < 2048; ++i) {
      ostr << "0123456789";
    }
    CPPUNIT_ASSERT( ostr.good() );
  }

  {
    ifstream istr(fileName);
    CPPUNIT_ASSERT( istr.good() );
    CPPUNIT_ASSERT( !istr.eof() );

    // The facet lives on the stack: the initial reference count of 1 keeps
    // the locale from deleting it.
    eater_codecvt codec(1);
    locale loc(locale::classic(), &codec);

    istr.imbue(loc);
    CPPUNIT_ASSERT( istr.good() );
    CPPUNIT_ASSERT( (int)istr.tellg() == 0 );

    // Number of external chars that must have been consumed so far.
    int expectedPos = 0;
    do {
      int c = istr.get();
      if (char_traits<char>::eq_int_type(c, char_traits<char>::eof())) {
        break;
      }
      ++expectedPos;
      if (c == 'a') {
        // An 'a' stands for the two external chars '01'.
        ++expectedPos;
      }

      CPPUNIT_ASSERT( (int)istr.tellg() == expectedPos );
    }
    while (!istr.eof());
    CPPUNIT_ASSERT( istr.eof() );
  }

#  if 0
  /* This test is broken, not sure if it is really possible to get a position in
   * a locale having a codecvt such as generator_codecvt. Maybe generator_codecvt
   * is not a valid theorical example of codecvt implementation. */
  {
    ifstream istr(fileName);
    CPPUNIT_ASSERT( istr.good() );
    CPPUNIT_ASSERT( !istr.eof() );

    generator_codecvt codec(1);
    locale loc(locale::classic(), &codec);

    istr.imbue(loc);
    CPPUNIT_ASSERT( istr.good() );
    CPPUNIT_ASSERT( (int)istr.tellg() == 0 );

    int theoricalPos = 0;
    int theoricalTellg;
    do {
      char c = istr.get();
      if (c == char_traits<char>::eof()) {
        break;
      }
      switch (c) {
        case 'a':
        case 'b':
          theoricalTellg = -1;
          break;
        case 'c':
          ++theoricalPos;
        default:
          ++theoricalPos;
          theoricalTellg = theoricalPos;
          break;
      }

      if ((int)istr.tellg() != theoricalTellg) {
        CPPUNIT_ASSERT( (int)istr.tellg() == theoricalTellg );
      }
    }
    while (!istr.eof());

    CPPUNIT_ASSERT( istr.eof() );
  }
#  endif
#endif
}

/* Checks codecvt<wchar_t, char, mbstate_t>::in and ::out of the native ("")
 * locale when the destination buffer can hold a single element: exactly one
 * source element must be converted and the result must be ok.
 * A runtime_error (raised when the native locale cannot be built) is
 * silently accepted; any other exception fails the test.
 * Compiled out when STLport lacks wchar_t or exception support.
 */
void CodecvtTest::in_out_test()
{
#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
  try {
    locale loc("");

    typedef codecvt<wchar_t, char, mbstate_t> cdecvt_type;
    if (has_facet<cdecvt_type>(loc)) {
      cdecvt_type const& cdect = use_facet<cdecvt_type>(loc);
      {
        // char -> wchar_t
        // Value-initialization gives the zeroed (initial) conversion state
        // without relying on memset, which no included header declares.
        cdecvt_type::state_type state = cdecvt_type::state_type();
        string from("abcdef");
        const char* next_from;
        wchar_t to[1];
        wchar_t *next_to;
        cdecvt_type::result res = cdect.in(state, from.data(), from.data() + from.size(), next_from,
                                           to, to + sizeof(to) / sizeof(wchar_t), next_to);
        CPPUNIT_ASSERT( res == cdecvt_type::ok );
        CPPUNIT_ASSERT( next_from == from.data() + 1 );
        CPPUNIT_ASSERT( next_to == &to[0] + 1 );
        CPPUNIT_ASSERT( to[0] == L'a');
      }
      {
        // wchar_t -> char
        cdecvt_type::state_type state = cdecvt_type::state_type();
        wstring from(L"abcdef");
        const wchar_t* next_from;
        char to[1];
        char *next_to;
        cdecvt_type::result res = cdect.out(state, from.data(), from.data() + from.size(), next_from,
                                            to, to + sizeof(to) / sizeof(char), next_to);
        CPPUNIT_ASSERT( res == cdecvt_type::ok );
        CPPUNIT_ASSERT( next_from == from.data() + 1 );
        CPPUNIT_ASSERT( next_to == &to[0] + 1 );
        CPPUNIT_ASSERT( to[0] == 'a');
      }
    }
  }
  catch (runtime_error const&) {
    // Deliberately ignored: without a usable native locale there is nothing
    // to check.
  }
  catch (...) {
    CPPUNIT_FAIL;
  }
#endif
}

/* Checks that codecvt<wchar_t, char, mbstate_t>::length of the native ("")
 * locale reports the whole of a plain ASCII string as convertible when the
 * requested maximum equals the string size.
 * A runtime_error (raised when the native locale cannot be built) is
 * silently accepted; any other exception fails the test.
 * Compiled out when STLport lacks wchar_t or exception support.
 */
void CodecvtTest::length_test()
{
#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
  try {
    locale loc("");

    typedef codecvt<wchar_t, char, mbstate_t> cdecvt_type;
    if (has_facet<cdecvt_type>(loc)) {
      cdecvt_type const& cdect = use_facet<cdecvt_type>(loc);
      {
        // Value-initialization gives the zeroed (initial) conversion state
        // without relying on memset, which no included header declares.
        cdecvt_type::state_type state = cdecvt_type::state_type();
        string from("abcdef");
        int res = cdect.length(state, from.data(), from.data() + from.size(), from.size());
        CPPUNIT_ASSERT( (size_t)res == from.size() );
      }
    }
  }
  catch (runtime_error const&) {
    // Deliberately ignored: without a usable native locale there is nothing
    // to check.
  }
  catch (...) {
    CPPUNIT_FAIL;
  }
#endif
}

#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
typedef std::codecvt<wchar_t, char, mbstate_t> my_codecvt_base;

/* Fixed width facet used by imbue_while_reading: every pair of external
 * bytes is combined, low byte first, into one internal character.
 */
class my_codecvt : public my_codecvt_base {
public:
  // r is forwarded unchanged to the codecvt base class constructor.
  explicit my_codecvt(size_t r = 0)
   : my_codecvt_base(r) {}

protected:
  // Converts byte pairs from [first1, last1) into [first2, last2).
  // Returns partial when fewer than two external bytes remain or when the
  // internal buffer is full, ok once the whole external range is consumed.
  // The conversion state is not used.
  virtual result do_in(state_type& /*state*/, const extern_type* first1,
                       const extern_type* last1, const extern_type*& next1,
                       intern_type* first2, intern_type* last2,
                       intern_type*& next2) const {
    for ( next1 = first1, next2 = first2; next1 < last1; next1 += 2 ) {
      if ( (last1 - next1) < 2 || (last2 - next2) < 1 )
        return partial;
      // Go through unsigned char so that a byte >= 0x80 is neither shifted
      // as a negative value nor sign extended into the upper bits of a wide
      // intern_type when char is signed.
      const unsigned int low = static_cast<unsigned char>(next1[0]);
      const unsigned int high = static_cast<unsigned char>(next1[1]);
      *next2++ = static_cast<intern_type>((high << 8) | low);
    }
    return ok;
  }
  // claim it's not a null-conversion
  virtual bool do_always_noconv() const __NO_THROW
  { return false; }
  // at most two external bytes per internal character
  virtual int do_max_length() const __NO_THROW
  { return 2; }
  // fixed width encoding: two external bytes per internal character
  virtual int do_encoding() const __NO_THROW
  { return 2; }
};
#endif

void CodecvtTest::imbue_while_reading()
{
#if !defined (STLPORT) || !(defined (_STLP_NO_WCHAR_T) || !defined (_STLP_USE_EXCEPTIONS))
  {
    wofstream ofs( "test.txt" );
    const wchar_t buf[] = L" ";
    for ( int i = 0; i < 4098; ++i ) {
      ofs << buf[0];
    }
  }

  wifstream ifs("test.txt"); // a file containing 4098 wchars

  ifs.imbue( locale(locale(), new my_codecvt) );
  ifs.get();
  ifs.seekg(0);
  ifs.imbue( locale() );
  ifs.ignore(4096);
  int ch = ifs.get();
  CPPUNIT_CHECK( ch != (int)WEOF );
#endif
}

/* Exercises codecvt<wchar_t, char, mbstate_t> for three encodings:
 *  - "C": a conversion into an empty destination range must succeed without
 *    writing anything, then '0' must convert to L'0';
 *  - code page 936: a multibyte text is converted to wide chars and back and
 *    must round trip;
 *  - UTF-8: same round trip, plus a conversion fed one byte at a time and the
 *    detection of an invalid byte sequence.
 * The 936 and UTF-8 parts are skipped with a message when building the
 * locale/facet throws runtime_error.
 * Compiled out when STLport lacks wchar_t or exception support.
 */
void CodecvtTest::special_encodings()
{
#if !defined (STLPORT) || (!defined (_STLP_NO_WCHAR_T) && defined (_STLP_USE_EXCEPTIONS))
  {
    locale loc(locale::classic(), new codecvt_byname<wchar_t, char, mbstate_t>("C"));
    codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc);
    // Value-initialization gives the zeroed (initial) conversion state
    // without relying on memset, which no included header declares.
    mbstate_t state = mbstate_t();
    char c = '0';
    const char *from_next;
    wchar_t wc;
    wchar_t *to_next;
    // Empty destination range [&wc, &wc): nothing may be written.
    CPPUNIT_ASSERT( cvt.in(state, &c, &c + 1, from_next, &wc, &wc, to_next) == codecvt_base::ok );
    CPPUNIT_ASSERT( to_next == &wc );
    CPPUNIT_ASSERT( cvt.in(state, &c, &c + 1, from_next, &wc, &wc + 1, to_next) == codecvt_base::ok );
    CPPUNIT_ASSERT( wc == L'0' );
    CPPUNIT_ASSERT( to_next == &wc + 1 );
  }
  try
  {
    wstring cp936_wstr;
    const string cp936_str = "\xd6\xd0\xb9\xfa\xc9\xe7\xbb\xe1\xbf\xc6\xd1\xa7\xd4\xba\xb7\xa2\xb2\xbc\x32\x30\x30\x38\xc4\xea\xa1\xb6\xbe\xad\xbc\xc3\xc0\xb6\xc6\xa4\xca\xe9\xa1\xb7\xd6\xb8\xb3\xf6\xa3\xac\x32\x30\x30\x37\xc4\xea\xd6\xd0\xb9\xfa\xbe\xad\xbc\xc3\xd4\xf6\xb3\xa4\xd3\xc9\xc6\xab\xbf\xec\xd7\xaa\xcf\xf2\xb9\xfd\xc8\xc8\xb5\xc4\xc7\xf7\xca\xc6\xc3\xf7\xcf\xd4\xd4\xa4\xbc\xc6\xc8\xab\xc4\xea\x47\x44\x50\xd4\xf6\xcb\xd9\xbd\xab\xb4\xef\x31\x31\x2e\x36\x25\xa1\xa3";
    locale loc(locale::classic(), ".936", locale::ctype);
    codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc);
    mbstate_t state = mbstate_t();

    codecvt_base::result res;

    {
      // Multibyte -> wide.
      wchar_t wbuf[4096];
      // Check we will have enough room for the wide string generated from the whole char buffer:
      int len = cvt.length(state, cp936_str.data(), cp936_str.data() + cp936_str.size(), sizeof(wbuf) / sizeof(wchar_t));
      CPPUNIT_ASSERT( cp936_str.size() == (size_t)len );

      const char *from_next;
      wchar_t *to_next;
      res = cvt.in(state, cp936_str.data(), cp936_str.data() + cp936_str.size(), from_next,
                          wbuf, wbuf + sizeof(wbuf) / sizeof(wchar_t), to_next);
      CPPUNIT_ASSERT( res == codecvt_base::ok );
      CPPUNIT_ASSERT( from_next == cp936_str.data() + cp936_str.size() );
      cp936_wstr.assign(wbuf, to_next);
    }

    {
      // Wide -> multibyte: must give back the original bytes.
      const wchar_t *from_next;
      char buf[4096];
      char *to_next;
      res = cvt.out(state, cp936_wstr.data(), cp936_wstr.data() + cp936_wstr.size(), from_next,
                           buf, buf + sizeof(buf), to_next);
      CPPUNIT_ASSERT( res == codecvt_base::ok );
      CPPUNIT_CHECK( string(buf, to_next) == cp936_str );
    }
  }
  catch (const runtime_error&)
  {
    CPPUNIT_MESSAGE("Not enough platform localization support to check 936 code page encoding.");
  }
  try
  {
    const string utf8_str = "\xe4\xb8\xad\xe5\x9b\xbd\xe7\xa4\xbe\xe4\xbc\x9a\xe7\xa7\x91\xe5\xad\xa6\xe9\x99\xa2\xe5\x8f\x91\xe5\xb8\x83\x32\x30\x30\x38\xe5\xb9\xb4\xe3\x80\x8a\xe7\xbb\x8f\xe6\xb5\x8e\xe8\x93\x9d\xe7\x9a\xae\xe4\xb9\xa6\xe3\x80\x8b\xe6\x8c\x87\xe5\x87\xba\xef\xbc\x8c\x32\x30\x30\x37\xe5\xb9\xb4\xe4\xb8\xad\xe5\x9b\xbd\xe7\xbb\x8f\xe6\xb5\x8e\xe5\xa2\x9e\xe9\x95\xbf\xe7\x94\xb1\xe5\x81\x8f\xe5\xbf\xab\xe8\xbd\xac\xe5\x90\x91\xe8\xbf\x87\xe7\x83\xad\xe7\x9a\x84\xe8\xb6\x8b\xe5\x8a\xbf\xe6\x98\x8e\xe6\x98\xbe\xe9\xa2\x84\xe8\xae\xa1\xe5\x85\xa8\xe5\xb9\xb4\x47\x44\x50\xe5\xa2\x9e\xe9\x80\x9f\xe5\xb0\x86\xe8\xbe\xbe\x31\x31\x2e\x36\x25\xe3\x80\x82";
    wstring utf8_wstr;
    locale loc(locale::classic(), new codecvt_byname<wchar_t, char, mbstate_t>(".utf8"));
    codecvt<wchar_t, char, mbstate_t> const& cvt = use_facet<codecvt<wchar_t, char, mbstate_t> >(loc);
    mbstate_t state = mbstate_t();

    codecvt_base::result res;

    {
      // Multibyte -> wide, whole buffer at once.
      wchar_t wbuf[4096];
      // Check we will have enough room for the wide string generated from the whole char buffer:
      int len = cvt.length(state, utf8_str.data(), utf8_str.data() + utf8_str.size(), sizeof(wbuf) / sizeof(wchar_t));
      CPPUNIT_ASSERT( utf8_str.size() == (size_t)len );

      const char *from_next;
      wchar_t *to_next;
      res = cvt.in(state, utf8_str.data(), utf8_str.data() + utf8_str.size(), from_next,
                          wbuf, wbuf + sizeof(wbuf) / sizeof(wchar_t), to_next);
      CPPUNIT_ASSERT( res == codecvt_base::ok );
      CPPUNIT_ASSERT( from_next == utf8_str.data() + utf8_str.size() );
      utf8_wstr.assign(wbuf, to_next);

      // Try to read one char after the other:
      wchar_t wc;
      const char* from = utf8_str.data();
      const char* from_end = from + utf8_str.size();
      from_next = utf8_str.data();
      // Number of external bytes currently offered to the facet.
      size_t length = 1;
      // Index of the next wide char expected from the full conversion above.
      size_t windex = 0;
      while (from + length <= from_end) {
        res = cvt.in(state, from, from + length, from_next,
                            &wc, &wc + 1, to_next);
        switch (res) {
          case codecvt_base::ok:
            // reset length:
            from = from_next;
            length = 1;
            CPPUNIT_ASSERT( wc == utf8_wstr[windex++] );
            wc = 0;
            break;
          case codecvt_base::partial:
            if (from_next == from)
              // from_next hasn't moved so we have to pass more chars
              ++length;
            else
              // chars between from and from_next have been eaten, we simply
              // restart conversion from from_next:
              from = from_next;
            continue;
          case codecvt_base::error:
          case codecvt_base::noconv:
            CPPUNIT_FAIL;
            //break;
        }
      }
      CPPUNIT_ASSERT( windex == utf8_wstr.size() );
    }

    {
      // Wide -> multibyte: must give back the original bytes.
      const wchar_t *from_next;
      char buf[4096];
      char *to_next;
      res = cvt.out(state, utf8_wstr.data(), utf8_wstr.data() + utf8_wstr.size(), from_next,
                           buf, buf + sizeof(buf), to_next);
      CPPUNIT_ASSERT( res == codecvt_base::ok );
      CPPUNIT_CHECK( string(buf, to_next) == utf8_str );
    }

    {
      // Check that an obviously wrong UTF8 encoded string is correctly detected:
      const string bad_utf8_str("\xdf\xdf\xdf\xdf\xdf");
      wchar_t wc;
      const char *from_next;
      wchar_t *to_next;
      res = cvt.in(state, bad_utf8_str.data(), bad_utf8_str.data() + bad_utf8_str.size(), from_next,
                          &wc, &wc + 1, to_next);
      CPPUNIT_ASSERT( res == codecvt_base::error );
    }
  }
  catch (const runtime_error&)
  {
    CPPUNIT_MESSAGE("Not enough platform localization support to check UTF8 encoding.");
  }
#endif
}

#endif