1// Tencent is pleased to support the open source community by making RapidJSON available.
2//
3// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4//
5// Licensed under the MIT License (the "License"); you may not use this file except
6// in compliance with the License. You may obtain a copy of the License at
7//
8// http://opensource.org/licenses/MIT
9//
10// Unless required by applicable law or agreed to in writing, software distributed
11// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13// specific language governing permissions and limitations under the License.
14
15#ifndef RAPIDJSON_ENCODEDSTREAM_H_
16#define RAPIDJSON_ENCODEDSTREAM_H_
17
18#include "rapidjson.h"
19
20#ifdef __GNUC__
21RAPIDJSON_DIAG_PUSH
22RAPIDJSON_DIAG_OFF(effc++)
23#endif
24
25RAPIDJSON_NAMESPACE_BEGIN
26
27//! Input byte stream wrapper with a statically bound encoding.
28/*!
29    \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
30    \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
31*/
32template <typename Encoding, typename InputByteStream>
33class EncodedInputStream {
34    RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
35public:
36    typedef typename Encoding::Ch Ch;
37
38    EncodedInputStream(InputByteStream& is) : is_(is) {
39        current_ = Encoding::TakeBOM(is_);
40    }
41
42    Ch Peek() const { return current_; }
43    Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
44    size_t Tell() const { return is_.Tell(); }
45
46    // Not implemented
47    void Put(Ch) { RAPIDJSON_ASSERT(false); }
48    void Flush() { RAPIDJSON_ASSERT(false); }
49    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
50    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
51
52private:
53    EncodedInputStream(const EncodedInputStream&);
54    EncodedInputStream& operator=(const EncodedInputStream&);
55
56    InputByteStream& is_;
57    Ch current_;
58};
59
60//! Output byte stream wrapper with statically bound encoding.
61/*!
62    \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
63    \tparam InputByteStream Type of input byte stream. For example, FileWriteStream.
64*/
65template <typename Encoding, typename OutputByteStream>
66class EncodedOutputStream {
67    RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
68public:
69    typedef typename Encoding::Ch Ch;
70
71    EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
72        if (putBOM)
73            Encoding::PutBOM(os_);
74    }
75
76    void Put(Ch c) { Encoding::Put(os_, c);  }
77    void Flush() { os_.Flush(); }
78
79    // Not implemented
80    Ch Peek() const { RAPIDJSON_ASSERT(false); }
81    Ch Take() { RAPIDJSON_ASSERT(false);  }
82    size_t Tell() const { RAPIDJSON_ASSERT(false);  return 0; }
83    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
84    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
85
86private:
87    EncodedOutputStream(const EncodedOutputStream&);
88    EncodedOutputStream& operator=(const EncodedOutputStream&);
89
90    OutputByteStream& os_;
91};
92
// Expands to a comma-separated list naming member `x` of each supported UTF
// encoding instantiated with Ch. The classes below use it to fill function
// tables that are indexed by their type_ member, so the order here presumably
// mirrors the UTFType enumerators (defined elsewhere) -- keep them in sync.
#define RAPIDJSON_ENCODINGS_FUNC(x) \
    UTF8<Ch>::x, \
    UTF16LE<Ch>::x, \
    UTF16BE<Ch>::x, \
    UTF32LE<Ch>::x, \
    UTF32BE<Ch>::x
94
95//! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
96/*!
97    \tparam CharType Type of character for reading.
98    \tparam InputByteStream type of input byte stream to be wrapped.
99*/
100template <typename CharType, typename InputByteStream>
101class AutoUTFInputStream {
102    RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
103public:
104    typedef CharType Ch;
105
106    //! Constructor.
107    /*!
108        \param is input stream to be wrapped.
109        \param type UTF encoding type if it is not detected from the stream.
110    */
111    AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
112        RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
113        DetectType();
114        static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
115        takeFunc_ = f[type_];
116        current_ = takeFunc_(*is_);
117    }
118
119    UTFType GetType() const { return type_; }
120    bool HasBOM() const { return hasBOM_; }
121
122    Ch Peek() const { return current_; }
123    Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
124    size_t Tell() const { return is_->Tell(); }
125
126    // Not implemented
127    void Put(Ch) { RAPIDJSON_ASSERT(false); }
128    void Flush() { RAPIDJSON_ASSERT(false); }
129    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
130    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
131
132private:
133    AutoUTFInputStream(const AutoUTFInputStream&);
134    AutoUTFInputStream& operator=(const AutoUTFInputStream&);
135
136    // Detect encoding type with BOM or RFC 4627
137    void DetectType() {
138        // BOM (Byte Order Mark):
139        // 00 00 FE FF  UTF-32BE
140        // FF FE 00 00  UTF-32LE
141        // FE FF        UTF-16BE
142        // FF FE        UTF-16LE
143        // EF BB BF     UTF-8
144
145        const unsigned char* c = (const unsigned char *)is_->Peek4();
146        if (!c)
147            return;
148
149        unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
150        hasBOM_ = false;
151        if (bom == 0xFFFE0000)                  { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
152        else if (bom == 0x0000FEFF)             { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
153        else if ((bom & 0xFFFF) == 0xFFFE)      { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take();                           }
154        else if ((bom & 0xFFFF) == 0xFEFF)      { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take();                           }
155        else if ((bom & 0xFFFFFF) == 0xBFBBEF)  { type_ = kUTF8;    hasBOM_ = true; is_->Take(); is_->Take(); is_->Take();              }
156
157        // RFC 4627: Section 3
158        // "Since the first two characters of a JSON text will always be ASCII
159        // characters [RFC0020], it is possible to determine whether an octet
160        // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
161        // at the pattern of nulls in the first four octets."
162        // 00 00 00 xx  UTF-32BE
163        // 00 xx 00 xx  UTF-16BE
164        // xx 00 00 00  UTF-32LE
165        // xx 00 xx 00  UTF-16LE
166        // xx xx xx xx  UTF-8
167
168        if (!hasBOM_) {
169            unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
170            switch (pattern) {
171            case 0x08: type_ = kUTF32BE; break;
172            case 0x0A: type_ = kUTF16BE; break;
173            case 0x01: type_ = kUTF32LE; break;
174            case 0x05: type_ = kUTF16LE; break;
175            case 0x0F: type_ = kUTF8;    break;
176            default: break; // Use type defined by user.
177            }
178        }
179
180        // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
181        if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
182        if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
183    }
184
185    typedef Ch (*TakeFunc)(InputByteStream& is);
186    InputByteStream* is_;
187    UTFType type_;
188    Ch current_;
189    TakeFunc takeFunc_;
190    bool hasBOM_;
191};
192
193//! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
194/*!
195    \tparam CharType Type of character for writing.
196    \tparam InputByteStream type of output byte stream to be wrapped.
197*/
198template <typename CharType, typename OutputByteStream>
199class AutoUTFOutputStream {
200    RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
201public:
202    typedef CharType Ch;
203
204    //! Constructor.
205    /*!
206        \param os output stream to be wrapped.
207        \param type UTF encoding type.
208        \param putBOM Whether to write BOM at the beginning of the stream.
209    */
210    AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
211        RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
212
213        // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
214        if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
215        if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
216
217        static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
218        putFunc_ = f[type_];
219
220        if (putBOM)
221            PutBOM();
222    }
223
224    UTFType GetType() const { return type_; }
225
226    void Put(Ch c) { putFunc_(*os_, c); }
227    void Flush() { os_->Flush(); }
228
229    // Not implemented
230    Ch Peek() const { RAPIDJSON_ASSERT(false); }
231    Ch Take() { RAPIDJSON_ASSERT(false); }
232    size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
233    Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
234    size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
235
236private:
237    AutoUTFOutputStream(const AutoUTFOutputStream&);
238    AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
239
240    void PutBOM() {
241        typedef void (*PutBOMFunc)(OutputByteStream&);
242        static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
243        f[type_](*os_);
244    }
245
246    typedef void (*PutFunc)(OutputByteStream&, Ch);
247
248    OutputByteStream* os_;
249    UTFType type_;
250    PutFunc putFunc_;
251};
252
253#undef RAPIDJSON_ENCODINGS_FUNC
254
255RAPIDJSON_NAMESPACE_END
256
257#ifdef __GNUC__
258RAPIDJSON_DIAG_POP
259#endif
260
#endif // RAPIDJSON_ENCODEDSTREAM_H_
262