1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the  "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 *     http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18/*
19 * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $
20 */
21package org.apache.xml.serializer;
22
23import java.io.IOException;
24import java.io.OutputStream;
25import java.io.UnsupportedEncodingException;
26import java.io.Writer;
27
28
29/**
30 * This class writes unicode characters to a byte stream (java.io.OutputStream)
31 * as quickly as possible. It buffers the output in an internal
32 * buffer which must be flushed to the OutputStream when done. This flushing
33 * is done via the close() flush() or flushBuffer() method.
34 *
35 * This class is only used internally within Xalan.
36 *
37 * @xsl.usage internal
38 */
39final class WriterToUTF8Buffered extends Writer implements WriterChain
40{
41
42  /** number of bytes that the byte buffer can hold.
43   * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
44   */
45  private static final int BYTES_MAX=16*1024;
46  /** number of characters that the character buffer can hold.
47   * This is 1/3 of the number of bytes because UTF-8 encoding
48   * can expand one unicode character by up to 3 bytes.
49   */
50  private static final int CHARS_MAX=(BYTES_MAX/3);
51
52 // private static final int
53
54  /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
55  private final OutputStream m_os;
56
57  /**
58   * The internal buffer where data is stored.
59   * (sc & sb remove final to compile in JDK 1.1.8)
60   */
61  private final byte m_outputBytes[];
62
63  private final char m_inputChars[];
64
65  /**
66   * The number of valid bytes in the buffer. This value is always
67   * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
68   * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
69   * byte data.
70   */
71  private int count;
72
73  /**
74   * Create an buffered UTF-8 writer.
75   *
76   *
77   * @param   out    the underlying output stream.
78   *
79   * @throws UnsupportedEncodingException
80   */
81  public WriterToUTF8Buffered(OutputStream out)
82  {
83      m_os = out;
84      // get 3 extra bytes to make buffer overflow checking simpler and faster
85      // we won't have to keep checking for a few extra characters
86      m_outputBytes = new byte[BYTES_MAX + 3];
87
88      // Big enough to hold the input chars that will be transformed
89      // into output bytes in m_ouputBytes.
90      m_inputChars = new char[CHARS_MAX + 2];
91      count = 0;
92
93//      the old body of this constructor, before the buffersize was changed to a constant
94//      this(out, 8*1024);
95  }
96
97  /**
98   * Create an buffered UTF-8 writer to write data to the
99   * specified underlying output stream with the specified buffer
100   * size.
101   *
102   * @param   out    the underlying output stream.
103   * @param   size   the buffer size.
104   * @exception IllegalArgumentException if size <= 0.
105   */
106//  public WriterToUTF8Buffered(final OutputStream out, final int size)
107//  {
108//
109//    m_os = out;
110//
111//    if (size <= 0)
112//    {
113//      throw new IllegalArgumentException(
114//        SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
115//    }
116//
117//    m_outputBytes = new byte[size];
118//    count = 0;
119//  }
120
121  /**
122   * Write a single character.  The character to be written is contained in
123   * the 16 low-order bits of the given integer value; the 16 high-order bits
124   * are ignored.
125   *
126   * <p> Subclasses that intend to support efficient single-character output
127   * should override this method.
128   *
129   * @param c  int specifying a character to be written.
130   * @exception  IOException  If an I/O error occurs
131   */
132  public void write(final int c) throws IOException
133  {
134
135    /* If we are close to the end of the buffer then flush it.
136     * Remember the buffer can hold a few more bytes than BYTES_MAX
137     */
138    if (count >= BYTES_MAX)
139        flushBuffer();
140
141    if (c < 0x80)
142    {
143       m_outputBytes[count++] = (byte) (c);
144    }
145    else if (c < 0x800)
146    {
147      m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
148      m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
149    }
150    else if (c < 0x10000)
151    {
152      m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
153      m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
154      m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
155    }
156	else
157	{
158	  m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
159	  m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
160	  m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
161	  m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
162	}
163
164  }
165
166
167  /**
168   * Write a portion of an array of characters.
169   *
170   * @param  chars  Array of characters
171   * @param  start   Offset from which to start writing characters
172   * @param  length   Number of characters to write
173   *
174   * @exception  IOException  If an I/O error occurs
175   *
176   * @throws java.io.IOException
177   */
178  public void write(final char chars[], final int start, final int length)
179          throws java.io.IOException
180  {
181
182    // We multiply the length by three since this is the maximum length
183    // of the characters that we can put into the buffer.  It is possible
184    // for each Unicode character to expand to three bytes.
185
186    int lengthx3 = 3*length;
187
188    if (lengthx3 >= BYTES_MAX - count)
189    {
190      // The requested length is greater than the unused part of the buffer
191      flushBuffer();
192
193      if (lengthx3 > BYTES_MAX)
194      {
195        /*
196         * The requested length exceeds the size of the buffer.
197         * Cut the buffer up into chunks, each of which will
198         * not cause an overflow to the output buffer m_outputBytes,
199         * and make multiple recursive calls.
200         * Be careful about integer overflows in multiplication.
201         */
202        int split = length/CHARS_MAX;
203        final int chunks;
204        if (length % CHARS_MAX > 0)
205            chunks = split + 1;
206        else
207            chunks = split;
208        int end_chunk = start;
209        for (int chunk = 1; chunk <= chunks; chunk++)
210        {
211            int start_chunk = end_chunk;
212            end_chunk = start + (int) ((((long) length) * chunk) / chunks);
213
214            // Adjust the end of the chunk if it ends on a high char
215            // of a Unicode surrogate pair and low char of the pair
216            // is not going to be in the same chunk
217            final char c = chars[end_chunk - 1];
218            int ic = chars[end_chunk - 1];
219            if (c >= 0xD800 && c <= 0xDBFF) {
220                // The last Java char that we were going
221                // to process is the first of a
222                // Java surrogate char pair that
223                // represent a Unicode character.
224
225                if (end_chunk < start + length) {
226                    // Avoid spanning by including the low
227                    // char in the current chunk of chars.
228                    end_chunk++;
229                } else {
230                    /* This is the last char of the last chunk,
231                     * and it is the high char of a high/low pair with
232                     * no low char provided.
233                     * TODO: error message needed.
234                     * The char array incorrectly ends in a high char
235                     * of a high/low surrogate pair, but there is
236                     * no corresponding low as the high is the last char
237                     */
238                    end_chunk--;
239                }
240            }
241
242
243            int len_chunk = (end_chunk - start_chunk);
244            this.write(chars,start_chunk, len_chunk);
245        }
246        return;
247      }
248    }
249
250
251
252    final int n = length+start;
253    final byte[] buf_loc = m_outputBytes; // local reference for faster access
254    int count_loc = count;      // local integer for faster access
255    int i = start;
256    {
257        /* This block could be omitted and the code would produce
258         * the same result. But this block exists to give the JIT
259         * a better chance of optimizing a tight and common loop which
260         * occurs when writing out ASCII characters.
261         */
262        char c;
263        for(; i < n && (c = chars[i])< 0x80 ; i++ )
264            buf_loc[count_loc++] = (byte)c;
265    }
266    for (; i < n; i++)
267    {
268
269      final char c = chars[i];
270
271      if (c < 0x80)
272        buf_loc[count_loc++] = (byte) (c);
273      else if (c < 0x800)
274      {
275        buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
276        buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
277      }
278      /**
279        * The following else if condition is added to support XML 1.1 Characters for
280        * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
281        * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
282        *          [1101 11yy] [yyxx xxxx] (low surrogate)
283        *          * uuuuu = wwww + 1
284        */
285      else if (c >= 0xD800 && c <= 0xDBFF)
286      {
287          char high, low;
288          high = c;
289          i++;
290          low = chars[i];
291
292          buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
293          buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
294          buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
295          buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
296      }
297      else
298      {
299        buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
300        buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
301        buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
302      }
303    }
304    // Store the local integer back into the instance variable
305    count = count_loc;
306
307  }
308
309  /**
310   * Write a string.
311   *
312   * @param  s  String to be written
313   *
314   * @exception  IOException  If an I/O error occurs
315   */
316  public void write(final String s) throws IOException
317  {
318
319    // We multiply the length by three since this is the maximum length
320    // of the characters that we can put into the buffer.  It is possible
321    // for each Unicode character to expand to three bytes.
322    final int length = s.length();
323    int lengthx3 = 3*length;
324
325    if (lengthx3 >= BYTES_MAX - count)
326    {
327      // The requested length is greater than the unused part of the buffer
328      flushBuffer();
329
330      if (lengthx3 > BYTES_MAX)
331      {
332        /*
333         * The requested length exceeds the size of the buffer,
334         * so break it up in chunks that don't exceed the buffer size.
335         */
336         final int start = 0;
337         int split = length/CHARS_MAX;
338         final int chunks;
339         if (length % CHARS_MAX > 0)
340             chunks = split + 1;
341         else
342             chunks = split;
343         int end_chunk = 0;
344         for (int chunk = 1; chunk <= chunks; chunk++)
345         {
346             int start_chunk = end_chunk;
347             end_chunk = start + (int) ((((long) length) * chunk) / chunks);
348             s.getChars(start_chunk,end_chunk, m_inputChars,0);
349             int len_chunk = (end_chunk - start_chunk);
350
351             // Adjust the end of the chunk if it ends on a high char
352             // of a Unicode surrogate pair and low char of the pair
353             // is not going to be in the same chunk
354             final char c = m_inputChars[len_chunk - 1];
355             if (c >= 0xD800 && c <= 0xDBFF) {
356                 // Exclude char in this chunk,
357                 // to avoid spanning a Unicode character
358                 // that is in two Java chars as a high/low surrogate
359                 end_chunk--;
360                 len_chunk--;
361                 if (chunk == chunks) {
362                     /* TODO: error message needed.
363                      * The String incorrectly ends in a high char
364                      * of a high/low surrogate pair, but there is
365                      * no corresponding low as the high is the last char
366                      * Recover by ignoring this last char.
367                      */
368                 }
369             }
370
371             this.write(m_inputChars,0, len_chunk);
372         }
373         return;
374      }
375    }
376
377
378    s.getChars(0, length , m_inputChars, 0);
379    final char[] chars = m_inputChars;
380    final int n = length;
381    final byte[] buf_loc = m_outputBytes; // local reference for faster access
382    int count_loc = count;      // local integer for faster access
383    int i = 0;
384    {
385        /* This block could be omitted and the code would produce
386         * the same result. But this block exists to give the JIT
387         * a better chance of optimizing a tight and common loop which
388         * occurs when writing out ASCII characters.
389         */
390        char c;
391        for(; i < n && (c = chars[i])< 0x80 ; i++ )
392            buf_loc[count_loc++] = (byte)c;
393    }
394    for (; i < n; i++)
395    {
396
397      final char c = chars[i];
398
399      if (c < 0x80)
400        buf_loc[count_loc++] = (byte) (c);
401      else if (c < 0x800)
402      {
403        buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
404        buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
405      }
406    /**
407      * The following else if condition is added to support XML 1.1 Characters for
408      * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
409      * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
410      *          [1101 11yy] [yyxx xxxx] (low surrogate)
411      *          * uuuuu = wwww + 1
412      */
413    else if (c >= 0xD800 && c <= 0xDBFF)
414    {
415        char high, low;
416        high = c;
417        i++;
418        low = chars[i];
419
420        buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
421        buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
422        buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
423        buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
424    }
425      else
426      {
427        buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
428        buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
429        buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
430      }
431    }
432    // Store the local integer back into the instance variable
433    count = count_loc;
434
435  }
436
437  /**
438   * Flush the internal buffer
439   *
440   * @throws IOException
441   */
442  public void flushBuffer() throws IOException
443  {
444
445    if (count > 0)
446    {
447      m_os.write(m_outputBytes, 0, count);
448
449      count = 0;
450    }
451  }
452
453  /**
454   * Flush the stream.  If the stream has saved any characters from the
455   * various write() methods in a buffer, write them immediately to their
456   * intended destination.  Then, if that destination is another character or
457   * byte stream, flush it.  Thus one flush() invocation will flush all the
458   * buffers in a chain of Writers and OutputStreams.
459   *
460   * @exception  IOException  If an I/O error occurs
461   *
462   * @throws java.io.IOException
463   */
464  public void flush() throws java.io.IOException
465  {
466    flushBuffer();
467    m_os.flush();
468  }
469
470  /**
471   * Close the stream, flushing it first.  Once a stream has been closed,
472   * further write() or flush() invocations will cause an IOException to be
473   * thrown.  Closing a previously-closed stream, however, has no effect.
474   *
475   * @exception  IOException  If an I/O error occurs
476   *
477   * @throws java.io.IOException
478   */
479  public void close() throws java.io.IOException
480  {
481    flushBuffer();
482    m_os.close();
483  }
484
485  /**
486   * Get the output stream where the events will be serialized to.
487   *
488   * @return reference to the result stream, or null of only a writer was
489   * set.
490   */
491  public OutputStream getOutputStream()
492  {
493    return m_os;
494  }
495
496  public Writer getWriter()
497  {
498    // Only one of getWriter() or getOutputStream() can return null
499    // This type of writer wraps an OutputStream, not a Writer.
500    return null;
501  }
502}
503