/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $ */ package org.apache.xml.serializer; import java.io.IOException; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.io.Writer; /** * This class writes unicode characters to a byte stream (java.io.OutputStream) * as quickly as possible. It buffers the output in an internal * buffer which must be flushed to the OutputStream when done. This flushing * is done via the close() flush() or flushBuffer() method. * * This class is only used internally within Xalan. * * @xsl.usage internal */ final class WriterToUTF8Buffered extends Writer implements WriterChain { /** number of bytes that the byte buffer can hold. * This is a fixed constant is used rather than m_outputBytes.lenght for performance. */ private static final int BYTES_MAX=16*1024; /** number of characters that the character buffer can hold. * This is 1/3 of the number of bytes because UTF-8 encoding * can expand one unicode character by up to 3 bytes. */ private static final int CHARS_MAX=(BYTES_MAX/3); // private static final int /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */ private final OutputStream m_os; /** * The internal buffer where data is stored. * (sc & sb remove final to compile in JDK 1.1.8) */ private final byte m_outputBytes[]; private final char m_inputChars[]; /** * The number of valid bytes in the buffer. This value is always * in the range 0 through m_outputBytes.length; elements * m_outputBytes[0] through m_outputBytes[count-1] contain valid * byte data. */ private int count; /** * Create an buffered UTF-8 writer. * * * @param out the underlying output stream. * * @throws UnsupportedEncodingException */ public WriterToUTF8Buffered(OutputStream out) { m_os = out; // get 3 extra bytes to make buffer overflow checking simpler and faster // we won't have to keep checking for a few extra characters m_outputBytes = new byte[BYTES_MAX + 3]; // Big enough to hold the input chars that will be transformed // into output bytes in m_ouputBytes. m_inputChars = new char[CHARS_MAX + 2]; count = 0; // the old body of this constructor, before the buffersize was changed to a constant // this(out, 8*1024); } /** * Create an buffered UTF-8 writer to write data to the * specified underlying output stream with the specified buffer * size. * * @param out the underlying output stream. * @param size the buffer size. * @exception IllegalArgumentException if size <= 0. */ // public WriterToUTF8Buffered(final OutputStream out, final int size) // { // // m_os = out; // // if (size <= 0) // { // throw new IllegalArgumentException( // SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0"); // } // // m_outputBytes = new byte[size]; // count = 0; // } /** * Write a single character. The character to be written is contained in * the 16 low-order bits of the given integer value; the 16 high-order bits * are ignored. * *

Subclasses that intend to support efficient single-character output * should override this method. * * @param c int specifying a character to be written. * @exception IOException If an I/O error occurs */ public void write(final int c) throws IOException { /* If we are close to the end of the buffer then flush it. * Remember the buffer can hold a few more bytes than BYTES_MAX */ if (count >= BYTES_MAX) flushBuffer(); if (c < 0x80) { m_outputBytes[count++] = (byte) (c); } else if (c < 0x800) { m_outputBytes[count++] = (byte) (0xc0 + (c >> 6)); m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); } else if (c < 0x10000) { m_outputBytes[count++] = (byte) (0xe0 + (c >> 12)); m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f)); m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); } else { m_outputBytes[count++] = (byte) (0xf0 + (c >> 18)); m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f)); m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f)); m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f)); } } /** * Write a portion of an array of characters. * * @param chars Array of characters * @param start Offset from which to start writing characters * @param length Number of characters to write * * @exception IOException If an I/O error occurs * * @throws java.io.IOException */ public void write(final char chars[], final int start, final int length) throws java.io.IOException { // We multiply the length by three since this is the maximum length // of the characters that we can put into the buffer. It is possible // for each Unicode character to expand to three bytes. int lengthx3 = 3*length; if (lengthx3 >= BYTES_MAX - count) { // The requested length is greater than the unused part of the buffer flushBuffer(); if (lengthx3 > BYTES_MAX) { /* * The requested length exceeds the size of the buffer. * Cut the buffer up into chunks, each of which will * not cause an overflow to the output buffer m_outputBytes, * and make multiple recursive calls. * Be careful about integer overflows in multiplication. */ int split = length/CHARS_MAX; final int chunks; if (length % CHARS_MAX > 0) chunks = split + 1; else chunks = split; int end_chunk = start; for (int chunk = 1; chunk <= chunks; chunk++) { int start_chunk = end_chunk; end_chunk = start + (int) ((((long) length) * chunk) / chunks); // Adjust the end of the chunk if it ends on a high char // of a Unicode surrogate pair and low char of the pair // is not going to be in the same chunk final char c = chars[end_chunk - 1]; int ic = chars[end_chunk - 1]; if (c >= 0xD800 && c <= 0xDBFF) { // The last Java char that we were going // to process is the first of a // Java surrogate char pair that // represent a Unicode character. if (end_chunk < start + length) { // Avoid spanning by including the low // char in the current chunk of chars. end_chunk++; } else { /* This is the last char of the last chunk, * and it is the high char of a high/low pair with * no low char provided. * TODO: error message needed. * The char array incorrectly ends in a high char * of a high/low surrogate pair, but there is * no corresponding low as the high is the last char */ end_chunk--; } } int len_chunk = (end_chunk - start_chunk); this.write(chars,start_chunk, len_chunk); } return; } } final int n = length+start; final byte[] buf_loc = m_outputBytes; // local reference for faster access int count_loc = count; // local integer for faster access int i = start; { /* This block could be omitted and the code would produce * the same result. But this block exists to give the JIT * a better chance of optimizing a tight and common loop which * occurs when writing out ASCII characters. */ char c; for(; i < n && (c = chars[i])< 0x80 ; i++ ) buf_loc[count_loc++] = (byte)c; } for (; i < n; i++) { final char c = chars[i]; if (c < 0x80) buf_loc[count_loc++] = (byte) (c); else if (c < 0x800) { buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6)); buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); } /** * The following else if condition is added to support XML 1.1 Characters for * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) * [1101 11yy] [yyxx xxxx] (low surrogate) * * uuuuu = wwww + 1 */ else if (c >= 0xD800 && c <= 0xDBFF) { char high, low; high = c; i++; low = chars[i]; buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0)); buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f)); buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30)); buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f)); } else { buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12)); buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f)); buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); } } // Store the local integer back into the instance variable count = count_loc; } /** * Write a string. * * @param s String to be written * * @exception IOException If an I/O error occurs */ public void write(final String s) throws IOException { // We multiply the length by three since this is the maximum length // of the characters that we can put into the buffer. It is possible // for each Unicode character to expand to three bytes. final int length = s.length(); int lengthx3 = 3*length; if (lengthx3 >= BYTES_MAX - count) { // The requested length is greater than the unused part of the buffer flushBuffer(); if (lengthx3 > BYTES_MAX) { /* * The requested length exceeds the size of the buffer, * so break it up in chunks that don't exceed the buffer size. */ final int start = 0; int split = length/CHARS_MAX; final int chunks; if (length % CHARS_MAX > 0) chunks = split + 1; else chunks = split; int end_chunk = 0; for (int chunk = 1; chunk <= chunks; chunk++) { int start_chunk = end_chunk; end_chunk = start + (int) ((((long) length) * chunk) / chunks); s.getChars(start_chunk,end_chunk, m_inputChars,0); int len_chunk = (end_chunk - start_chunk); // Adjust the end of the chunk if it ends on a high char // of a Unicode surrogate pair and low char of the pair // is not going to be in the same chunk final char c = m_inputChars[len_chunk - 1]; if (c >= 0xD800 && c <= 0xDBFF) { // Exclude char in this chunk, // to avoid spanning a Unicode character // that is in two Java chars as a high/low surrogate end_chunk--; len_chunk--; if (chunk == chunks) { /* TODO: error message needed. * The String incorrectly ends in a high char * of a high/low surrogate pair, but there is * no corresponding low as the high is the last char * Recover by ignoring this last char. */ } } this.write(m_inputChars,0, len_chunk); } return; } } s.getChars(0, length , m_inputChars, 0); final char[] chars = m_inputChars; final int n = length; final byte[] buf_loc = m_outputBytes; // local reference for faster access int count_loc = count; // local integer for faster access int i = 0; { /* This block could be omitted and the code would produce * the same result. But this block exists to give the JIT * a better chance of optimizing a tight and common loop which * occurs when writing out ASCII characters. */ char c; for(; i < n && (c = chars[i])< 0x80 ; i++ ) buf_loc[count_loc++] = (byte)c; } for (; i < n; i++) { final char c = chars[i]; if (c < 0x80) buf_loc[count_loc++] = (byte) (c); else if (c < 0x800) { buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6)); buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); } /** * The following else if condition is added to support XML 1.1 Characters for * UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) * [1101 11yy] [yyxx xxxx] (low surrogate) * * uuuuu = wwww + 1 */ else if (c >= 0xD800 && c <= 0xDBFF) { char high, low; high = c; i++; low = chars[i]; buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0)); buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f)); buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30)); buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f)); } else { buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12)); buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f)); buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f)); } } // Store the local integer back into the instance variable count = count_loc; } /** * Flush the internal buffer * * @throws IOException */ public void flushBuffer() throws IOException { if (count > 0) { m_os.write(m_outputBytes, 0, count); count = 0; } } /** * Flush the stream. If the stream has saved any characters from the * various write() methods in a buffer, write them immediately to their * intended destination. Then, if that destination is another character or * byte stream, flush it. Thus one flush() invocation will flush all the * buffers in a chain of Writers and OutputStreams. * * @exception IOException If an I/O error occurs * * @throws java.io.IOException */ public void flush() throws java.io.IOException { flushBuffer(); m_os.flush(); } /** * Close the stream, flushing it first. Once a stream has been closed, * further write() or flush() invocations will cause an IOException to be * thrown. Closing a previously-closed stream, however, has no effect. * * @exception IOException If an I/O error occurs * * @throws java.io.IOException */ public void close() throws java.io.IOException { flushBuffer(); m_os.close(); } /** * Get the output stream where the events will be serialized to. * * @return reference to the result stream, or null of only a writer was * set. */ public OutputStream getOutputStream() { return m_os; } public Writer getWriter() { // Only one of getWriter() or getOutputStream() can return null // This type of writer wraps an OutputStream, not a Writer. return null; } }