/* Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package java.nio.charset; import java.nio.BufferOverflowException; import java.nio.BufferUnderflowException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import org.apache.harmony.niochar.internal.nls.Messages; /** * A converter that can converts a 16-bit Unicode character sequence to a byte * sequence in some charset. *
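 * <p>
 * The input character sequence is wrapped by a
 * {@link java.nio.CharBuffer CharBuffer} and the output byte sequence is
 * written to a {@link java.nio.ByteBuffer ByteBuffer}. An encoder instance
 * should be used in the following sequence, which is referred to as an
 * encoding operation:
 * <ol>
 * <li>invoking the {@link #reset() reset} method to reset the encoder if the
 * encoder has been used;</li>
 * <li>invoking the {@link #encode(CharBuffer, ByteBuffer, boolean) encode}
 * method until no additional input is needed; the {@code endOfInput}
 * parameter must be set to {@code false}, the input buffer must be filled and
 * the output buffer must be flushed between invocations;</li>
 * <li>invoking the {@link #encode(CharBuffer, ByteBuffer, boolean) encode}
 * method for the last time, with the {@code endOfInput} parameter set to
 * {@code true};</li>
 * <li>invoking the {@link #flush(ByteBuffer) flush} method to flush the
 * output.</li>
 * </ol>
 * <p>
 * The {@link #encode(CharBuffer, ByteBuffer, boolean) encode} method will
 * convert as many characters as possible, and the process won't stop until the
 * input characters have run out, the output buffer has been filled or some
 * error has happened. A {@link CoderResult CoderResult} instance will be
 * returned to indicate the stop reason, and the invoker can identify the result
 * and choose further action, which includes filling the input buffer, flushing
 * the output buffer or recovering from an error and trying again.
 * <p>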
* There are two common encoding errors. One is named malformed and it is * returned when the input content is an illegal 16-bit Unicode character * sequence, the other is named unmappable character and occurs when there is a * problem mapping the input to a valid byte sequence in the specified charset. *
* Both errors can be handled in three ways, the default one is to report the * error to the invoker by a {@link CoderResult CoderResult} instance, and the * alternatives are to ignore it or to replace the erroneous input with the * replacement byte array. The replacement byte array is '{@code ?}' by * default and can be changed by invoking the * {@link #replaceWith(byte[]) replaceWith} method. The invoker of this encoder * can choose one way by specifying a * {@link CodingErrorAction CodingErrorAction} instance for each error type via * the {@link #onMalformedInput(CodingErrorAction) onMalformedInput} method and * the {@link #onUnmappableCharacter(CodingErrorAction) onUnmappableCharacter} * method. *
* This class is abstract and encapsulates many common operations of the * encoding process for all charsets. Encoders for a specific charset should * extend this class and need only to implement the * {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop} method for basic * encoding. If a subclass maintains an internal state, it should override the * {@link #implFlush(ByteBuffer) implFlush} method and the * {@link #implReset() implReset} method in addition. *
* This class is not thread-safe.
*
* @see java.nio.charset.Charset
* @see java.nio.charset.CharsetDecoder
*/
public abstract class CharsetEncoder {
/*
 * Internal status constants: they record how far the current encoding
 * operation has progressed.
 */
// initial state; also restored by reset()
private static final int INIT = 0;
// set by encode(CharBuffer, ByteBuffer, boolean) on underflow/overflow when
// endOfInput is false
private static final int ONGOING = 1;
// set by encode(CharBuffer, ByteBuffer, boolean) on underflow/overflow when
// endOfInput is true
private static final int END = 2;
// set once flushing has completed
private static final int FLUSH = 3;

// the charset that created this encoder (assigned once, in the constructor)
private final Charset cs;

// average number of bytes this encoder produces per input character
private final float averBytes;

// maximum number of bytes this encoder can produce per input character
private final float maxBytes;

// bytes written in place of erroneous input when the action is REPLACE
private byte[] replace;

// current internal status, one of INIT, ONGOING, END or FLUSH
private int status;

// what to do when malformed input is encountered
private CodingErrorAction malformAction;

// what to do when an unmappable character is encountered
private CodingErrorAction unmapAction;

// lazily created decoder for this encoder's charset; only used to validate
// candidate replacement byte arrays
private CharsetDecoder decoder;
/**
 * Constructs a new {@code CharsetEncoder} using the given {@code Charset},
 * average number and maximum number of bytes created by this encoder for
 * one input character. The replacement byte array is the single byte
 * {@code '?'}.
 *
 * @param cs
 *            the {@code Charset} to be used by this encoder.
 * @param averageBytesPerChar
 *            average number of bytes created by this encoder for one input
 *            character, must be positive.
 * @param maxBytesPerChar
 *            maximum number of bytes which can be created by this encoder
 *            for one input character, must be positive.
 * @throws IllegalArgumentException
 *             if {@code maxBytesPerChar} or {@code averageBytesPerChar} is
 *             not positive, if {@code averageBytesPerChar} is greater than
 *             {@code maxBytesPerChar}, or if the default replacement is
 *             rejected by {@link #replaceWith(byte[]) replaceWith}.
 */
protected CharsetEncoder(Charset cs, float averageBytesPerChar,
float maxBytesPerChar) {
this(cs, averageBytesPerChar, maxBytesPerChar,
new byte[] { (byte) '?' });
}
/**
 * Constructs a new {@code CharsetEncoder} using the given {@code Charset},
 * replacement byte array, average number and maximum number of bytes
 * created by this encoder for one input character.
 * <p>
 * Both error actions start out as {@link CodingErrorAction#REPORT}.
 *
 * @param cs
 *            the {@code Charset} to be used by this encoder.
 * @param averageBytesPerChar
 *            average number of bytes created by this encoder for one single
 *            input character, must be positive and not greater than
 *            {@code maxBytesPerChar}.
 * @param maxBytesPerChar
 *            maximum number of bytes which can be created by this encoder
 *            for one single input character, must be positive.
 * @param replacement
 *            the replacement byte array, cannot be null or empty, its
 *            length cannot be larger than {@code maxBytesPerChar}, and it
 *            must be a legal replacement, which can be justified by
 *            {@link #isLegalReplacement(byte[]) isLegalReplacement}.
 * @throws IllegalArgumentException
 *             if any parameters are invalid.
 */
protected CharsetEncoder(Charset cs, float averageBytesPerChar,
        float maxBytesPerChar, byte[] replacement) {
    if (averageBytesPerChar <= 0) {
        // niochar.02=Bytes number for one character must be positive.
        throw new IllegalArgumentException(Messages.getString("niochar.02")); //$NON-NLS-1$
    }
    if (maxBytesPerChar <= 0) {
        // niochar.02=Bytes number for one character must be positive.
        throw new IllegalArgumentException(Messages.getString("niochar.02")); //$NON-NLS-1$
    }
    if (averageBytesPerChar > maxBytesPerChar) {
        // niochar.03=averageBytesPerChar is greater than maxBytesPerChar.
        throw new IllegalArgumentException(Messages.getString("niochar.03")); //$NON-NLS-1$
    }

    this.status = INIT;
    this.malformAction = CodingErrorAction.REPORT;
    this.unmapAction = CodingErrorAction.REPORT;
    this.cs = cs;
    this.averBytes = averageBytesPerChar;
    this.maxBytes = maxBytesPerChar;

    // Must come last: validation reads the charset and maxBytes set above.
    replaceWith(replacement);
}
/**
 * Returns the average number of bytes this encoder creates for a single
 * input character, as supplied at construction time.
 *
 * @return the average number of bytes created by this encoder for a single
 *         input character.
 */
public final float averageBytesPerChar() {
    return this.averBytes;
}
/**
 * Checks if the given character can be encoded by this encoder.
 * <p>
 * Note that this method can change the internal status of this encoder, so
 * it should not be called when another encoding process is ongoing,
 * otherwise it will throw an {@code IllegalStateException}.
 * <p>
 * This method can be overridden for performance improvement.
 *
 * @param c
 *            the given character.
 * @return true if the given character can be encoded by this encoder.
 * @throws IllegalStateException
 *             if another encode process is ongoing, that is, this encoder
 *             has neither just been reset nor just been flushed.
 */
public boolean canEncode(char c) {
    char[] single = { c };
    return implCanEncode(CharBuffer.wrap(single));
}
/*
 * Shared implementation of the canEncode methods.
 *
 * Temporarily switches both error actions to REPORT, runs a complete
 * encoding operation over the given buffer and reports whether it finished
 * without a CharacterCodingException. The previous error actions are
 * restored and the encoder is reset on every exit path, including when the
 * trial encoding fails with an unexpected runtime exception or error, so a
 * failed probe never leaves this encoder reconfigured or mid-operation.
 *
 * Throws IllegalStateException if the encoder is neither freshly reset nor
 * flushed when this method is entered.
 */
private boolean implCanEncode(CharBuffer cb) {
    // A flushed encoder has finished its previous operation; treat it as idle.
    if (status == FLUSH) {
        status = INIT;
    }
    if (status != INIT) {
        // niochar.0B=Another encoding process is ongoing\!
        throw new IllegalStateException(Messages.getString("niochar.0B")); //$NON-NLS-1$
    }
    CodingErrorAction malformBak = malformAction;
    CodingErrorAction unmapBak = unmapAction;
    try {
        // Errors must surface as exceptions for the probe to detect them.
        onMalformedInput(CodingErrorAction.REPORT);
        onUnmappableCharacter(CodingErrorAction.REPORT);
        this.encode(cb);
        return true;
    } catch (CharacterCodingException e) {
        return false;
    } finally {
        onMalformedInput(malformBak);
        onUnmappableCharacter(unmapBak);
        reset();
    }
}
/**
 * Checks if a given {@code CharSequence} can be encoded by this encoder.
 * <p>
 * Note that this method can change the internal status of this encoder, so
 * it should not be called when another encode process is ongoing, otherwise
 * it will throw an {@code IllegalStateException}.
 * <p>
 * This method can be overridden for performance improvement.
 *
 * @param sequence
 *            the given {@code CharSequence}.
 * @return true if the given {@code CharSequence} can be encoded by this
 *         encoder.
 * @throws IllegalStateException
 *             if another encode process is ongoing, that is, this encoder
 *             has neither just been reset nor just been flushed.
 */
public boolean canEncode(CharSequence sequence) {
    // A CharBuffer argument is duplicated so the trial encoding works on
    // its own buffer object rather than the caller's.
    CharBuffer cb = (sequence instanceof CharBuffer)
            ? ((CharBuffer) sequence).duplicate()
            : CharBuffer.wrap(sequence);
    return implCanEncode(cb);
}
/**
 * Returns the {@code Charset} that this encoder uses.
 *
 * @return the {@code Charset} which this encoder uses.
 */
public final Charset charset() {
    return this.cs;
}
/**
 * This is a facade method for the encoding operation.
 * <p>
 * This method encodes the remaining character sequence of the given
 * character buffer into a new byte buffer. This method performs a complete
 * encoding operation, resets at first, then encodes, and flushes at last.
 * The output buffer is enlarged as often as needed, including while the
 * end of the input is being processed.
 * <p>
 * This method should not be invoked if another encode operation is ongoing.
 *
 * @param in
 *            the input buffer.
 * @return a new {@code ByteBuffer} containing the bytes produced by this
 *         encoding operation. The buffer's position will be zero and its
 *         limit will be the number of bytes produced. An empty buffer is
 *         returned, without resetting this encoder, if {@code in} has no
 *         remaining characters.
 * @throws IllegalStateException
 *             if another encoding operation is ongoing.
 * @throws MalformedInputException
 *             if an illegal input character sequence for this charset is
 *             encountered, and the action for malformed error is
 *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}
 * @throws UnmappableCharacterException
 *             if a legal but unmappable input character sequence for this
 *             charset is encountered, and the action for unmappable
 *             character error is
 *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.
 *             Unmappable means the Unicode character sequence at the input
 *             buffer's current position cannot be mapped to an equivalent
 *             byte sequence.
 * @throws CharacterCodingException
 *             if other exception happened during the encode operation.
 */
public final ByteBuffer encode(CharBuffer in)
        throws CharacterCodingException {
    if (in.remaining() == 0) {
        return ByteBuffer.allocate(0);
    }
    reset();
    // Initial size is only an estimate; the buffer grows on overflow.
    int length = (int) (in.remaining() * averBytes);
    ByteBuffer output = ByteBuffer.allocate(length);
    CoderResult result = null;

    // Phase 1: encode the bulk of the input.
    while (true) {
        result = encode(in, output, false);
        if (result == CoderResult.UNDERFLOW) {
            break;
        } else if (result == CoderResult.OVERFLOW) {
            output = allocateMore(output);
            continue;
        }
        checkCoderResult(result);
    }

    // Phase 2: signal end of input. This call can still overflow (for
    // example when trailing input has to be replaced and the buffer is
    // already full), so grow the buffer and retry instead of dropping the
    // pending bytes.
    while (true) {
        result = encode(in, output, true);
        if (result != CoderResult.OVERFLOW) {
            break;
        }
        output = allocateMore(output);
    }
    checkCoderResult(result);

    // Phase 3: flush any bytes the encoder still holds back.
    while (true) {
        result = flush(output);
        if (result == CoderResult.UNDERFLOW) {
            break;
        } else if (result == CoderResult.OVERFLOW) {
            output = allocateMore(output);
            continue;
        }
        // An error result from flushing is always reported, whatever the
        // configured error actions are.
        if (result.isMalformed()) {
            throw new MalformedInputException(result.length());
        } else if (result.isUnmappable()) {
            throw new UnmappableCharacterException(result.length());
        }
        break;
    }
    output.flip();
    status = FLUSH;
    return output;
}
/*
 * Turns an error result into the matching CharacterCodingException, but
 * only when the action configured for that kind of error is REPORT.
 * Non-error results, and errors whose action is not REPORT, pass through
 * silently.
 */
private void checkCoderResult(CoderResult result)
        throws CharacterCodingException {
    final CodingErrorAction report = CodingErrorAction.REPORT;
    if (malformAction == report && result.isMalformed()) {
        throw new MalformedInputException(result.length());
    }
    if (unmapAction == report && result.isUnmappable()) {
        throw new UnmappableCharacterException(result.length());
    }
}
/*
 * Returns a larger replacement for the given output buffer: twice the
 * capacity, holding the bytes written so far and positioned right after
 * them. A zero-capacity buffer is replaced by a fresh one-byte buffer.
 * The given buffer is flipped as a side effect and must not be reused.
 */
private ByteBuffer allocateMore(ByteBuffer output) {
    int oldCapacity = output.capacity();
    if (oldCapacity == 0) {
        return ByteBuffer.allocate(1);
    }
    ByteBuffer grown = ByteBuffer.allocate(oldCapacity * 2);
    // Flip so that the bytes written so far become readable for the copy.
    output.flip();
    return grown.put(output);
}
/**
 * Encodes characters starting at the current position of the given input
 * buffer, and writes the equivalent byte sequence into the given output
 * buffer from its current position.
 * <p>
 * The buffers' positions will be changed by the reading and writing
 * operations, but their limits and marks will be kept intact.
 * <p>
 * A {@code CoderResult} instance will be returned according to the
 * following rules:
 * <ul>
 * <li>{@link CoderResult#UNDERFLOW} when
 * {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop} reports an
 * underflow and either {@code endOfInput} is false or no input remains;</li>
 * <li>{@link CoderResult#OVERFLOW} when {@code encodeLoop} reports an
 * overflow, or when the replacement bytes do not fit into the remaining
 * space of the output buffer;</li>
 * <li>a malformed-input or unmappable-character result when such an error
 * is encountered and the action configured for it is neither
 * {@link CodingErrorAction#REPLACE} nor {@link CodingErrorAction#IGNORE}.</li>
 * </ul>
 * When the action is {@code IGNORE} the erroneous input is skipped, when it
 * is {@code REPLACE} the replacement bytes are written and the erroneous
 * input is skipped; in both cases encoding continues.
 * <p>
 * The {@code endOfInput} parameter indicates if the invoker can provide
 * further input. This parameter is true if and only if the characters in
 * the current input buffer are all inputs for this encoding operation. Note
 * that it is common and won't cause an error if the invoker sets false and
 * then has no more input available, while it may cause an error if the
 * invoker always sets true in several consecutive invocations: input that
 * is still remaining after {@code encodeLoop} underflows is then treated as
 * malformed input.
 * <p>
 * This method invokes the
 * {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop} method to
 * implement the basic encode logic for a specific charset.
 *
 * @param in
 *            the input buffer.
 * @param out
 *            the output buffer.
 * @param endOfInput
 *            true if all the input characters have been provided.
 * @return a {@code CoderResult} instance indicating the result.
 * @throws IllegalStateException
 *             if this encoder has already been flushed, or if
 *             {@code endOfInput} is false although the end of the input
 *             has already been signalled in this encoding operation.
 * @throws CoderMalfunctionError
 *             If the {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop}
 *             method threw a {@code BufferUnderflowException} or
 *             {@code BufferOverflowException}.
 */
public final CoderResult encode(CharBuffer in, ByteBuffer out,
        boolean endOfInput) {
    if ((status == FLUSH) || (!endOfInput && status == END)) {
        throw new IllegalStateException();
    }

    for (;;) {
        CoderResult outcome;
        try {
            outcome = encodeLoop(in, out);
        } catch (BufferOverflowException e) {
            throw new CoderMalfunctionError(e);
        } catch (BufferUnderflowException e) {
            throw new CoderMalfunctionError(e);
        }

        if (outcome == CoderResult.UNDERFLOW
                || outcome == CoderResult.OVERFLOW) {
            status = endOfInput ? END : ONGOING;
            if (outcome == CoderResult.OVERFLOW || !endOfInput
                    || !in.hasRemaining()) {
                return outcome;
            }
            // End of input was signalled but characters are left over:
            // handle them as malformed input of that length.
            outcome = CoderResult.malformedForLength(in.remaining());
        }

        CodingErrorAction action = outcome.isUnmappable() ? unmapAction
                : malformAction;
        if (action == CodingErrorAction.REPLACE) {
            if (out.remaining() < replace.length) {
                return CoderResult.OVERFLOW;
            }
            out.put(replace);
        } else if (action != CodingErrorAction.IGNORE) {
            return outcome;
        }

        // IGNORE or REPLACE: step over the offending input and carry on.
        in.position(in.position() + outcome.length());
    }
}
/**
 * Encodes characters into bytes. This method is called by
 * {@link #encode(CharBuffer, ByteBuffer, boolean) encode}.
 * <p>
 * This method will implement the essential encoding operation, and it won't
 * stop encoding until either all the input characters are read, the output
 * buffer is filled, or some exception is encountered. Then it will return a
 * {@code CoderResult} object indicating the result of the current encoding
 * operation. The rule to construct the {@code CoderResult} is the same as
 * for {@link #encode(CharBuffer, ByteBuffer, boolean) encode}. When an
 * exception is encountered in the encoding operation, most implementations
 * of this method will return a relevant result object to the
 * {@link #encode(CharBuffer, ByteBuffer, boolean) encode} method, and some
 * performance optimized implementation may handle the exception and
 * implement the error action itself.
 * <p>
 * The buffers are scanned from their current positions, and their positions
 * will be modified accordingly, while their marks and limits will be
 * intact. At most {@link CharBuffer#remaining() in.remaining()} characters
 * will be read, and {@link ByteBuffer#remaining() out.remaining()} bytes
 * will be written.
 * <p>
 * Note that some implementations may pre-scan the input buffer and return
 * {@code CoderResult.UNDERFLOW} until it receives sufficient input.
 *
 * @param in
 *            the input buffer.
 * @param out
 *            the output buffer.
 * @return a {@code CoderResult} instance indicating the result.
 */
protected abstract CoderResult encodeLoop(CharBuffer in, ByteBuffer out);
/**
 * Flushes this encoder.
 * <p>
 * This method will call {@link #implFlush(ByteBuffer) implFlush}. Some
 * encoders may need to write some bytes to the output buffer when they have
 * read all input characters; subclasses can override
 * {@link #implFlush(ByteBuffer) implFlush} to perform the writing action.
 * <p>
 * The number of written bytes won't be larger than
 * {@link ByteBuffer#remaining() out.remaining()}. If some encoder wants to
 * write more bytes than the output buffer's available remaining space, then
 * {@code CoderResult.OVERFLOW} will be returned, and this method must be
 * called again with a byte buffer that has free space. Otherwise this
 * method will return {@code CoderResult.UNDERFLOW}, which means one
 * encoding process has been completed successfully.
 * <p>
 * During the flush, the output buffer's position will be changed
 * accordingly, while its mark and limit will be intact.
 *
 * @param out
 *            the given output buffer.
 * @return {@code CoderResult.UNDERFLOW} or {@code CoderResult.OVERFLOW}.
 * @throws IllegalStateException
 *             if this encoder hasn't read all input characters during one
 *             encoding process, which means neither after calling
 *             {@link #encode(CharBuffer) encode(CharBuffer)} nor after
 *             calling {@link #encode(CharBuffer, ByteBuffer, boolean)
 *             encode(CharBuffer, ByteBuffer, boolean)} with {@code true}
 *             for the last boolean parameter. A freshly reset encoder is
 *             also accepted.
 */
public final CoderResult flush(ByteBuffer out) {
    boolean flushable = (status == END) || (status == INIT);
    if (!flushable) {
        throw new IllegalStateException();
    }
    CoderResult outcome = implFlush(out);
    // Only a completed flush ends the operation; on overflow the caller
    // is expected to call again.
    if (outcome == CoderResult.UNDERFLOW) {
        status = FLUSH;
    }
    return outcome;
}
/**
 * Flushes this encoder. The default implementation does nothing and always
 * returns {@code CoderResult.UNDERFLOW}; this method can be overridden if
 * needed.
 *
 * @param out
 *            the output buffer.
 * @return {@code CoderResult.UNDERFLOW} or {@code CoderResult.OVERFLOW}.
 */
protected CoderResult implFlush(ByteBuffer out) {
return CoderResult.UNDERFLOW;
}
/**
 * Notifies that this encoder's {@code CodingErrorAction} specified for
 * malformed input error has been changed. The default implementation does
 * nothing; this method can be overridden if needed.
 *
 * @param newAction
 *            the new action.
 */
protected void implOnMalformedInput(CodingErrorAction newAction) {
// default implementation is empty
}
/**
 * Notifies that this encoder's {@code CodingErrorAction} specified for
 * unmappable character error has been changed. The default implementation
 * does nothing; this method can be overridden if needed.
 *
 * @param newAction
 *            the new action.
 */
protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
// default implementation is empty
}
/**
 * Notifies that this encoder's replacement has been changed. The default
 * implementation does nothing; this method can be overridden if needed.
 *
 * @param newReplacement
 *            the new replacement byte array.
 */
protected void implReplaceWith(byte[] newReplacement) {
// default implementation is empty
}
/**
 * Resets this encoder's charset related state. The default implementation
 * does nothing; this method can be overridden if needed. It is invoked by
 * {@link #reset() reset}.
 */
protected void implReset() {
// default implementation is empty
}
/**
 * Checks if the given argument is legal as this encoder's replacement byte
 * array.
 * <p>
 * The given byte array is legal if and only if decoding it with a decoder
 * of this encoder's charset, with both error actions set to
 * {@link CodingErrorAction#REPORT}, does not produce an error result.
 * <p>
 * This method can be overridden for performance improvement.
 *
 * @param repl
 *            the given byte array to be checked.
 * @return true if the given argument is legal as this encoder's
 *         replacement byte array.
 */
public boolean isLegalReplacement(byte[] repl) {
    if (decoder == null) {
        // The decoder is private to this check, so it is configured once
        // and never needs its actions restored.
        decoder = cs.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    } else {
        // Discard whatever state an earlier check left behind so that it
        // cannot influence this one.
        decoder.reset();
    }
    ByteBuffer in = ByteBuffer.wrap(repl);
    CharBuffer out = CharBuffer.allocate((int) (repl.length * decoder
            .maxCharsPerByte()));
    CoderResult result = decoder.decode(in, out, true);
    return !result.isError();
}
/**
 * Returns the {@code CodingErrorAction} this encoder applies when a
 * malformed input error occurs during the encoding process.
 *
 * @return this encoder's {@code CodingErrorAction} for malformed input
 *         errors.
 */
public CodingErrorAction malformedInputAction() {
    return this.malformAction;
}
/**
 * Returns the maximum number of bytes which can be created by this encoder
 * for one input character, as supplied at construction time. The value is
 * always positive.
 *
 * @return the maximum number of bytes which can be created by this encoder
 *         for one input character.
 */
public final float maxBytesPerChar() {
    return this.maxBytes;
}
/**
 * Sets this encoder's action on malformed input error.
 * <p>
 * After storing the new action, this method calls the
 * {@link #implOnMalformedInput(CodingErrorAction) implOnMalformedInput}
 * method with the given new action as argument.
 *
 * @param newAction
 *            the new action on malformed input error.
 * @return this encoder.
 * @throws IllegalArgumentException
 *             if the given newAction is null.
 */
public final CharsetEncoder onMalformedInput(CodingErrorAction newAction) {
    if (newAction == null) {
        // niochar.0C=Action on malformed input error cannot be null\!
        throw new IllegalArgumentException(Messages.getString("niochar.0C")); //$NON-NLS-1$
    }
    this.malformAction = newAction;
    implOnMalformedInput(newAction);
    return this;
}
/**
 * Sets this encoder's action on unmappable character error.
 * <p>
 * After storing the new action, this method calls the
 * {@link #implOnUnmappableCharacter(CodingErrorAction) implOnUnmappableCharacter}
 * method with the given new action as argument.
 *
 * @param newAction
 *            the new action on unmappable character error.
 * @return this encoder.
 * @throws IllegalArgumentException
 *             if the given newAction is null.
 */
public final CharsetEncoder onUnmappableCharacter(
        CodingErrorAction newAction) {
    if (newAction == null) {
        // niochar.0D=Action on unmappable character error cannot be null\!
        throw new IllegalArgumentException(Messages.getString("niochar.0D")); //$NON-NLS-1$
    }
    this.unmapAction = newAction;
    implOnUnmappableCharacter(newAction);
    return this;
}
/**
 * Gets the replacement byte array, which is never null or empty once this
 * encoder has been constructed.
 * <p>
 * A copy is returned, so modifying the returned array has no effect on
 * this encoder; use {@link #replaceWith(byte[]) replaceWith} to change the
 * replacement, which also validates it.
 *
 * @return a copy of the replacement byte array.
 */
public final byte[] replacement() {
    byte[] current = replace;
    // The field is only unset while the constructor is still running.
    return current == null ? null : (byte[]) current.clone();
}
/**
 * Sets the new replacement value.
 * <p>
 * This method first checks the given replacement's validity, then stores a
 * copy of it as the replacement value and finally calls the
 * {@link #implReplaceWith(byte[]) implReplaceWith} method with the stored
 * replacement as argument. Because a copy is stored, later changes to the
 * array passed in do not alter the validated replacement.
 *
 * @param replacement
 *            the replacement byte array, cannot be null or empty, its
 *            length cannot be larger than {@code maxBytesPerChar}, and it
 *            must be a legal replacement, which can be justified by
 *            calling {@link #isLegalReplacement(byte[]) isLegalReplacement}.
 * @return this encoder.
 * @throws IllegalArgumentException
 *             if the given replacement cannot satisfy the requirement
 *             mentioned above.
 */
public final CharsetEncoder replaceWith(byte[] replacement) {
    if (null == replacement || 0 == replacement.length
            || maxBytes < replacement.length
            || !isLegalReplacement(replacement)) {
        // niochar.0E=Replacement is illegal
        throw new IllegalArgumentException(Messages.getString("niochar.0E")); //$NON-NLS-1$
    }
    // Keep a private copy so the caller cannot change it after validation.
    replace = (byte[]) replacement.clone();
    implReplaceWith(replace);
    return this;
}
/**
 * Resets this encoder. This method will reset the internal status and then
 * call {@link #implReset() implReset} to reset any status related to the
 * specific charset.
 *
 * @return this encoder.
 */
public final CharsetEncoder reset() {
    this.status = INIT;
    implReset();
    return this;
}
/**
 * Returns the {@code CodingErrorAction} this encoder applies when an
 * unmappable character is encountered during the encoding process.
 *
 * @return this encoder's {@code CodingErrorAction} for unmappable
 *         character errors.
 */
public CodingErrorAction unmappableCharacterAction() {
    return this.unmapAction;
}
}