1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the  "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 *     http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18/*
19 * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
20 */
21package org.apache.xml.serializer;
22
23
24/**
25 * Holds information about a given encoding, which is the Java name for the
26 * encoding, the equivalent ISO name.
27 * <p>
28 * An object of this type has two useful methods
29 * <pre>
30 * isInEncoding(char ch);
31 * </pre>
32 * which can be called if the character is not the high one in
33 * a surrogate pair and:
34 * <pre>
35 * isInEncoding(char high, char low);
36 * </pre>
37 * which can be called if the two characters from a high/low surrogate pair.
38 * <p>
39 * An EncodingInfo object is a node in a binary search tree. Such a node
40 * will answer if a character is in the encoding, and do so for a given
41 * range of unicode values (<code>m_first</code> to
42 * <code>m_last</code>). It will handle a certain range of values
43 * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
44 * If the unicode point is before that explicit range, that is it
45 * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another EncodingInfo object for The root
46 * of such a tree, m_before.  Likewise for values in the range
47 * <code>m_explLast < value <= m_last</code>, but delgating to <code>m_after</code>
48 * <p>
49 * Actually figuring out if a code point is in the encoding is expensive. So the
50 * purpose of this tree is to cache such determinations, and not to build the
51 * entire tree of information at the start, but only build up as much of the
52 * tree as is used during the transformation.
53 * <p>
54 * This Class is not a public API, and should only be used internally within
55 * the serializer.
56 * <p>
57 * This class is not a public API.
58 * @xsl.usage internal
59 */
60public final class EncodingInfo extends Object
61{
62
63    /**
64     * Not all characters in an encoding are in on contiguous group,
65     * however there is a lowest contiguous group starting at '\u0001'
66     * and working up to m_highCharInContiguousGroup.
67     * <p>
68     * This is the char for which chars at or below this value are
69     * definately in the encoding, although for chars
70     * above this point they might be in the encoding.
71     * This exists for performance, especially for ASCII characters
72     * because for ASCII all chars in the range '\u0001' to '\u007F'
73     * are in the encoding.
74     *
75     */
76    private final char m_highCharInContiguousGroup;
77
78    /**
79     * The ISO encoding name.
80     */
81    final String name;
82
83    /**
84     * The name used by the Java convertor.
85     */
86    final String javaName;
87
88    /**
89     * A helper object that we can ask if a
90     * single char, or a surrogate UTF-16 pair
91     * of chars that form a single character,
92     * is in this encoding.
93     */
94    private InEncoding m_encoding;
95
96    /**
97     * This is not a public API. It returns true if the
98     * char in question is in the encoding.
99     * @param ch the char in question.
100     * <p>
101     * This method is not a public API.
102     * @xsl.usage internal
103     */
104    public boolean isInEncoding(char ch) {
105        if (m_encoding == null) {
106            m_encoding = new EncodingImpl();
107
108            // One could put alternate logic in here to
109            // instantiate another object that implements the
110            // InEncoding interface. For example if the JRE is 1.4 or up
111            // we could have an object that uses JRE 1.4 methods
112        }
113        return m_encoding.isInEncoding(ch);
114    }
115
116    /**
117     * This is not a public API. It returns true if the
118     * character formed by the high/low pair is in the encoding.
119     * @param high a char that the a high char of a high/low surrogate pair.
120     * @param low a char that is the low char of a high/low surrogate pair.
121     * <p>
122     * This method is not a public API.
123     * @xsl.usage internal
124     */
125    public boolean isInEncoding(char high, char low) {
126        if (m_encoding == null) {
127            m_encoding = new EncodingImpl();
128
129            // One could put alternate logic in here to
130            // instantiate another object that implements the
131            // InEncoding interface. For example if the JRE is 1.4 or up
132            // we could have an object that uses JRE 1.4 methods
133        }
134        return m_encoding.isInEncoding(high, low);
135    }
136
137    /**
138     * Create an EncodingInfo object based on the ISO name and Java name.
139     * If both parameters are null any character will be considered to
140     * be in the encoding. This is useful for when the serializer is in
141     * temporary output state, and has no assciated encoding.
142     *
143     * @param name reference to the ISO name.
144     * @param javaName reference to the Java encoding name.
145     * @param highChar The char for which characters at or below this value are
146     * definately in the
147     * encoding, although for characters above this point they might be in the encoding.
148     */
149    public EncodingInfo(String name, String javaName, char highChar)
150    {
151
152        this.name = name;
153        this.javaName = javaName;
154        this.m_highCharInContiguousGroup = highChar;
155    }
156
157
158
159    /**
160     * A simple interface to isolate the implementation.
161     * We could also use some new JRE 1.4 methods in another implementation
162     * provided we use reflection with them.
163     * <p>
164     * This interface is not a public API,
165     * and should only be used internally within the serializer.
166     * @xsl.usage internal
167     */
168    private interface InEncoding {
169        /**
170         * Returns true if the char is in the encoding
171         */
172        public boolean isInEncoding(char ch);
173        /**
174         * Returns true if the high/low surrogate pair forms
175         * a character that is in the encoding.
176         */
177        public boolean isInEncoding(char high, char low);
178    }
179
180    /**
181     * This class implements the
182     */
183    private class EncodingImpl implements InEncoding {
184
185
186
187        public boolean isInEncoding(char ch1) {
188            final boolean ret;
189            int codePoint = Encodings.toCodePoint(ch1);
190            if (codePoint < m_explFirst) {
191                // The unicode value is before the range
192                // that we explictly manage, so we delegate the answer.
193
194                // If we don't have an m_before object to delegate to, make one.
195                if (m_before == null)
196                    m_before =
197                        new EncodingImpl(
198                            m_encoding,
199                            m_first,
200                            m_explFirst - 1,
201                            codePoint);
202                ret = m_before.isInEncoding(ch1);
203            } else if (m_explLast < codePoint) {
204                // The unicode value is after the range
205                // that we explictly manage, so we delegate the answer.
206
207                // If we don't have an m_after object to delegate to, make one.
208                if (m_after == null)
209                    m_after =
210                        new EncodingImpl(
211                            m_encoding,
212                            m_explLast + 1,
213                            m_last,
214                            codePoint);
215                ret = m_after.isInEncoding(ch1);
216            } else {
217                // The unicode value is in the range we explitly handle
218                final int idx = codePoint - m_explFirst;
219
220                // If we already know the answer, just return it.
221                if (m_alreadyKnown[idx])
222                    ret = m_isInEncoding[idx];
223                else {
224                    // We don't know the answer, so find out,
225                    // which may be expensive, then cache the answer
226                    ret = inEncoding(ch1, m_encoding);
227                    m_alreadyKnown[idx] = true;
228                    m_isInEncoding[idx] = ret;
229                }
230            }
231            return ret;
232        }
233
234        public boolean isInEncoding(char high, char low) {
235            final boolean ret;
236            int codePoint = Encodings.toCodePoint(high,low);
237            if (codePoint < m_explFirst) {
238                // The unicode value is before the range
239                // that we explictly manage, so we delegate the answer.
240
241                // If we don't have an m_before object to delegate to, make one.
242                if (m_before == null)
243                    m_before =
244                        new EncodingImpl(
245                            m_encoding,
246                            m_first,
247                            m_explFirst - 1,
248                            codePoint);
249                ret = m_before.isInEncoding(high,low);
250            } else if (m_explLast < codePoint) {
251                // The unicode value is after the range
252                // that we explictly manage, so we delegate the answer.
253
254                // If we don't have an m_after object to delegate to, make one.
255                if (m_after == null)
256                    m_after =
257                        new EncodingImpl(
258                            m_encoding,
259                            m_explLast + 1,
260                            m_last,
261                            codePoint);
262                ret = m_after.isInEncoding(high,low);
263            } else {
264                // The unicode value is in the range we explitly handle
265                final int idx = codePoint - m_explFirst;
266
267                // If we already know the answer, just return it.
268                if (m_alreadyKnown[idx])
269                    ret = m_isInEncoding[idx];
270                else {
271                    // We don't know the answer, so find out,
272                    // which may be expensive, then cache the answer
273                    ret = inEncoding(high, low, m_encoding);
274                    m_alreadyKnown[idx] = true;
275                    m_isInEncoding[idx] = ret;
276                }
277            }
278            return ret;
279        }
280
281        /**
282         * The encoding.
283         */
284        final private String m_encoding;
285        /**
286         * m_first through m_last is the range of unicode
287         * values that this object will return an answer on.
288         * It may delegate to a similar object with a different
289         * range
290         */
291        final private int m_first;
292
293        /**
294         * m_explFirst through m_explLast is the range of unicode
295         * value that this object handles explicitly and does not
296         * delegate to a similar object.
297         */
298        final private int m_explFirst;
299        final private int m_explLast;
300        final private int m_last;
301
302        /**
303         * The object, of the same type as this one,
304         * that handles unicode values in a range before
305         * the range explictly handled by this object, and
306         * to which this object may delegate.
307         */
308        private InEncoding m_before;
309        /**
310         * The object, of the same type as this one,
311         * that handles unicode values in a range after
312         * the range explictly handled by this object, and
313         * to which this object may delegate.
314         */
315        private InEncoding m_after;
316
317        /**
318         * The number of unicode values explicitly handled
319         * by a single EncodingInfo object. This value is
320         * tuneable, but is set to 128 because that covers the
321         * entire low range of ASCII type chars within a single
322         * object.
323         */
324        private static final int RANGE = 128;
325
326        /**
327         * A flag to record if we already know the answer
328         * for the given unicode value.
329         */
330        final private boolean m_alreadyKnown[] = new boolean[RANGE];
331        /**
332         * A table holding the answer on whether the given unicode
333         * value is in the encoding.
334         */
335        final private boolean m_isInEncoding[] = new boolean[RANGE];
336
337        private EncodingImpl() {
338            // This object will answer whether any unicode value
339            // is in the encoding, it handles values 0 through Integer.MAX_VALUE
340            this(javaName, 0, Integer.MAX_VALUE, (char) 0);
341        }
342
343        private EncodingImpl(String encoding, int first, int last, int codePoint) {
344            // Set the range of unicode values that this object manages
345            // either explicitly or implicitly.
346            m_first = first;
347            m_last = last;
348
349            // Set the range of unicode values that this object
350            // explicitly manages
351            m_explFirst = codePoint;
352            m_explLast = codePoint + (RANGE-1);
353
354            m_encoding = encoding;
355
356            if (javaName != null)
357            {
358                // Some optimization.
359                if (0 <= m_explFirst && m_explFirst <= 127) {
360                    // This particular EncodingImpl explicitly handles
361                    // characters in the low range.
362                    if ("UTF8".equals(javaName)
363                        || "UTF-16".equals(javaName)
364                        || "ASCII".equals(javaName)
365                        || "US-ASCII".equals(javaName)
366                        || "Unicode".equals(javaName)
367                        || "UNICODE".equals(javaName)
368                        || javaName.startsWith("ISO8859")) {
369
370                        // Not only does this EncodingImpl object explicitly
371                        // handle chracters in the low range, it is
372                        // also one that we know something about, without
373                        // needing to call inEncoding(char ch, String encoding)
374                        // for this low range
375                        //
376                        // By initializing the table ahead of time
377                        // for these low values, we prevent the expensive
378                        // inEncoding(char ch, String encoding)
379                        // from being called, at least for these common
380                        // encodings.
381                        for (int unicode = 1; unicode < 127; unicode++) {
382                            final int idx = unicode - m_explFirst;
383                            if (0 <= idx && idx < RANGE) {
384                                m_alreadyKnown[idx] = true;
385                                m_isInEncoding[idx] = true;
386                            }
387                        }
388                    }
389                }
390
391                /* A little bit more than optimization.
392                 *
393                 * We will say that any character is in the encoding if
394                 * we don't have an encoding.
395                 * This is meaningful when the serializer is being used
396                 * in temporary output state, where we are not writing to
397                 * the final output tree.  It is when writing to the
398                 * final output tree that we need to worry about the output
399                 * encoding
400                 */
401                if (javaName == null) {
402                    for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
403                        m_alreadyKnown[idx] = true;
404                        m_isInEncoding[idx] = true;
405                    }
406                }
407            }
408        }
409    }
410
411    /**
412     * This is heart of the code that determines if a given character
413     * is in the given encoding. This method is probably expensive,
414     * and the answer should be cached.
415     * <p>
416     * This method is not a public API,
417     * and should only be used internally within the serializer.
418     * @param ch the char in question, that is not a high char of
419     * a high/low surrogate pair.
420     * @param encoding the Java name of the enocding.
421     *
422     * @xsl.usage internal
423     *
424     */
425    private static boolean inEncoding(char ch, String encoding) {
426        boolean isInEncoding;
427        try {
428            char cArray[] = new char[1];
429            cArray[0] = ch;
430            // Construct a String from the char
431            String s = new String(cArray);
432            // Encode the String into a sequence of bytes
433            // using the given, named charset.
434            byte[] bArray = s.getBytes(encoding);
435            isInEncoding = inEncoding(ch, bArray);
436
437        } catch (Exception e) {
438            isInEncoding = false;
439
440            // If for some reason the encoding is null, e.g.
441            // for a temporary result tree, we should just
442            // say that every character is in the encoding.
443            if (encoding == null)
444            	isInEncoding = true;
445        }
446        return isInEncoding;
447    }
448
449    /**
450     * This is heart of the code that determines if a given high/low
451     * surrogate pair forms a character that is in the given encoding.
452     * This method is probably expensive, and the answer should be cached.
453     * <p>
454     * This method is not a public API,
455     * and should only be used internally within the serializer.
456     * @param high the high char of
457     * a high/low surrogate pair.
458     * @param low the low char of a high/low surrogate pair.
459     * @param encoding the Java name of the encoding.
460     *
461     * @xsl.usage internal
462     *
463     */
464    private static boolean inEncoding(char high, char low, String encoding) {
465        boolean isInEncoding;
466        try {
467            char cArray[] = new char[2];
468            cArray[0] = high;
469            cArray[1] = low;
470            // Construct a String from the char
471            String s = new String(cArray);
472            // Encode the String into a sequence of bytes
473            // using the given, named charset.
474            byte[] bArray = s.getBytes(encoding);
475            isInEncoding = inEncoding(high,bArray);
476        } catch (Exception e) {
477            isInEncoding = false;
478        }
479
480        return isInEncoding;
481    }
482
483    /**
484     * This method is the core of determining if character
485     * is in the encoding. The method is not foolproof, because
486     * s.getBytes(encoding) has specified behavior only if the
487     * characters are in the specified encoding. However this
488     * method tries it's best.
489     * @param ch the char that was converted using getBytes, or
490     * the first char of a high/low pair that was converted.
491     * @param data the bytes written out by the call to s.getBytes(encoding);
492     * @return true if the character is in the encoding.
493     */
494    private static boolean inEncoding(char ch, byte[] data) {
495        final boolean isInEncoding;
496        // If the string written out as data is not in the encoding,
497        // the output is not specified according to the documentation
498        // on the String.getBytes(encoding) method,
499        // but we do our best here.
500        if (data==null || data.length == 0) {
501            isInEncoding = false;
502        }
503        else {
504            if (data[0] == 0)
505                isInEncoding = false;
506            else if (data[0] == '?' && ch != '?')
507                isInEncoding = false;
508            /*
509             * else if (isJapanese) {
510             *   // isJapanese is really
511             *   //   (    "EUC-JP".equals(javaName)
512             *   //    ||  "EUC_JP".equals(javaName)
513             *  //     ||  "SJIS".equals(javaName)   )
514             *
515             *   // Work around some bugs in JRE for Japanese
516             *   if(data[0] == 0x21)
517             *     isInEncoding = false;
518             *   else if (ch == 0xA5)
519             *     isInEncoding = false;
520             *   else
521             *     isInEncoding = true;
522             * }
523             */
524
525            else {
526                // We don't know for sure, but it looks like it is in the encoding
527                isInEncoding = true;
528            }
529        }
530        return isInEncoding;
531    }
532
533    /**
534     * This method exists for performance reasons.
535     * <p>
536     * Except for '\u0000', if a char is less than or equal to the value
537     * returned by this method then it in the encoding.
538     * <p>
539     * The characters in an encoding are not contiguous, however
540     * there is a lowest group of chars starting at '\u0001' upto and
541     * including the char returned by this method that are all in the encoding.
542     * So the char returned by this method essentially defines the lowest
543     * contiguous group.
544     * <p>
545     * chars above the value returned might be in the encoding, but
546     * chars at or below the value returned are definately in the encoding.
547     * <p>
548     * In any case however, the isInEncoding(char) method can be used
549     * regardless of the value of the char returned by this method.
550     * <p>
551     * If the value returned is '\u0000' it means that every character must be tested
552     * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)}
553     * for surrogate pairs.
554     * <p>
555     * This method is not a public API.
556     * @xsl.usage internal
557     */
558    public final char getHighChar() {
559        return m_highCharInContiguousGroup;
560    }
561
562}
563