XML11Char.java revision 9f8118474e9513f7a5b7d2a05e4a0fb15d1a6569
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the  "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 *     http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19package org.apache.xml.utils;
20
21import java.util.Arrays;
22
23
24/**
 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
26 *
27 * This class defines the basic properties of characters in XML 1.1. The data
28 * in this class can be used to verify that a character is a valid
29 * XML 1.1 character or if the character is a space, name start, or name
30 * character.
31 * <p>
32 * A series of convenience methods are supplied to ease the burden
33 * of the developer.  Using the character as an index into the <code>XML11CHARS</code>
34 * array and applying the appropriate mask flag (e.g.
 * <code>MASK_XML11_VALID</code>), yields the same results as calling the
 * convenience methods. There is one exception: check the comments
 * for the <code>isXML11Valid</code> method for details.
38 *
39 * @version $Id: XML11Char.java 468655 2006-10-28 07:12:06Z minchau $
40 */
public class XML11Char {

    //
    // Constants
    //

    /**
     * Character flags for XML 1.1, indexed by UTF-16 code unit
     * (0x0000-0xFFFF). Each entry is a bit set of the MASK_XML11_* flags.
     */
    private static final byte XML11CHARS [] = new byte [1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    /** First code point that is not covered by the flag table. */
    private static final int SUPPLEMENTAL_MIN = 0x10000;

    /** Last Unicode code point. */
    private static final int SUPPLEMENTAL_MAX = 0x10FFFF;

    /**
     * Exclusive upper bound of the supplemental code points accepted as
     * name characters (planes 15 and 16 are excluded).
     */
    private static final int SUPPLEMENTAL_NAME_END = 0xF0000;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Private helpers
    //

    /**
     * Returns true if <code>c</code> lies inside the flag table and at
     * least one bit of <code>mask</code> is set for it. Values outside
     * the table (negative, e.g. an end-of-stream sentinel, or above
     * 0xFFFF) yield false instead of an out-of-bounds access.
     */
    private static boolean hasFlag(int c, int mask) {
        return 0 <= c && c < XML11CHARS.length && (XML11CHARS[c] & mask) != 0;
    } // hasFlag(int,int):boolean

    /** Returns true if <code>c</code> is in the range 0x10000 to 0x10FFFF. */
    private static boolean isSupplemental(int c) {
        return SUPPLEMENTAL_MIN <= c && c <= SUPPLEMENTAL_MAX;
    } // isSupplemental(int):boolean

    /** Returns true if <code>c</code> is in the range 0x10000 to 0xEFFFF. */
    private static boolean isSupplementalNameChar(int c) {
        return SUPPLEMENTAL_MIN <= c && c < SUPPLEMENTAL_NAME_END;
    } // isSupplementalNameChar(int):boolean

    /**
     * Shared scanner behind the Name, NCName and Nmtoken validators.
     * The first character must carry <code>startMask</code>, every later
     * one <code>restMask</code>. A UTF-16 code unit that does not carry
     * the required flag is only accepted when it is a name high
     * surrogate immediately followed by a low surrogate, and the code
     * point formed by the pair is in the supplemental name range.
     *
     * @param s         string to scan
     * @param startMask flag required for the first character
     * @param restMask  flag required for all following characters
     * @return true if <code>s</code> is non-empty and every character
     *         (or surrogate pair) is accepted
     */
    private static boolean matchesNameProduction(String s, int startMask, int restMask) {
        final int length = s.length();
        if (length == 0) {
            return false;
        }
        int mask = startMask;
        int i = 0;
        while (i < length) {
            final char ch = s.charAt(i);
            if (hasFlag(ch, mask)) {
                ++i;
            }
            else {
                // Not a BMP name character: it must start a surrogate pair.
                if (!isXML11NameHighSurrogate(ch) || i + 1 >= length) {
                    return false;
                }
                final char low = s.charAt(i + 1);
                if (!Character.isLowSurrogate(low)
                    || !isSupplementalNameChar(Character.toCodePoint(ch, low))) {
                    return false;
                }
                i += 2;
            }
            mask = restMask;
        }
        return true;
    } // matchesNameProduction(String,int,int):boolean

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Space(int c) {
        return hasFlag(c, MASK_XML11_SPACE);
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also checks the surrogate character range from 0x10000 to 0x10FFFF.
     * <p>
     * If the program chooses to apply the mask directly to the
     * <code>XML11CHARS</code> array, then they are responsible for checking
     * the surrogate character range.
     *
     * @param c The character to check. Negative values are never valid.
     */
    public static boolean isXML11Valid(int c) {
        return hasFlag(c, MASK_XML11_VALID) || isSupplemental(c);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method accepts the same set as
     * <code>isXML11Valid</code> minus the characters flagged with
     * <code>MASK_XML11_CONTROL</code>.
     *
     * @param c The character to check.
     */
    public static boolean isXML11ValidLiteral(int c) {
        return (hasFlag(c, MASK_XML11_VALID) && !hasFlag(c, MASK_XML11_CONTROL))
            || isSupplemental(c);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Content(int c) {
        return hasFlag(c, MASK_XML11_CONTENT) || isSupplemental(c);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity, i.e. it carries the content
     * flag or the control flag, or lies in 0x10000 to 0x10FFFF.
     *
     * @param c The character to check.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return hasFlag(c, MASK_XML11_CONTENT_INTERNAL) || isSupplemental(c);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NameStart(int c) {
        return hasFlag(c, MASK_XML11_NAME_START) || isSupplementalNameChar(c);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     */
    public static boolean isXML11Name(int c) {
        return hasFlag(c, MASK_XML11_NAME) || isSupplementalNameChar(c);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NCNameStart(int c) {
        return hasFlag(c, MASK_XML11_NCNAME_START) || isSupplementalNameChar(c);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NCName(int c) {
        return hasFlag(c, MASK_XML11_NCNAME) || isSupplementalNameChar(c);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation
     *
     * @param name string to check
     * @return true if name is a valid Name
     */
    public static boolean isXML11ValidName(String name) {
        return matchesNameProduction(name, MASK_XML11_NAME_START, MASK_XML11_NAME);
    } // isXML11ValidName(String):boolean


    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation
     *
     * @param ncName string to check
     * @return true if name is a valid NCName
     */
    public static boolean isXML11ValidNCName(String ncName) {
        return matchesNameProduction(ncName, MASK_XML11_NCNAME_START, MASK_XML11_NCNAME);
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        // An Nmtoken has no distinguished first character.
        return matchesNameProduction(nmtoken, MASK_XML11_NAME, MASK_XML11_NAME);
    } // isXML11ValidNmtoken(String):boolean

    /**
     * Simple check to determine if qname is legal. If it returns false
     * then <code>str</code> is illegal; if it returns true then
     * <code>str</code> is legal.
     * <p>
     * A string without a colon must be a valid NCName. Otherwise the
     * parts before and after the first colon must both be valid NCNames,
     * so a leading colon, a trailing colon or a second colon is rejected.
     *
     * @param str string to check
     * @return true if str is a legal QName
     */
    public static boolean isXML11ValidQName(String str) {

        final int colon = str.indexOf(':');

        if (colon < 0) {
            // No prefix; the empty string is rejected by the NCName check.
            return isXML11ValidNCName(str);
        }

        if (colon == 0 || colon == str.length() - 1) {
            return false;
        }

        final String prefix = str.substring(0, colon);
        final String localPart = str.substring(colon + 1);
        return isXML11ValidNCName(prefix) && isXML11ValidNCName(localPart);
    } // isXML11ValidQName(String):boolean

} // class XML11Char
432
433