1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the  "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 package org.apache.xml.utils;
20 
21 import java.util.Arrays;
22 
23 
/**
 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
 *
 * This class defines the basic properties of characters in XML 1.1. The data
 * in this class can be used to verify that a character is a valid
 * XML 1.1 character or if the character is a space, name start, or name
 * character.
 * <p>
 * The properties of the characters in the range 0x0000-0xFFFF are kept in a
 * private lookup table whose entries are combinations of the public
 * <code>MASK_XML11_*</code> flags. Callers query that table through the
 * static convenience methods of this class. Those methods additionally
 * classify the supplementary code points 0x10000-0x10FFFF, and treat any
 * value outside the range 0x0-0x10FFFF (including negative values) as a
 * character that has none of the properties.
 *
 * @version $Id: XML11Char.java 468655 2006-10-28 07:12:06Z minchau $
 */
public class XML11Char {

    //
    // Constants
    //

    /**
     * Character flags for XML 1.1, indexed by character value (0x0000-0xFFFF).
     * Each entry is a bitwise OR of the <code>MASK_XML11_*</code> constants.
     * Entries that the static initializer never assigns keep the value 0,
     * i.e. the character has none of the properties.
     */
    private static final byte[] XML11CHARS = new byte[1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.
        //
        // Legend for the flag bytes used below (combinations of the masks):
        //     1 (0x01) = VALID
        //     3 (0x03) = VALID | SPACE
        //    17 (0x11) = VALID | CONTROL
        //    33 (0x21) = VALID | CONTENT
        //    35 (0x23) = VALID | SPACE | CONTENT
        //    45 (0x2D) = VALID | NAME_START | NAME | CONTENT
        //   -87 (0xA9) = VALID | NAME | CONTENT | NCNAME
        //   -19 (0xED) = VALID | NAME_START | NAME | CONTENT | NCNAME_START | NCNAME
        // Indexes that are never written (0, 55296-57343, 65534, 65535)
        // stay 0 and therefore carry no flag at all.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> lies in 0x0000-0xFFFF and carries the
     *         space flag; false otherwise (including negative values).
     */
    public static boolean isXML11Space(int c) {
        return (flagsOf(c) & MASK_XML11_SPACE) != 0;
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also accepts the supplementary range from 0x10000 to 0x10FFFF,
     * which the flag table (limited to 0x0000-0xFFFF) does not cover.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is flagged as valid or is a
     *         supplementary code point; false otherwise (including
     *         negative values and values above 0x10FFFF).
     */
    public static boolean isXML11Valid(int c) {
        return (flagsOf(c) & MASK_XML11_VALID) != 0
                || isSupplementary(c);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid.
     *
     * @param c The character to check.
     * @return the negation of {@link #isXML11Valid(int)}.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method will return false for the same set as
     * isXML11Valid, except it also reports false for "control characters".
     *
     * @param c The character to check.
     * @return true if <code>c</code> is valid and not flagged as a control
     *         character, or is a supplementary code point.
     */
    public static boolean isXML11ValidLiteral(int c) {
        final int flags = flagsOf(c);
        return ((flags & MASK_XML11_VALID) != 0 && (flags & MASK_XML11_CONTROL) == 0)
            || isSupplementary(c);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries the content flag or is a
     *         supplementary code point.
     */
    public static boolean isXML11Content(int c) {
        return (flagsOf(c) & MASK_XML11_CONTENT) != 0
            || isSupplementary(c);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries the content flag or the
     *         control flag, or is a supplementary code point.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return (flagsOf(c) & MASK_XML11_CONTENT_INTERNAL) != 0
            || isSupplementary(c);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries the name start flag or lies
     *         in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NameStart(int c) {
        return (flagsOf(c) & MASK_XML11_NAME_START) != 0
            || isSupplementaryNameChar(c);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries the name flag or lies in
     *         0x10000-0xEFFFF.
     */
    public static boolean isXML11Name(int c) {
        return (flagsOf(c) & MASK_XML11_NAME) != 0
            || isSupplementaryNameChar(c);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries the NCName start flag or lies
     *         in 0x10000-0xEFFFF.
     */
    public static boolean isXML11NCNameStart(int c) {
        return (flagsOf(c) & MASK_XML11_NCNAME_START) != 0
            || isSupplementaryNameChar(c);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if <code>c</code> carries the NCName flag or lies in
     *         0x10000-0xEFFFF.
     */
    public static boolean isXML11NCName(int c) {
        return (flagsOf(c) & MASK_XML11_NCNAME) != 0
            || isSupplementaryNameChar(c);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     * @return true if <code>c</code> lies in 0xD800-0xDB7F.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation
     *
     * @param name string to check
     * @return true if name is a valid Name; false for the empty string
     * @throws NullPointerException if <code>name</code> is null
     */
    public static boolean isXML11ValidName(String name) {
        final int length = name.length();
        if (length == 0) {
            return false;
        }
        int i = 1;
        char ch = name.charAt(0);
        if (!isXML11NameStart(ch)) {
            // Not a name start character on its own: it is only acceptable
            // as the high half of a surrogate pair.
            if (length > 1 && isXML11NameHighSurrogate(ch)) {
                char ch2 = name.charAt(1);
                if (!Character.isLowSurrogate(ch2)
                        || !isXML11NameStart(Character.toCodePoint(ch, ch2))) {
                    return false;
                }
                i = 2;
            }
            else {
                return false;
            }
        }
        while (i < length) {
            ch = name.charAt(i);
            if (!isXML11Name(ch)) {
                // ++i moves on to the expected low surrogate; a high
                // surrogate at the very end of the string is rejected.
                if (++i < length && isXML11NameHighSurrogate(ch)) {
                    char ch2 = name.charAt(i);
                    if (!Character.isLowSurrogate(ch2)
                            || !isXML11Name(Character.toCodePoint(ch, ch2))) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
            ++i;
        }
        return true;
    } // isXML11ValidName(String):boolean


    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation
     *
     * @param ncName string to check
     * @return true if name is a valid NCName; false for the empty string
     * @throws NullPointerException if <code>ncName</code> is null
     */
    public static boolean isXML11ValidNCName(String ncName) {
        final int length = ncName.length();
        if (length == 0) {
            return false;
        }
        int i = 1;
        char ch = ncName.charAt(0);
        if (!isXML11NCNameStart(ch)) {
            // Not an NCName start character on its own: it is only
            // acceptable as the high half of a surrogate pair.
            if (length > 1 && isXML11NameHighSurrogate(ch)) {
                char ch2 = ncName.charAt(1);
                if (!Character.isLowSurrogate(ch2)
                        || !isXML11NCNameStart(Character.toCodePoint(ch, ch2))) {
                    return false;
                }
                i = 2;
            }
            else {
                return false;
            }
        }
        while (i < length) {
            ch = ncName.charAt(i);
            if (!isXML11NCName(ch)) {
                // ++i moves on to the expected low surrogate; a high
                // surrogate at the very end of the string is rejected.
                if (++i < length && isXML11NameHighSurrogate(ch)) {
                    char ch2 = ncName.charAt(i);
                    if (!Character.isLowSurrogate(ch2)
                            || !isXML11NCName(Character.toCodePoint(ch, ch2))) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
            ++i;
        }
        return true;
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken; false for the empty string
     * @throws NullPointerException if <code>nmtoken</code> is null
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        final int length = nmtoken.length();
        if (length == 0) {
            return false;
        }
        for (int i = 0; i < length; ++i) {
            char ch = nmtoken.charAt(i);
            if (!isXML11Name(ch)) {
                // ++i moves on to the expected low surrogate; the loop
                // increment then steps past the whole pair.
                if (++i < length && isXML11NameHighSurrogate(ch)) {
                    char ch2 = nmtoken.charAt(i);
                    if (!Character.isLowSurrogate(ch2)
                            || !isXML11Name(Character.toCodePoint(ch, ch2))) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
        }
        return true;
    } // isXML11ValidNmtoken(String):boolean

    /**
     * Simple check to determine if qname is legal. If it returns false
     * then <code>str</code> is illegal; if it returns true then
     * <code>str</code> is legal.
     * <p>
     * The string is split at its first colon. With a colon present, both
     * the prefix before it and the local part after it must be valid
     * NCNames; without a colon the whole string must be a valid NCName.
     * A string that starts or ends with a colon, and the empty string,
     * are rejected.
     *
     * @param str the candidate qualified name
     * @return true if <code>str</code> is a legal QName
     * @throws NullPointerException if <code>str</code> is null
     */
    public static boolean isXML11ValidQName(String str) {

        final int colon = str.indexOf(':');

        // For the empty string both sides are -1, so it is rejected here too.
        if (colon == 0 || colon == str.length() - 1) {
            return false;
        }

        if (colon > 0) {
            final String prefix = str.substring(0, colon);
            final String localPart = str.substring(colon + 1);
            return isXML11ValidNCName(prefix) && isXML11ValidNCName(localPart);
        }
        else {
            return isXML11ValidNCName(str);
        }
    } // isXML11ValidQName(String):boolean

    //
    // Private static methods
    //

    /**
     * Returns the flag byte of a character in 0x0000-0xFFFF, or 0 (no
     * flags) for any other value, so that negative values never index
     * the table.
     */
    private static int flagsOf(int c) {
        return (c >= 0 && c < 0x10000) ? XML11CHARS[c] : 0;
    }

    /** Returns true if c is a supplementary code point (0x10000-0x10FFFF). */
    private static boolean isSupplementary(int c) {
        return 0x10000 <= c && c <= 0x10FFFF;
    }

    /** Returns true if c lies in 0x10000-0xEFFFF, the supplementary range accepted in names. */
    private static boolean isSupplementaryNameChar(int c) {
        return 0x10000 <= c && c < 0xF0000;
    }

} // class XML11Char
432 
433