/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xml.utils;

import java.util.Arrays;


/**
 * THIS IS A COPY OF THE XERCES-2J CLASS org.apache.xerces.util.XML11Char
 *
 * This class defines the basic properties of characters in XML 1.1. The data
 * in this class can be used to verify that a character is a valid
 * XML 1.1 character or if the character is a space, name start, or name
 * character.
 * <p>
 * A series of convenience methods are supplied to ease the burden
 * of the developer. Each convenience method looks the character up in the
 * private <code>XML11CHARS</code> flag table and applies the matching
 * public mask (e.g. <code>MASK_XML11_VALID</code>). Characters above
 * 0xFFFF are not in the table and are handled by explicit range checks;
 * see the comments on <code>isXML11Valid</code> for details.
 *
 * @version $Id: XML11Char.java 468655 2006-10-28 07:12:06Z minchau $
 */
public class XML11Char {

    //
    // Constants
    //

    /** Character flags for XML 1.1, indexed by UTF-16 code unit (0x0000 - 0xFFFF). */
    private static final byte[] XML11CHARS = new byte[1 << 16];

    /** XML 1.1 Valid character mask. */
    public static final int MASK_XML11_VALID = 0x01;

    /** XML 1.1 Space character mask. */
    public static final int MASK_XML11_SPACE = 0x02;

    /** XML 1.1 Name start character mask. */
    public static final int MASK_XML11_NAME_START = 0x04;

    /** XML 1.1 Name character mask. */
    public static final int MASK_XML11_NAME = 0x08;

    /** XML 1.1 control character mask */
    public static final int MASK_XML11_CONTROL = 0x10;

    /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
    public static final int MASK_XML11_CONTENT = 0x20;

    /** XML namespaces 1.1 NCNameStart */
    public static final int MASK_XML11_NCNAME_START = 0x40;

    /** XML namespaces 1.1 NCName */
    public static final int MASK_XML11_NCNAME = 0x80;

    /** XML 1.1 content for internal entities (valid - "special" chars) */
    public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL | MASK_XML11_CONTENT;

    //
    // Static initialization
    //

    static {

        // Initializing the Character Flag Array
        // Code generated by: XML11CharGenerator.
        //
        // Entries that are never assigned below (index 0, 55296-57343 and
        // 65534-65535) keep the array default of 0, i.e. no flag set.

        Arrays.fill(XML11CHARS, 1, 9, (byte) 17 ); // Fill 8 of value (byte) 17
        XML11CHARS[9] = 35;
        XML11CHARS[10] = 3;
        Arrays.fill(XML11CHARS, 11, 13, (byte) 17 ); // Fill 2 of value (byte) 17
        XML11CHARS[13] = 3;
        Arrays.fill(XML11CHARS, 14, 32, (byte) 17 ); // Fill 18 of value (byte) 17
        XML11CHARS[32] = 35;
        Arrays.fill(XML11CHARS, 33, 38, (byte) 33 ); // Fill 5 of value (byte) 33
        XML11CHARS[38] = 1;
        Arrays.fill(XML11CHARS, 39, 45, (byte) 33 ); // Fill 6 of value (byte) 33
        Arrays.fill(XML11CHARS, 45, 47, (byte) -87 ); // Fill 2 of value (byte) -87
        XML11CHARS[47] = 33;
        Arrays.fill(XML11CHARS, 48, 58, (byte) -87 ); // Fill 10 of value (byte) -87
        XML11CHARS[58] = 45;
        XML11CHARS[59] = 33;
        XML11CHARS[60] = 1;
        Arrays.fill(XML11CHARS, 61, 65, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 65, 91, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 91, 93, (byte) 33 ); // Fill 2 of value (byte) 33
        XML11CHARS[93] = 1;
        XML11CHARS[94] = 33;
        XML11CHARS[95] = -19;
        XML11CHARS[96] = 33;
        Arrays.fill(XML11CHARS, 97, 123, (byte) -19 ); // Fill 26 of value (byte) -19
        Arrays.fill(XML11CHARS, 123, 127, (byte) 33 ); // Fill 4 of value (byte) 33
        Arrays.fill(XML11CHARS, 127, 133, (byte) 17 ); // Fill 6 of value (byte) 17
        XML11CHARS[133] = 35;
        Arrays.fill(XML11CHARS, 134, 160, (byte) 17 ); // Fill 26 of value (byte) 17
        Arrays.fill(XML11CHARS, 160, 183, (byte) 33 ); // Fill 23 of value (byte) 33
        XML11CHARS[183] = -87;
        Arrays.fill(XML11CHARS, 184, 192, (byte) 33 ); // Fill 8 of value (byte) 33
        Arrays.fill(XML11CHARS, 192, 215, (byte) -19 ); // Fill 23 of value (byte) -19
        XML11CHARS[215] = 33;
        Arrays.fill(XML11CHARS, 216, 247, (byte) -19 ); // Fill 31 of value (byte) -19
        XML11CHARS[247] = 33;
        Arrays.fill(XML11CHARS, 248, 768, (byte) -19 ); // Fill 520 of value (byte) -19
        Arrays.fill(XML11CHARS, 768, 880, (byte) -87 ); // Fill 112 of value (byte) -87
        Arrays.fill(XML11CHARS, 880, 894, (byte) -19 ); // Fill 14 of value (byte) -19
        XML11CHARS[894] = 33;
        Arrays.fill(XML11CHARS, 895, 8192, (byte) -19 ); // Fill 7297 of value (byte) -19
        Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33 ); // Fill 12 of value (byte) 33
        Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19 ); // Fill 2 of value (byte) -19
        Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33 ); // Fill 26 of value (byte) 33
        XML11CHARS[8232] = 35;
        Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33 ); // Fill 22 of value (byte) 33
        Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87 ); // Fill 2 of value (byte) -87
        Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33 ); // Fill 47 of value (byte) 33
        Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19 ); // Fill 288 of value (byte) -19
        Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33 ); // Fill 2672 of value (byte) 33
        Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19 ); // Fill 1008 of value (byte) -19
        Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33 ); // Fill 17 of value (byte) 33
        Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19 ); // Fill 43007 of value (byte) -19
        Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33 ); // Fill 6400 of value (byte) 33
        Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19 ); // Fill 1232 of value (byte) -19
        Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33 ); // Fill 32 of value (byte) 33
        Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19 ); // Fill 526 of value (byte) -19

    } // <clinit>()

    //
    // Private helpers
    //

    /**
     * Tests the flag byte stored for <code>c</code> against <code>mask</code>.
     * <p>
     * The lower bound check matters: without it a negative argument
     * would index outside the table and throw
     * <code>ArrayIndexOutOfBoundsException</code>.
     *
     * @param c    The character to look up.
     * @param mask One or more of the <code>MASK_XML11_*</code> bits.
     * @return true if <code>c</code> is in the range 0x0000 - 0xFFFF and at
     *         least one bit of <code>mask</code> is set for it; false
     *         otherwise, including for negative values and values above 0xFFFF.
     */
    private static boolean hasFlag(int c, int mask) {
        return 0 <= c && c < 0x10000 && (XML11CHARS[c] & mask) != 0;
    } // hasFlag(int,int):boolean

    //
    // Public static methods
    //

    /**
     * Returns true if the specified character is a space character
     * as amended in the XML 1.1 specification.
     *
     * @param c The character to check.
     * @return true if the space flag is set for <code>c</code>; false for
     *         every other value, including negative values and values
     *         above 0xFFFF.
     */
    public static boolean isXML11Space(int c) {
        return hasFlag(c, MASK_XML11_SPACE);
    } // isXML11Space(int):boolean

    /**
     * Returns true if the specified character is valid. This method
     * also accepts the supplementary range from 0x10000 to 0x10FFFF,
     * which is not covered by the flag table.
     * <p>
     * Negative values are reported as not valid.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the valid flag set or lies in
     *         0x10000 - 0x10FFFF.
     */
    public static boolean isXML11Valid(int c) {
        return hasFlag(c, MASK_XML11_VALID)
            || (0x10000 <= c && c <= 0x10FFFF);
    } // isXML11Valid(int):boolean

    /**
     * Returns true if the specified character is invalid.
     * This is the exact negation of {@link #isXML11Valid(int)}.
     *
     * @param c The character to check.
     * @return true if <code>isXML11Valid(c)</code> is false.
     */
    public static boolean isXML11Invalid(int c) {
        return !isXML11Valid(c);
    } // isXML11Invalid(int):boolean

    /**
     * Returns true if the specified character is valid and permitted outside
     * of a character reference.
     * That is, this method returns false for everything
     * <code>isXML11Valid</code> rejects, and additionally returns false
     * for characters carrying the control flag.
     *
     * @param c The character to check.
     * @return true if <code>c</code> is valid and not flagged as a control
     *         character, or lies in 0x10000 - 0x10FFFF.
     */
    public static boolean isXML11ValidLiteral(int c) {
        return (hasFlag(c, MASK_XML11_VALID) && !hasFlag(c, MASK_XML11_CONTROL))
            || (0x10000 <= c && c <= 0x10FFFF);
    } // isXML11ValidLiteral(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an external parsed entity.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the content flag set or lies in
     *         0x10000 - 0x10FFFF.
     */
    public static boolean isXML11Content(int c) {
        return hasFlag(c, MASK_XML11_CONTENT)
            || (0x10000 <= c && c <= 0x10FFFF);
    } // isXML11Content(int):boolean

    /**
     * Returns true if the specified character can be considered
     * content in an internal parsed entity.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the content flag or the control
     *         flag set, or lies in 0x10000 - 0x10FFFF.
     */
    public static boolean isXML11InternalEntityContent(int c) {
        return hasFlag(c, MASK_XML11_CONTENT_INTERNAL)
            || (0x10000 <= c && c <= 0x10FFFF);
    } // isXML11InternalEntityContent(int):boolean

    /**
     * Returns true if the specified character is a valid name start
     * character as defined by production [4] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the name start flag set or lies
     *         in 0x10000 - 0xEFFFF.
     */
    public static boolean isXML11NameStart(int c) {
        return hasFlag(c, MASK_XML11_NAME_START)
            || (0x10000 <= c && c < 0xF0000);
    } // isXML11NameStart(int):boolean

    /**
     * Returns true if the specified character is a valid name
     * character as defined by production [4a] in the XML 1.1
     * specification.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the name flag set or lies in
     *         0x10000 - 0xEFFFF.
     */
    public static boolean isXML11Name(int c) {
        return hasFlag(c, MASK_XML11_NAME)
            || (0x10000 <= c && c < 0xF0000);
    } // isXML11Name(int):boolean

    /**
     * Returns true if the specified character is a valid NCName start
     * character as defined by production [4] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the NCName start flag set or lies
     *         in 0x10000 - 0xEFFFF.
     */
    public static boolean isXML11NCNameStart(int c) {
        return hasFlag(c, MASK_XML11_NCNAME_START)
            || (0x10000 <= c && c < 0xF0000);
    } // isXML11NCNameStart(int):boolean

    /**
     * Returns true if the specified character is a valid NCName
     * character as defined by production [5] in Namespaces in XML
     * 1.1 recommendation.
     *
     * @param c The character to check.
     * @return true if <code>c</code> has the NCName flag set or lies in
     *         0x10000 - 0xEFFFF.
     */
    public static boolean isXML11NCName(int c) {
        return hasFlag(c, MASK_XML11_NCNAME)
            || (0x10000 <= c && c < 0xF0000);
    } // isXML11NCName(int):boolean

    /**
     * Returns whether the given character is a valid
     * high surrogate for a name character. This includes
     * all high surrogates for characters [0x10000-0xEFFFF].
     * In other words everything excluding planes 15 and 16.
     *
     * @param c The character to check.
     * @return true if <code>c</code> lies in 0xD800 - 0xDB7F.
     */
    public static boolean isXML11NameHighSurrogate(int c) {
        return (0xD800 <= c && c <= 0xDB7F);
    } // isXML11NameHighSurrogate(int):boolean

    /*
     * [5] Name ::= NameStartChar NameChar*
     */
    /**
     * Check to see if a string is a valid Name according to [5]
     * in the XML 1.1 Recommendation
     *
     * @param name string to check
     * @return true if name is a valid Name
     */
    public static boolean isXML11ValidName(String name) {
        int length = name.length();
        if (length == 0) {
            return false;
        }
        int i = 1;
        char ch = name.charAt(0);
        if (!isXML11NameStart(ch)) {
            // Not a BMP name start: only acceptable as the first half of a
            // surrogate pair that encodes a supplementary name start.
            if (length > 1 && isXML11NameHighSurrogate(ch)) {
                char ch2 = name.charAt(1);
                if (!XMLChar.isLowSurrogate(ch2) ||
                    !isXML11NameStart(XMLChar.supplemental(ch, ch2))) {
                    return false;
                }
                i = 2;
            }
            else {
                return false;
            }
        }
        while (i < length) {
            ch = name.charAt(i);
            if (!isXML11Name(ch)) {
                // ++i steps onto the candidate low surrogate; the ++i at
                // the bottom of the loop then steps past the whole pair.
                if (++i < length && isXML11NameHighSurrogate(ch)) {
                    char ch2 = name.charAt(i);
                    if (!XMLChar.isLowSurrogate(ch2) ||
                        !isXML11Name(XMLChar.supplemental(ch, ch2))) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
            ++i;
        }
        return true;
    } // isXML11ValidName(String):boolean


    /*
     * from the namespace 1.1 rec
     * [4] NCName ::= NCNameStartChar NCNameChar*
     */
    /**
     * Check to see if a string is a valid NCName according to [4]
     * from the XML Namespaces 1.1 Recommendation
     *
     * @param ncName string to check
     * @return true if name is a valid NCName
     */
    public static boolean isXML11ValidNCName(String ncName) {
        int length = ncName.length();
        if (length == 0) {
            return false;
        }
        int i = 1;
        char ch = ncName.charAt(0);
        if (!isXML11NCNameStart(ch)) {
            // Not a BMP NCName start: only acceptable as the first half of
            // a surrogate pair that encodes a supplementary NCName start.
            if (length > 1 && isXML11NameHighSurrogate(ch)) {
                char ch2 = ncName.charAt(1);
                if (!XMLChar.isLowSurrogate(ch2) ||
                    !isXML11NCNameStart(XMLChar.supplemental(ch, ch2))) {
                    return false;
                }
                i = 2;
            }
            else {
                return false;
            }
        }
        while (i < length) {
            ch = ncName.charAt(i);
            if (!isXML11NCName(ch)) {
                // ++i steps onto the candidate low surrogate; the ++i at
                // the bottom of the loop then steps past the whole pair.
                if (++i < length && isXML11NameHighSurrogate(ch)) {
                    char ch2 = ncName.charAt(i);
                    if (!XMLChar.isLowSurrogate(ch2) ||
                        !isXML11NCName(XMLChar.supplemental(ch, ch2))) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
            ++i;
        }
        return true;
    } // isXML11ValidNCName(String):boolean

    /*
     * [7] Nmtoken ::= (NameChar)+
     */
    /**
     * Check to see if a string is a valid Nmtoken according to [7]
     * in the XML 1.1 Recommendation
     *
     * @param nmtoken string to check
     * @return true if nmtoken is a valid Nmtoken
     */
    public static boolean isXML11ValidNmtoken(String nmtoken) {
        int length = nmtoken.length();
        if (length == 0) {
            return false;
        }
        for (int i = 0; i < length; ++i) {
            char ch = nmtoken.charAt(i);
            if (!isXML11Name(ch)) {
                // ++i steps onto the candidate low surrogate; the loop
                // increment then steps past the whole pair.
                if (++i < length && isXML11NameHighSurrogate(ch)) {
                    char ch2 = nmtoken.charAt(i);
                    if (!XMLChar.isLowSurrogate(ch2) ||
                        !isXML11Name(XMLChar.supplemental(ch, ch2))) {
                        return false;
                    }
                }
                else {
                    return false;
                }
            }
        }
        return true;
    } // isXML11ValidNmtoken(String):boolean

    /**
     * Simple check to determine if qname is legal. If it returns false
     * then <code>str</code> is illegal; if it returns true then
     * <code>str</code> is legal.
     * <p>
     * The string is split at its first colon. A leading or trailing colon
     * is rejected; otherwise the prefix and the local part (or the whole
     * string when there is no colon) must each pass
     * {@link #isXML11ValidNCName(String)}.
     *
     * @param str string to check
     * @return true if str is a legal qname
     */
    public static boolean isXML11ValidQName(String str) {

        final int colon = str.indexOf(':');

        // For the empty string both colon and length() - 1 are -1, so it
        // is rejected here as well.
        if (colon == 0 || colon == str.length() - 1) {
            return false;
        }

        if (colon > 0) {
            final String prefix = str.substring(0, colon);
            final String localPart = str.substring(colon + 1);
            return isXML11ValidNCName(prefix) && isXML11ValidNCName(localPart);
        }
        else {
            return isXML11ValidNCName(str);
        }
    } // isXML11ValidQName(String):boolean

} // class XML11Char