1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 /* 19 * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $ 20 */ 21 package org.apache.xml.serializer; 22 23 24 /** 25 * Holds information about a given encoding, which is the Java name for the 26 * encoding, the equivalent ISO name. 27 * <p> 28 * An object of this type has two useful methods 29 * <pre> 30 * isInEncoding(char ch); 31 * </pre> 32 * which can be called if the character is not the high one in 33 * a surrogate pair and: 34 * <pre> 35 * isInEncoding(char high, char low); 36 * </pre> 37 * which can be called if the two characters from a high/low surrogate pair. 38 * <p> 39 * An EncodingInfo object is a node in a binary search tree. Such a node 40 * will answer if a character is in the encoding, and do so for a given 41 * range of unicode values (<code>m_first</code> to 42 * <code>m_last</code>). It will handle a certain range of values 43 * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>). 44 * If the unicode point is before that explicit range, that is it 45 * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another EncodingInfo object for The root 46 * of such a tree, m_before. Likewise for values in the range 47 * <code>m_explLast < value <= m_last</code>, but delgating to <code>m_after</code> 48 * <p> 49 * Actually figuring out if a code point is in the encoding is expensive. So the 50 * purpose of this tree is to cache such determinations, and not to build the 51 * entire tree of information at the start, but only build up as much of the 52 * tree as is used during the transformation. 53 * <p> 54 * This Class is not a public API, and should only be used internally within 55 * the serializer. 56 * <p> 57 * This class is not a public API. 58 * @xsl.usage internal 59 */ 60 public final class EncodingInfo extends Object 61 { 62 63 /** 64 * Not all characters in an encoding are in on contiguous group, 65 * however there is a lowest contiguous group starting at '\u0001' 66 * and working up to m_highCharInContiguousGroup. 67 * <p> 68 * This is the char for which chars at or below this value are 69 * definately in the encoding, although for chars 70 * above this point they might be in the encoding. 71 * This exists for performance, especially for ASCII characters 72 * because for ASCII all chars in the range '\u0001' to '\u007F' 73 * are in the encoding. 74 * 75 */ 76 private final char m_highCharInContiguousGroup; 77 78 /** 79 * The ISO encoding name. 80 */ 81 final String name; 82 83 /** 84 * The name used by the Java convertor. 85 */ 86 final String javaName; 87 88 /** 89 * A helper object that we can ask if a 90 * single char, or a surrogate UTF-16 pair 91 * of chars that form a single character, 92 * is in this encoding. 93 */ 94 private InEncoding m_encoding; 95 96 /** 97 * This is not a public API. It returns true if the 98 * char in question is in the encoding. 99 * @param ch the char in question. 100 * <p> 101 * This method is not a public API. 102 * @xsl.usage internal 103 */ isInEncoding(char ch)104 public boolean isInEncoding(char ch) { 105 if (m_encoding == null) { 106 m_encoding = new EncodingImpl(); 107 108 // One could put alternate logic in here to 109 // instantiate another object that implements the 110 // InEncoding interface. For example if the JRE is 1.4 or up 111 // we could have an object that uses JRE 1.4 methods 112 } 113 return m_encoding.isInEncoding(ch); 114 } 115 116 /** 117 * This is not a public API. It returns true if the 118 * character formed by the high/low pair is in the encoding. 119 * @param high a char that the a high char of a high/low surrogate pair. 120 * @param low a char that is the low char of a high/low surrogate pair. 121 * <p> 122 * This method is not a public API. 123 * @xsl.usage internal 124 */ isInEncoding(char high, char low)125 public boolean isInEncoding(char high, char low) { 126 if (m_encoding == null) { 127 m_encoding = new EncodingImpl(); 128 129 // One could put alternate logic in here to 130 // instantiate another object that implements the 131 // InEncoding interface. For example if the JRE is 1.4 or up 132 // we could have an object that uses JRE 1.4 methods 133 } 134 return m_encoding.isInEncoding(high, low); 135 } 136 137 /** 138 * Create an EncodingInfo object based on the ISO name and Java name. 139 * If both parameters are null any character will be considered to 140 * be in the encoding. This is useful for when the serializer is in 141 * temporary output state, and has no assciated encoding. 142 * 143 * @param name reference to the ISO name. 144 * @param javaName reference to the Java encoding name. 145 * @param highChar The char for which characters at or below this value are 146 * definately in the 147 * encoding, although for characters above this point they might be in the encoding. 148 */ EncodingInfo(String name, String javaName, char highChar)149 public EncodingInfo(String name, String javaName, char highChar) 150 { 151 152 this.name = name; 153 this.javaName = javaName; 154 this.m_highCharInContiguousGroup = highChar; 155 } 156 157 158 159 /** 160 * A simple interface to isolate the implementation. 161 * We could also use some new JRE 1.4 methods in another implementation 162 * provided we use reflection with them. 163 * <p> 164 * This interface is not a public API, 165 * and should only be used internally within the serializer. 166 * @xsl.usage internal 167 */ 168 private interface InEncoding { 169 /** 170 * Returns true if the char is in the encoding 171 */ isInEncoding(char ch)172 public boolean isInEncoding(char ch); 173 /** 174 * Returns true if the high/low surrogate pair forms 175 * a character that is in the encoding. 176 */ isInEncoding(char high, char low)177 public boolean isInEncoding(char high, char low); 178 } 179 180 /** 181 * This class implements the 182 */ 183 private class EncodingImpl implements InEncoding { 184 185 186 isInEncoding(char ch1)187 public boolean isInEncoding(char ch1) { 188 final boolean ret; 189 int codePoint = Encodings.toCodePoint(ch1); 190 if (codePoint < m_explFirst) { 191 // The unicode value is before the range 192 // that we explictly manage, so we delegate the answer. 193 194 // If we don't have an m_before object to delegate to, make one. 195 if (m_before == null) 196 m_before = 197 new EncodingImpl( 198 m_encoding, 199 m_first, 200 m_explFirst - 1, 201 codePoint); 202 ret = m_before.isInEncoding(ch1); 203 } else if (m_explLast < codePoint) { 204 // The unicode value is after the range 205 // that we explictly manage, so we delegate the answer. 206 207 // If we don't have an m_after object to delegate to, make one. 208 if (m_after == null) 209 m_after = 210 new EncodingImpl( 211 m_encoding, 212 m_explLast + 1, 213 m_last, 214 codePoint); 215 ret = m_after.isInEncoding(ch1); 216 } else { 217 // The unicode value is in the range we explitly handle 218 final int idx = codePoint - m_explFirst; 219 220 // If we already know the answer, just return it. 221 if (m_alreadyKnown[idx]) 222 ret = m_isInEncoding[idx]; 223 else { 224 // We don't know the answer, so find out, 225 // which may be expensive, then cache the answer 226 ret = inEncoding(ch1, m_encoding); 227 m_alreadyKnown[idx] = true; 228 m_isInEncoding[idx] = ret; 229 } 230 } 231 return ret; 232 } 233 isInEncoding(char high, char low)234 public boolean isInEncoding(char high, char low) { 235 final boolean ret; 236 int codePoint = Encodings.toCodePoint(high,low); 237 if (codePoint < m_explFirst) { 238 // The unicode value is before the range 239 // that we explictly manage, so we delegate the answer. 240 241 // If we don't have an m_before object to delegate to, make one. 242 if (m_before == null) 243 m_before = 244 new EncodingImpl( 245 m_encoding, 246 m_first, 247 m_explFirst - 1, 248 codePoint); 249 ret = m_before.isInEncoding(high,low); 250 } else if (m_explLast < codePoint) { 251 // The unicode value is after the range 252 // that we explictly manage, so we delegate the answer. 253 254 // If we don't have an m_after object to delegate to, make one. 255 if (m_after == null) 256 m_after = 257 new EncodingImpl( 258 m_encoding, 259 m_explLast + 1, 260 m_last, 261 codePoint); 262 ret = m_after.isInEncoding(high,low); 263 } else { 264 // The unicode value is in the range we explitly handle 265 final int idx = codePoint - m_explFirst; 266 267 // If we already know the answer, just return it. 268 if (m_alreadyKnown[idx]) 269 ret = m_isInEncoding[idx]; 270 else { 271 // We don't know the answer, so find out, 272 // which may be expensive, then cache the answer 273 ret = inEncoding(high, low, m_encoding); 274 m_alreadyKnown[idx] = true; 275 m_isInEncoding[idx] = ret; 276 } 277 } 278 return ret; 279 } 280 281 /** 282 * The encoding. 283 */ 284 final private String m_encoding; 285 /** 286 * m_first through m_last is the range of unicode 287 * values that this object will return an answer on. 288 * It may delegate to a similar object with a different 289 * range 290 */ 291 final private int m_first; 292 293 /** 294 * m_explFirst through m_explLast is the range of unicode 295 * value that this object handles explicitly and does not 296 * delegate to a similar object. 297 */ 298 final private int m_explFirst; 299 final private int m_explLast; 300 final private int m_last; 301 302 /** 303 * The object, of the same type as this one, 304 * that handles unicode values in a range before 305 * the range explictly handled by this object, and 306 * to which this object may delegate. 307 */ 308 private InEncoding m_before; 309 /** 310 * The object, of the same type as this one, 311 * that handles unicode values in a range after 312 * the range explictly handled by this object, and 313 * to which this object may delegate. 314 */ 315 private InEncoding m_after; 316 317 /** 318 * The number of unicode values explicitly handled 319 * by a single EncodingInfo object. This value is 320 * tuneable, but is set to 128 because that covers the 321 * entire low range of ASCII type chars within a single 322 * object. 323 */ 324 private static final int RANGE = 128; 325 326 /** 327 * A flag to record if we already know the answer 328 * for the given unicode value. 329 */ 330 final private boolean m_alreadyKnown[] = new boolean[RANGE]; 331 /** 332 * A table holding the answer on whether the given unicode 333 * value is in the encoding. 334 */ 335 final private boolean m_isInEncoding[] = new boolean[RANGE]; 336 EncodingImpl()337 private EncodingImpl() { 338 // This object will answer whether any unicode value 339 // is in the encoding, it handles values 0 through Integer.MAX_VALUE 340 this(javaName, 0, Integer.MAX_VALUE, (char) 0); 341 } 342 EncodingImpl(String encoding, int first, int last, int codePoint)343 private EncodingImpl(String encoding, int first, int last, int codePoint) { 344 // Set the range of unicode values that this object manages 345 // either explicitly or implicitly. 346 m_first = first; 347 m_last = last; 348 349 // Set the range of unicode values that this object 350 // explicitly manages 351 m_explFirst = codePoint; 352 m_explLast = codePoint + (RANGE-1); 353 354 m_encoding = encoding; 355 356 if (javaName != null) 357 { 358 // Some optimization. 359 if (0 <= m_explFirst && m_explFirst <= 127) { 360 // This particular EncodingImpl explicitly handles 361 // characters in the low range. 362 if ("UTF8".equals(javaName) 363 || "UTF-16".equals(javaName) 364 || "ASCII".equals(javaName) 365 || "US-ASCII".equals(javaName) 366 || "Unicode".equals(javaName) 367 || "UNICODE".equals(javaName) 368 || javaName.startsWith("ISO8859")) { 369 370 // Not only does this EncodingImpl object explicitly 371 // handle chracters in the low range, it is 372 // also one that we know something about, without 373 // needing to call inEncoding(char ch, String encoding) 374 // for this low range 375 // 376 // By initializing the table ahead of time 377 // for these low values, we prevent the expensive 378 // inEncoding(char ch, String encoding) 379 // from being called, at least for these common 380 // encodings. 381 for (int unicode = 1; unicode < 127; unicode++) { 382 final int idx = unicode - m_explFirst; 383 if (0 <= idx && idx < RANGE) { 384 m_alreadyKnown[idx] = true; 385 m_isInEncoding[idx] = true; 386 } 387 } 388 } 389 } 390 391 /* A little bit more than optimization. 392 * 393 * We will say that any character is in the encoding if 394 * we don't have an encoding. 395 * This is meaningful when the serializer is being used 396 * in temporary output state, where we are not writing to 397 * the final output tree. It is when writing to the 398 * final output tree that we need to worry about the output 399 * encoding 400 */ 401 if (javaName == null) { 402 for (int idx = 0; idx < m_alreadyKnown.length; idx++) { 403 m_alreadyKnown[idx] = true; 404 m_isInEncoding[idx] = true; 405 } 406 } 407 } 408 } 409 } 410 411 /** 412 * This is heart of the code that determines if a given character 413 * is in the given encoding. This method is probably expensive, 414 * and the answer should be cached. 415 * <p> 416 * This method is not a public API, 417 * and should only be used internally within the serializer. 418 * @param ch the char in question, that is not a high char of 419 * a high/low surrogate pair. 420 * @param encoding the Java name of the enocding. 421 * 422 * @xsl.usage internal 423 * 424 */ inEncoding(char ch, String encoding)425 private static boolean inEncoding(char ch, String encoding) { 426 boolean isInEncoding; 427 try { 428 char cArray[] = new char[1]; 429 cArray[0] = ch; 430 // Construct a String from the char 431 String s = new String(cArray); 432 // Encode the String into a sequence of bytes 433 // using the given, named charset. 434 byte[] bArray = s.getBytes(encoding); 435 isInEncoding = inEncoding(ch, bArray); 436 437 } catch (Exception e) { 438 isInEncoding = false; 439 440 // If for some reason the encoding is null, e.g. 441 // for a temporary result tree, we should just 442 // say that every character is in the encoding. 443 if (encoding == null) 444 isInEncoding = true; 445 } 446 return isInEncoding; 447 } 448 449 /** 450 * This is heart of the code that determines if a given high/low 451 * surrogate pair forms a character that is in the given encoding. 452 * This method is probably expensive, and the answer should be cached. 453 * <p> 454 * This method is not a public API, 455 * and should only be used internally within the serializer. 456 * @param high the high char of 457 * a high/low surrogate pair. 458 * @param low the low char of a high/low surrogate pair. 459 * @param encoding the Java name of the encoding. 460 * 461 * @xsl.usage internal 462 * 463 */ inEncoding(char high, char low, String encoding)464 private static boolean inEncoding(char high, char low, String encoding) { 465 boolean isInEncoding; 466 try { 467 char cArray[] = new char[2]; 468 cArray[0] = high; 469 cArray[1] = low; 470 // Construct a String from the char 471 String s = new String(cArray); 472 // Encode the String into a sequence of bytes 473 // using the given, named charset. 474 byte[] bArray = s.getBytes(encoding); 475 isInEncoding = inEncoding(high,bArray); 476 } catch (Exception e) { 477 isInEncoding = false; 478 } 479 480 return isInEncoding; 481 } 482 483 /** 484 * This method is the core of determining if character 485 * is in the encoding. The method is not foolproof, because 486 * s.getBytes(encoding) has specified behavior only if the 487 * characters are in the specified encoding. However this 488 * method tries it's best. 489 * @param ch the char that was converted using getBytes, or 490 * the first char of a high/low pair that was converted. 491 * @param data the bytes written out by the call to s.getBytes(encoding); 492 * @return true if the character is in the encoding. 493 */ inEncoding(char ch, byte[] data)494 private static boolean inEncoding(char ch, byte[] data) { 495 final boolean isInEncoding; 496 // If the string written out as data is not in the encoding, 497 // the output is not specified according to the documentation 498 // on the String.getBytes(encoding) method, 499 // but we do our best here. 500 if (data==null || data.length == 0) { 501 isInEncoding = false; 502 } 503 else { 504 if (data[0] == 0) 505 isInEncoding = false; 506 else if (data[0] == '?' && ch != '?') 507 isInEncoding = false; 508 /* 509 * else if (isJapanese) { 510 * // isJapanese is really 511 * // ( "EUC-JP".equals(javaName) 512 * // || "EUC_JP".equals(javaName) 513 * // || "SJIS".equals(javaName) ) 514 * 515 * // Work around some bugs in JRE for Japanese 516 * if(data[0] == 0x21) 517 * isInEncoding = false; 518 * else if (ch == 0xA5) 519 * isInEncoding = false; 520 * else 521 * isInEncoding = true; 522 * } 523 */ 524 525 else { 526 // We don't know for sure, but it looks like it is in the encoding 527 isInEncoding = true; 528 } 529 } 530 return isInEncoding; 531 } 532 533 /** 534 * This method exists for performance reasons. 535 * <p> 536 * Except for '\u0000', if a char is less than or equal to the value 537 * returned by this method then it in the encoding. 538 * <p> 539 * The characters in an encoding are not contiguous, however 540 * there is a lowest group of chars starting at '\u0001' upto and 541 * including the char returned by this method that are all in the encoding. 542 * So the char returned by this method essentially defines the lowest 543 * contiguous group. 544 * <p> 545 * chars above the value returned might be in the encoding, but 546 * chars at or below the value returned are definately in the encoding. 547 * <p> 548 * In any case however, the isInEncoding(char) method can be used 549 * regardless of the value of the char returned by this method. 550 * <p> 551 * If the value returned is '\u0000' it means that every character must be tested 552 * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)} 553 * for surrogate pairs. 554 * <p> 555 * This method is not a public API. 556 * @xsl.usage internal 557 */ getHighChar()558 public final char getHighChar() { 559 return m_highCharInContiguousGroup; 560 } 561 562 } 563