1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the  "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 /*
19  * $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
20  */
21 package org.apache.xml.serializer;
22 
23 
24 /**
25  * Holds information about a given encoding, namely the Java name for the
26  * encoding and the equivalent ISO name.
27  * <p>
28  * An object of this type has two useful methods
29  * <pre>
30  * isInEncoding(char ch);
31  * </pre>
32  * which can be called if the character is not the high one in
33  * a surrogate pair and:
34  * <pre>
35  * isInEncoding(char high, char low);
36  * </pre>
37  * which can be called if the two characters form a high/low surrogate pair.
38  * <p>
39  * An EncodingInfo object is a node in a binary search tree. Such a node
40  * will answer if a character is in the encoding, and do so for a given
41  * range of unicode values (<code>m_first</code> to
42  * <code>m_last</code>). It will handle a certain range of values
43  * explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
44  * If the unicode point is before that explicit range, that is it
45  * is in the range <code>m_first <= value < m_explFirst</code>, then it will delegate to another such object, the root
46  * of a subtree, <code>m_before</code>.  Likewise for values in the range
47  * <code>m_explLast < value <= m_last</code>, but delegating to <code>m_after</code>.
48  * <p>
49  * Actually figuring out if a code point is in the encoding is expensive. So the
50  * purpose of this tree is to cache such determinations, and not to build the
51  * entire tree of information at the start, but only build up as much of the
52  * tree as is used during the transformation.
53  * <p>
54  * This Class is not a public API, and should only be used internally within
55  * the serializer.
56  * <p>
57  * This class is not a public API.
58  * @xsl.usage internal
59  */
60 public final class EncodingInfo extends Object
61 {
62 
63     /**
64      * Not all characters in an encoding are in on contiguous group,
65      * however there is a lowest contiguous group starting at '\u0001'
66      * and working up to m_highCharInContiguousGroup.
67      * <p>
68      * This is the char for which chars at or below this value are
69      * definately in the encoding, although for chars
70      * above this point they might be in the encoding.
71      * This exists for performance, especially for ASCII characters
72      * because for ASCII all chars in the range '\u0001' to '\u007F'
73      * are in the encoding.
74      *
75      */
76     private final char m_highCharInContiguousGroup;
77 
78     /**
79      * The ISO encoding name.
80      */
81     final String name;
82 
83     /**
84      * The name used by the Java convertor.
85      */
86     final String javaName;
87 
88     /**
89      * A helper object that we can ask if a
90      * single char, or a surrogate UTF-16 pair
91      * of chars that form a single character,
92      * is in this encoding.
93      */
94     private InEncoding m_encoding;
95 
96     /**
97      * This is not a public API. It returns true if the
98      * char in question is in the encoding.
99      * @param ch the char in question.
100      * <p>
101      * This method is not a public API.
102      * @xsl.usage internal
103      */
isInEncoding(char ch)104     public boolean isInEncoding(char ch) {
105         if (m_encoding == null) {
106             m_encoding = new EncodingImpl();
107 
108             // One could put alternate logic in here to
109             // instantiate another object that implements the
110             // InEncoding interface. For example if the JRE is 1.4 or up
111             // we could have an object that uses JRE 1.4 methods
112         }
113         return m_encoding.isInEncoding(ch);
114     }
115 
116     /**
117      * This is not a public API. It returns true if the
118      * character formed by the high/low pair is in the encoding.
119      * @param high a char that the a high char of a high/low surrogate pair.
120      * @param low a char that is the low char of a high/low surrogate pair.
121      * <p>
122      * This method is not a public API.
123      * @xsl.usage internal
124      */
isInEncoding(char high, char low)125     public boolean isInEncoding(char high, char low) {
126         if (m_encoding == null) {
127             m_encoding = new EncodingImpl();
128 
129             // One could put alternate logic in here to
130             // instantiate another object that implements the
131             // InEncoding interface. For example if the JRE is 1.4 or up
132             // we could have an object that uses JRE 1.4 methods
133         }
134         return m_encoding.isInEncoding(high, low);
135     }
136 
137     /**
138      * Create an EncodingInfo object based on the ISO name and Java name.
139      * If both parameters are null any character will be considered to
140      * be in the encoding. This is useful for when the serializer is in
141      * temporary output state, and has no assciated encoding.
142      *
143      * @param name reference to the ISO name.
144      * @param javaName reference to the Java encoding name.
145      * @param highChar The char for which characters at or below this value are
146      * definately in the
147      * encoding, although for characters above this point they might be in the encoding.
148      */
EncodingInfo(String name, String javaName, char highChar)149     public EncodingInfo(String name, String javaName, char highChar)
150     {
151 
152         this.name = name;
153         this.javaName = javaName;
154         this.m_highCharInContiguousGroup = highChar;
155     }
156 
157 
158 
159     /**
160      * A simple interface to isolate the implementation.
161      * We could also use some new JRE 1.4 methods in another implementation
162      * provided we use reflection with them.
163      * <p>
164      * This interface is not a public API,
165      * and should only be used internally within the serializer.
166      * @xsl.usage internal
167      */
168     private interface InEncoding {
169         /**
170          * Returns true if the char is in the encoding
171          */
isInEncoding(char ch)172         public boolean isInEncoding(char ch);
173         /**
174          * Returns true if the high/low surrogate pair forms
175          * a character that is in the encoding.
176          */
isInEncoding(char high, char low)177         public boolean isInEncoding(char high, char low);
178     }
179 
180     /**
181      * This class implements the
182      */
183     private class EncodingImpl implements InEncoding {
184 
185 
186 
isInEncoding(char ch1)187         public boolean isInEncoding(char ch1) {
188             final boolean ret;
189             int codePoint = Encodings.toCodePoint(ch1);
190             if (codePoint < m_explFirst) {
191                 // The unicode value is before the range
192                 // that we explictly manage, so we delegate the answer.
193 
194                 // If we don't have an m_before object to delegate to, make one.
195                 if (m_before == null)
196                     m_before =
197                         new EncodingImpl(
198                             m_encoding,
199                             m_first,
200                             m_explFirst - 1,
201                             codePoint);
202                 ret = m_before.isInEncoding(ch1);
203             } else if (m_explLast < codePoint) {
204                 // The unicode value is after the range
205                 // that we explictly manage, so we delegate the answer.
206 
207                 // If we don't have an m_after object to delegate to, make one.
208                 if (m_after == null)
209                     m_after =
210                         new EncodingImpl(
211                             m_encoding,
212                             m_explLast + 1,
213                             m_last,
214                             codePoint);
215                 ret = m_after.isInEncoding(ch1);
216             } else {
217                 // The unicode value is in the range we explitly handle
218                 final int idx = codePoint - m_explFirst;
219 
220                 // If we already know the answer, just return it.
221                 if (m_alreadyKnown[idx])
222                     ret = m_isInEncoding[idx];
223                 else {
224                     // We don't know the answer, so find out,
225                     // which may be expensive, then cache the answer
226                     ret = inEncoding(ch1, m_encoding);
227                     m_alreadyKnown[idx] = true;
228                     m_isInEncoding[idx] = ret;
229                 }
230             }
231             return ret;
232         }
233 
isInEncoding(char high, char low)234         public boolean isInEncoding(char high, char low) {
235             final boolean ret;
236             int codePoint = Encodings.toCodePoint(high,low);
237             if (codePoint < m_explFirst) {
238                 // The unicode value is before the range
239                 // that we explictly manage, so we delegate the answer.
240 
241                 // If we don't have an m_before object to delegate to, make one.
242                 if (m_before == null)
243                     m_before =
244                         new EncodingImpl(
245                             m_encoding,
246                             m_first,
247                             m_explFirst - 1,
248                             codePoint);
249                 ret = m_before.isInEncoding(high,low);
250             } else if (m_explLast < codePoint) {
251                 // The unicode value is after the range
252                 // that we explictly manage, so we delegate the answer.
253 
254                 // If we don't have an m_after object to delegate to, make one.
255                 if (m_after == null)
256                     m_after =
257                         new EncodingImpl(
258                             m_encoding,
259                             m_explLast + 1,
260                             m_last,
261                             codePoint);
262                 ret = m_after.isInEncoding(high,low);
263             } else {
264                 // The unicode value is in the range we explitly handle
265                 final int idx = codePoint - m_explFirst;
266 
267                 // If we already know the answer, just return it.
268                 if (m_alreadyKnown[idx])
269                     ret = m_isInEncoding[idx];
270                 else {
271                     // We don't know the answer, so find out,
272                     // which may be expensive, then cache the answer
273                     ret = inEncoding(high, low, m_encoding);
274                     m_alreadyKnown[idx] = true;
275                     m_isInEncoding[idx] = ret;
276                 }
277             }
278             return ret;
279         }
280 
281         /**
282          * The encoding.
283          */
284         final private String m_encoding;
285         /**
286          * m_first through m_last is the range of unicode
287          * values that this object will return an answer on.
288          * It may delegate to a similar object with a different
289          * range
290          */
291         final private int m_first;
292 
293         /**
294          * m_explFirst through m_explLast is the range of unicode
295          * value that this object handles explicitly and does not
296          * delegate to a similar object.
297          */
298         final private int m_explFirst;
299         final private int m_explLast;
300         final private int m_last;
301 
302         /**
303          * The object, of the same type as this one,
304          * that handles unicode values in a range before
305          * the range explictly handled by this object, and
306          * to which this object may delegate.
307          */
308         private InEncoding m_before;
309         /**
310          * The object, of the same type as this one,
311          * that handles unicode values in a range after
312          * the range explictly handled by this object, and
313          * to which this object may delegate.
314          */
315         private InEncoding m_after;
316 
317         /**
318          * The number of unicode values explicitly handled
319          * by a single EncodingInfo object. This value is
320          * tuneable, but is set to 128 because that covers the
321          * entire low range of ASCII type chars within a single
322          * object.
323          */
324         private static final int RANGE = 128;
325 
326         /**
327          * A flag to record if we already know the answer
328          * for the given unicode value.
329          */
330         final private boolean m_alreadyKnown[] = new boolean[RANGE];
331         /**
332          * A table holding the answer on whether the given unicode
333          * value is in the encoding.
334          */
335         final private boolean m_isInEncoding[] = new boolean[RANGE];
336 
EncodingImpl()337         private EncodingImpl() {
338             // This object will answer whether any unicode value
339             // is in the encoding, it handles values 0 through Integer.MAX_VALUE
340             this(javaName, 0, Integer.MAX_VALUE, (char) 0);
341         }
342 
EncodingImpl(String encoding, int first, int last, int codePoint)343         private EncodingImpl(String encoding, int first, int last, int codePoint) {
344             // Set the range of unicode values that this object manages
345             // either explicitly or implicitly.
346             m_first = first;
347             m_last = last;
348 
349             // Set the range of unicode values that this object
350             // explicitly manages
351             m_explFirst = codePoint;
352             m_explLast = codePoint + (RANGE-1);
353 
354             m_encoding = encoding;
355 
356             if (javaName != null)
357             {
358                 // Some optimization.
359                 if (0 <= m_explFirst && m_explFirst <= 127) {
360                     // This particular EncodingImpl explicitly handles
361                     // characters in the low range.
362                     if ("UTF8".equals(javaName)
363                         || "UTF-16".equals(javaName)
364                         || "ASCII".equals(javaName)
365                         || "US-ASCII".equals(javaName)
366                         || "Unicode".equals(javaName)
367                         || "UNICODE".equals(javaName)
368                         || javaName.startsWith("ISO8859")) {
369 
370                         // Not only does this EncodingImpl object explicitly
371                         // handle chracters in the low range, it is
372                         // also one that we know something about, without
373                         // needing to call inEncoding(char ch, String encoding)
374                         // for this low range
375                         //
376                         // By initializing the table ahead of time
377                         // for these low values, we prevent the expensive
378                         // inEncoding(char ch, String encoding)
379                         // from being called, at least for these common
380                         // encodings.
381                         for (int unicode = 1; unicode < 127; unicode++) {
382                             final int idx = unicode - m_explFirst;
383                             if (0 <= idx && idx < RANGE) {
384                                 m_alreadyKnown[idx] = true;
385                                 m_isInEncoding[idx] = true;
386                             }
387                         }
388                     }
389                 }
390 
391                 /* A little bit more than optimization.
392                  *
393                  * We will say that any character is in the encoding if
394                  * we don't have an encoding.
395                  * This is meaningful when the serializer is being used
396                  * in temporary output state, where we are not writing to
397                  * the final output tree.  It is when writing to the
398                  * final output tree that we need to worry about the output
399                  * encoding
400                  */
401                 if (javaName == null) {
402                     for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
403                         m_alreadyKnown[idx] = true;
404                         m_isInEncoding[idx] = true;
405                     }
406                 }
407             }
408         }
409     }
410 
411     /**
412      * This is heart of the code that determines if a given character
413      * is in the given encoding. This method is probably expensive,
414      * and the answer should be cached.
415      * <p>
416      * This method is not a public API,
417      * and should only be used internally within the serializer.
418      * @param ch the char in question, that is not a high char of
419      * a high/low surrogate pair.
420      * @param encoding the Java name of the enocding.
421      *
422      * @xsl.usage internal
423      *
424      */
inEncoding(char ch, String encoding)425     private static boolean inEncoding(char ch, String encoding) {
426         boolean isInEncoding;
427         try {
428             char cArray[] = new char[1];
429             cArray[0] = ch;
430             // Construct a String from the char
431             String s = new String(cArray);
432             // Encode the String into a sequence of bytes
433             // using the given, named charset.
434             byte[] bArray = s.getBytes(encoding);
435             isInEncoding = inEncoding(ch, bArray);
436 
437         } catch (Exception e) {
438             isInEncoding = false;
439 
440             // If for some reason the encoding is null, e.g.
441             // for a temporary result tree, we should just
442             // say that every character is in the encoding.
443             if (encoding == null)
444             	isInEncoding = true;
445         }
446         return isInEncoding;
447     }
448 
449     /**
450      * This is heart of the code that determines if a given high/low
451      * surrogate pair forms a character that is in the given encoding.
452      * This method is probably expensive, and the answer should be cached.
453      * <p>
454      * This method is not a public API,
455      * and should only be used internally within the serializer.
456      * @param high the high char of
457      * a high/low surrogate pair.
458      * @param low the low char of a high/low surrogate pair.
459      * @param encoding the Java name of the encoding.
460      *
461      * @xsl.usage internal
462      *
463      */
inEncoding(char high, char low, String encoding)464     private static boolean inEncoding(char high, char low, String encoding) {
465         boolean isInEncoding;
466         try {
467             char cArray[] = new char[2];
468             cArray[0] = high;
469             cArray[1] = low;
470             // Construct a String from the char
471             String s = new String(cArray);
472             // Encode the String into a sequence of bytes
473             // using the given, named charset.
474             byte[] bArray = s.getBytes(encoding);
475             isInEncoding = inEncoding(high,bArray);
476         } catch (Exception e) {
477             isInEncoding = false;
478         }
479 
480         return isInEncoding;
481     }
482 
483     /**
484      * This method is the core of determining if character
485      * is in the encoding. The method is not foolproof, because
486      * s.getBytes(encoding) has specified behavior only if the
487      * characters are in the specified encoding. However this
488      * method tries it's best.
489      * @param ch the char that was converted using getBytes, or
490      * the first char of a high/low pair that was converted.
491      * @param data the bytes written out by the call to s.getBytes(encoding);
492      * @return true if the character is in the encoding.
493      */
inEncoding(char ch, byte[] data)494     private static boolean inEncoding(char ch, byte[] data) {
495         final boolean isInEncoding;
496         // If the string written out as data is not in the encoding,
497         // the output is not specified according to the documentation
498         // on the String.getBytes(encoding) method,
499         // but we do our best here.
500         if (data==null || data.length == 0) {
501             isInEncoding = false;
502         }
503         else {
504             if (data[0] == 0)
505                 isInEncoding = false;
506             else if (data[0] == '?' && ch != '?')
507                 isInEncoding = false;
508             /*
509              * else if (isJapanese) {
510              *   // isJapanese is really
511              *   //   (    "EUC-JP".equals(javaName)
512              *   //    ||  "EUC_JP".equals(javaName)
513              *  //     ||  "SJIS".equals(javaName)   )
514              *
515              *   // Work around some bugs in JRE for Japanese
516              *   if(data[0] == 0x21)
517              *     isInEncoding = false;
518              *   else if (ch == 0xA5)
519              *     isInEncoding = false;
520              *   else
521              *     isInEncoding = true;
522              * }
523              */
524 
525             else {
526                 // We don't know for sure, but it looks like it is in the encoding
527                 isInEncoding = true;
528             }
529         }
530         return isInEncoding;
531     }
532 
533     /**
534      * This method exists for performance reasons.
535      * <p>
536      * Except for '\u0000', if a char is less than or equal to the value
537      * returned by this method then it in the encoding.
538      * <p>
539      * The characters in an encoding are not contiguous, however
540      * there is a lowest group of chars starting at '\u0001' upto and
541      * including the char returned by this method that are all in the encoding.
542      * So the char returned by this method essentially defines the lowest
543      * contiguous group.
544      * <p>
545      * chars above the value returned might be in the encoding, but
546      * chars at or below the value returned are definately in the encoding.
547      * <p>
548      * In any case however, the isInEncoding(char) method can be used
549      * regardless of the value of the char returned by this method.
550      * <p>
551      * If the value returned is '\u0000' it means that every character must be tested
552      * with an isInEncoding method {@link #isInEncoding(char)} or {@link #isInEncoding(char, char)}
553      * for surrogate pairs.
554      * <p>
555      * This method is not a public API.
556      * @xsl.usage internal
557      */
getHighChar()558     public final char getHighChar() {
559         return m_highCharInContiguousGroup;
560     }
561 
562 }
563