1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the  "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 /*
19  * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $
20  */
21 package org.apache.xml.serializer;
22 
23 import java.io.IOException;
24 import java.io.OutputStream;
25 import java.io.UnsupportedEncodingException;
26 import java.io.Writer;
27 
28 
29 /**
30  * This class writes unicode characters to a byte stream (java.io.OutputStream)
31  * as quickly as possible. It buffers the output in an internal
32  * buffer which must be flushed to the OutputStream when done. This flushing
33  * is done via the close() flush() or flushBuffer() method.
34  *
35  * This class is only used internally within Xalan.
36  *
37  * @xsl.usage internal
38  */
39 final class WriterToUTF8Buffered extends Writer implements WriterChain
40 {
41 
42   /** number of bytes that the byte buffer can hold.
43    * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
44    */
45   private static final int BYTES_MAX=16*1024;
46   /** number of characters that the character buffer can hold.
47    * This is 1/3 of the number of bytes because UTF-8 encoding
48    * can expand one unicode character by up to 3 bytes.
49    */
50   private static final int CHARS_MAX=(BYTES_MAX/3);
51 
52  // private static final int
53 
54   /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
55   private final OutputStream m_os;
56 
57   /**
58    * The internal buffer where data is stored.
59    * (sc & sb remove final to compile in JDK 1.1.8)
60    */
61   private final byte m_outputBytes[];
62 
63   private final char m_inputChars[];
64 
65   /**
66    * The number of valid bytes in the buffer. This value is always
67    * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
68    * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
69    * byte data.
70    */
71   private int count;
72 
73   /**
74    * Create an buffered UTF-8 writer.
75    *
76    *
77    * @param   out    the underlying output stream.
78    *
79    * @throws UnsupportedEncodingException
80    */
WriterToUTF8Buffered(OutputStream out)81   public WriterToUTF8Buffered(OutputStream out)
82   {
83       m_os = out;
84       // get 3 extra bytes to make buffer overflow checking simpler and faster
85       // we won't have to keep checking for a few extra characters
86       m_outputBytes = new byte[BYTES_MAX + 3];
87 
88       // Big enough to hold the input chars that will be transformed
89       // into output bytes in m_ouputBytes.
90       m_inputChars = new char[CHARS_MAX + 2];
91       count = 0;
92 
93 //      the old body of this constructor, before the buffersize was changed to a constant
94 //      this(out, 8*1024);
95   }
96 
97   /**
98    * Create an buffered UTF-8 writer to write data to the
99    * specified underlying output stream with the specified buffer
100    * size.
101    *
102    * @param   out    the underlying output stream.
103    * @param   size   the buffer size.
104    * @exception IllegalArgumentException if size <= 0.
105    */
106 //  public WriterToUTF8Buffered(final OutputStream out, final int size)
107 //  {
108 //
109 //    m_os = out;
110 //
111 //    if (size <= 0)
112 //    {
113 //      throw new IllegalArgumentException(
114 //        SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
115 //    }
116 //
117 //    m_outputBytes = new byte[size];
118 //    count = 0;
119 //  }
120 
121   /**
122    * Write a single character.  The character to be written is contained in
123    * the 16 low-order bits of the given integer value; the 16 high-order bits
124    * are ignored.
125    *
126    * <p> Subclasses that intend to support efficient single-character output
127    * should override this method.
128    *
129    * @param c  int specifying a character to be written.
130    * @exception  IOException  If an I/O error occurs
131    */
write(final int c)132   public void write(final int c) throws IOException
133   {
134 
135     /* If we are close to the end of the buffer then flush it.
136      * Remember the buffer can hold a few more bytes than BYTES_MAX
137      */
138     if (count >= BYTES_MAX)
139         flushBuffer();
140 
141     if (c < 0x80)
142     {
143        m_outputBytes[count++] = (byte) (c);
144     }
145     else if (c < 0x800)
146     {
147       m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
148       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
149     }
150     else if (c < 0x10000)
151     {
152       m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
153       m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
154       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
155     }
156 	else
157 	{
158 	  m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
159 	  m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
160 	  m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
161 	  m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
162 	}
163 
164   }
165 
166 
167   /**
168    * Write a portion of an array of characters.
169    *
170    * @param  chars  Array of characters
171    * @param  start   Offset from which to start writing characters
172    * @param  length   Number of characters to write
173    *
174    * @exception  IOException  If an I/O error occurs
175    *
176    * @throws java.io.IOException
177    */
write(final char chars[], final int start, final int length)178   public void write(final char chars[], final int start, final int length)
179           throws java.io.IOException
180   {
181 
182     // We multiply the length by three since this is the maximum length
183     // of the characters that we can put into the buffer.  It is possible
184     // for each Unicode character to expand to three bytes.
185 
186     int lengthx3 = 3*length;
187 
188     if (lengthx3 >= BYTES_MAX - count)
189     {
190       // The requested length is greater than the unused part of the buffer
191       flushBuffer();
192 
193       if (lengthx3 > BYTES_MAX)
194       {
195         /*
196          * The requested length exceeds the size of the buffer.
197          * Cut the buffer up into chunks, each of which will
198          * not cause an overflow to the output buffer m_outputBytes,
199          * and make multiple recursive calls.
200          * Be careful about integer overflows in multiplication.
201          */
202         int split = length/CHARS_MAX;
203         final int chunks;
204         if (length % CHARS_MAX > 0)
205             chunks = split + 1;
206         else
207             chunks = split;
208         int end_chunk = start;
209         for (int chunk = 1; chunk <= chunks; chunk++)
210         {
211             int start_chunk = end_chunk;
212             end_chunk = start + (int) ((((long) length) * chunk) / chunks);
213 
214             // Adjust the end of the chunk if it ends on a high char
215             // of a Unicode surrogate pair and low char of the pair
216             // is not going to be in the same chunk
217             final char c = chars[end_chunk - 1];
218             int ic = chars[end_chunk - 1];
219             if (c >= 0xD800 && c <= 0xDBFF) {
220                 // The last Java char that we were going
221                 // to process is the first of a
222                 // Java surrogate char pair that
223                 // represent a Unicode character.
224 
225                 if (end_chunk < start + length) {
226                     // Avoid spanning by including the low
227                     // char in the current chunk of chars.
228                     end_chunk++;
229                 } else {
230                     /* This is the last char of the last chunk,
231                      * and it is the high char of a high/low pair with
232                      * no low char provided.
233                      * TODO: error message needed.
234                      * The char array incorrectly ends in a high char
235                      * of a high/low surrogate pair, but there is
236                      * no corresponding low as the high is the last char
237                      */
238                     end_chunk--;
239                 }
240             }
241 
242 
243             int len_chunk = (end_chunk - start_chunk);
244             this.write(chars,start_chunk, len_chunk);
245         }
246         return;
247       }
248     }
249 
250 
251 
252     final int n = length+start;
253     final byte[] buf_loc = m_outputBytes; // local reference for faster access
254     int count_loc = count;      // local integer for faster access
255     int i = start;
256     {
257         /* This block could be omitted and the code would produce
258          * the same result. But this block exists to give the JIT
259          * a better chance of optimizing a tight and common loop which
260          * occurs when writing out ASCII characters.
261          */
262         char c;
263         for(; i < n && (c = chars[i])< 0x80 ; i++ )
264             buf_loc[count_loc++] = (byte)c;
265     }
266     for (; i < n; i++)
267     {
268 
269       final char c = chars[i];
270 
271       if (c < 0x80)
272         buf_loc[count_loc++] = (byte) (c);
273       else if (c < 0x800)
274       {
275         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
276         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
277       }
278       /**
279         * The following else if condition is added to support XML 1.1 Characters for
280         * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
281         * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
282         *          [1101 11yy] [yyxx xxxx] (low surrogate)
283         *          * uuuuu = wwww + 1
284         */
285       else if (c >= 0xD800 && c <= 0xDBFF)
286       {
287           char high, low;
288           high = c;
289           i++;
290           low = chars[i];
291 
292           buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
293           buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
294           buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
295           buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
296       }
297       else
298       {
299         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
300         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
301         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
302       }
303     }
304     // Store the local integer back into the instance variable
305     count = count_loc;
306 
307   }
308 
309   /**
310    * Write a string.
311    *
312    * @param  s  String to be written
313    *
314    * @exception  IOException  If an I/O error occurs
315    */
write(final String s)316   public void write(final String s) throws IOException
317   {
318 
319     // We multiply the length by three since this is the maximum length
320     // of the characters that we can put into the buffer.  It is possible
321     // for each Unicode character to expand to three bytes.
322     final int length = s.length();
323     int lengthx3 = 3*length;
324 
325     if (lengthx3 >= BYTES_MAX - count)
326     {
327       // The requested length is greater than the unused part of the buffer
328       flushBuffer();
329 
330       if (lengthx3 > BYTES_MAX)
331       {
332         /*
333          * The requested length exceeds the size of the buffer,
334          * so break it up in chunks that don't exceed the buffer size.
335          */
336          final int start = 0;
337          int split = length/CHARS_MAX;
338          final int chunks;
339          if (length % CHARS_MAX > 0)
340              chunks = split + 1;
341          else
342              chunks = split;
343          int end_chunk = 0;
344          for (int chunk = 1; chunk <= chunks; chunk++)
345          {
346              int start_chunk = end_chunk;
347              end_chunk = start + (int) ((((long) length) * chunk) / chunks);
348              s.getChars(start_chunk,end_chunk, m_inputChars,0);
349              int len_chunk = (end_chunk - start_chunk);
350 
351              // Adjust the end of the chunk if it ends on a high char
352              // of a Unicode surrogate pair and low char of the pair
353              // is not going to be in the same chunk
354              final char c = m_inputChars[len_chunk - 1];
355              if (c >= 0xD800 && c <= 0xDBFF) {
356                  // Exclude char in this chunk,
357                  // to avoid spanning a Unicode character
358                  // that is in two Java chars as a high/low surrogate
359                  end_chunk--;
360                  len_chunk--;
361                  if (chunk == chunks) {
362                      /* TODO: error message needed.
363                       * The String incorrectly ends in a high char
364                       * of a high/low surrogate pair, but there is
365                       * no corresponding low as the high is the last char
366                       * Recover by ignoring this last char.
367                       */
368                  }
369              }
370 
371              this.write(m_inputChars,0, len_chunk);
372          }
373          return;
374       }
375     }
376 
377 
378     s.getChars(0, length , m_inputChars, 0);
379     final char[] chars = m_inputChars;
380     final int n = length;
381     final byte[] buf_loc = m_outputBytes; // local reference for faster access
382     int count_loc = count;      // local integer for faster access
383     int i = 0;
384     {
385         /* This block could be omitted and the code would produce
386          * the same result. But this block exists to give the JIT
387          * a better chance of optimizing a tight and common loop which
388          * occurs when writing out ASCII characters.
389          */
390         char c;
391         for(; i < n && (c = chars[i])< 0x80 ; i++ )
392             buf_loc[count_loc++] = (byte)c;
393     }
394     for (; i < n; i++)
395     {
396 
397       final char c = chars[i];
398 
399       if (c < 0x80)
400         buf_loc[count_loc++] = (byte) (c);
401       else if (c < 0x800)
402       {
403         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
404         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
405       }
406     /**
407       * The following else if condition is added to support XML 1.1 Characters for
408       * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
409       * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
410       *          [1101 11yy] [yyxx xxxx] (low surrogate)
411       *          * uuuuu = wwww + 1
412       */
413     else if (c >= 0xD800 && c <= 0xDBFF)
414     {
415         char high, low;
416         high = c;
417         i++;
418         low = chars[i];
419 
420         buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
421         buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
422         buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
423         buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
424     }
425       else
426       {
427         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
428         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
429         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
430       }
431     }
432     // Store the local integer back into the instance variable
433     count = count_loc;
434 
435   }
436 
437   /**
438    * Flush the internal buffer
439    *
440    * @throws IOException
441    */
flushBuffer()442   public void flushBuffer() throws IOException
443   {
444 
445     if (count > 0)
446     {
447       m_os.write(m_outputBytes, 0, count);
448 
449       count = 0;
450     }
451   }
452 
453   /**
454    * Flush the stream.  If the stream has saved any characters from the
455    * various write() methods in a buffer, write them immediately to their
456    * intended destination.  Then, if that destination is another character or
457    * byte stream, flush it.  Thus one flush() invocation will flush all the
458    * buffers in a chain of Writers and OutputStreams.
459    *
460    * @exception  IOException  If an I/O error occurs
461    *
462    * @throws java.io.IOException
463    */
flush()464   public void flush() throws java.io.IOException
465   {
466     flushBuffer();
467     m_os.flush();
468   }
469 
470   /**
471    * Close the stream, flushing it first.  Once a stream has been closed,
472    * further write() or flush() invocations will cause an IOException to be
473    * thrown.  Closing a previously-closed stream, however, has no effect.
474    *
475    * @exception  IOException  If an I/O error occurs
476    *
477    * @throws java.io.IOException
478    */
close()479   public void close() throws java.io.IOException
480   {
481     flushBuffer();
482     m_os.close();
483   }
484 
485   /**
486    * Get the output stream where the events will be serialized to.
487    *
488    * @return reference to the result stream, or null of only a writer was
489    * set.
490    */
getOutputStream()491   public OutputStream getOutputStream()
492   {
493     return m_os;
494   }
495 
getWriter()496   public Writer getWriter()
497   {
498     // Only one of getWriter() or getOutputStream() can return null
499     // This type of writer wraps an OutputStream, not a Writer.
500     return null;
501   }
502 }
503