1 /*
2  * XZInputStream
3  *
4  * Author: Lasse Collin <lasse.collin@tukaani.org>
5  *
6  * This file has been put into the public domain.
7  * You can do whatever you want with this file.
8  */
9 
10 package org.tukaani.xz;
11 
12 import java.io.InputStream;
13 import java.io.DataInputStream;
14 import java.io.IOException;
15 import java.io.EOFException;
16 import org.tukaani.xz.common.DecoderUtil;
17 
18 /**
19  * Decompresses a .xz file in streamed mode (no seeking).
20  * <p>
21  * Use this to decompress regular standalone .xz files. This reads from
22  * its input stream until the end of the input or until an error occurs.
23  * This supports decompressing concatenated .xz files.
24  *
25  * <h4>Typical use cases</h4>
26  * <p>
27  * Getting an input stream to decompress a .xz file:
28  * <p><blockquote><pre>
29  * InputStream infile = new FileInputStream("foo.xz");
30  * XZInputStream inxz = new XZInputStream(infile);
31  * </pre></blockquote>
32  * <p>
33  * It's important to keep in mind that decompressor memory usage depends
34  * on the settings used to compress the file. The worst-case memory usage
35  * of XZInputStream is currently 1.5&nbsp;GiB. Still, very few files will
36  * require more than about 65&nbsp;MiB because that's how much decompressing
37  * a file created with the highest preset level will need, and only a few
38  * people use settings other than the predefined presets.
39  * <p>
40  * It is possible to specify a memory usage limit for
41  * <code>XZInputStream</code>. If decompression requires more memory than
42  * the specified limit, MemoryLimitException will be thrown when reading
43  * from the stream. For example, the following sets the memory usage limit
44  * to 100&nbsp;MiB:
45  * <p><blockquote><pre>
46  * InputStream infile = new FileInputStream("foo.xz");
47  * XZInputStream inxz = new XZInputStream(infile, 100 * 1024);
48  * </pre></blockquote>
49  *
50  * <h4>When uncompressed size is known beforehand</h4>
51  * <p>
52  * If you are decompressing complete files and your application knows
53  * exactly how much uncompressed data there should be, it is good to try
54  * reading one more byte by calling <code>read()</code> and checking
55  * that it returns <code>-1</code>. This way the decompressor will parse the
56  * file footers and verify the integrity checks, giving the caller more
57  * confidence that the uncompressed data is valid. (This advice seems to
58  * apply to
59  * {@link java.util.zip.GZIPInputStream java.util.zip.GZIPInputStream} too.)
60  *
61  * @see SingleXZInputStream
62  */
63 public class XZInputStream extends InputStream {
64     private final ArrayCache arrayCache;
65 
66     private final int memoryLimit;
67     private InputStream in;
68     private SingleXZInputStream xzIn;
69     private final boolean verifyCheck;
70     private boolean endReached = false;
71     private IOException exception = null;
72 
73     private final byte[] tempBuf = new byte[1];
74 
75     /**
76      * Creates a new XZ decompressor without a memory usage limit.
77      * <p>
78      * This constructor reads and parses the XZ Stream Header (12 bytes)
79      * from <code>in</code>. The header of the first Block is not read
80      * until <code>read</code> is called.
81      *
82      * @param       in          input stream from which XZ-compressed
83      *                          data is read
84      *
85      * @throws      XZFormatException
86      *                          input is not in the XZ format
87      *
88      * @throws      CorruptedInputException
89      *                          XZ header CRC32 doesn't match
90      *
91      * @throws      UnsupportedOptionsException
92      *                          XZ header is valid but specifies options
93      *                          not supported by this implementation
94      *
95      * @throws      EOFException
96      *                          less than 12 bytes of input was available
97      *                          from <code>in</code>
98      *
99      * @throws      IOException may be thrown by <code>in</code>
100      */
XZInputStream(InputStream in)101     public XZInputStream(InputStream in) throws IOException {
102         this(in, -1);
103     }
104 
105     /**
106      * Creates a new XZ decompressor without a memory usage limit.
107      * <p>
108      * This is identical to <code>XZInputStream(InputStream)</code>
109      * except that this takes also the <code>arrayCache</code> argument.
110      *
111      * @param       in          input stream from which XZ-compressed
112      *                          data is read
113      *
114      * @param       arrayCache  cache to be used for allocating large arrays
115      *
116      * @throws      XZFormatException
117      *                          input is not in the XZ format
118      *
119      * @throws      CorruptedInputException
120      *                          XZ header CRC32 doesn't match
121      *
122      * @throws      UnsupportedOptionsException
123      *                          XZ header is valid but specifies options
124      *                          not supported by this implementation
125      *
126      * @throws      EOFException
127      *                          less than 12 bytes of input was available
128      *                          from <code>in</code>
129      *
130      * @throws      IOException may be thrown by <code>in</code>
131      *
132      * @since 1.7
133      */
XZInputStream(InputStream in, ArrayCache arrayCache)134     public XZInputStream(InputStream in, ArrayCache arrayCache)
135             throws IOException {
136         this(in, -1, arrayCache);
137     }
138 
139     /**
140      * Creates a new XZ decompressor with an optional memory usage limit.
141      * <p>
142      * This is identical to <code>XZInputStream(InputStream)</code> except
143      * that this takes also the <code>memoryLimit</code> argument.
144      *
145      * @param       in          input stream from which XZ-compressed
146      *                          data is read
147      *
148      * @param       memoryLimit memory usage limit in kibibytes (KiB)
149      *                          or <code>-1</code> to impose no
150      *                          memory usage limit
151      *
152      * @throws      XZFormatException
153      *                          input is not in the XZ format
154      *
155      * @throws      CorruptedInputException
156      *                          XZ header CRC32 doesn't match
157      *
158      * @throws      UnsupportedOptionsException
159      *                          XZ header is valid but specifies options
160      *                          not supported by this implementation
161      *
162      * @throws      EOFException
163      *                          less than 12 bytes of input was available
164      *                          from <code>in</code>
165      *
166      * @throws      IOException may be thrown by <code>in</code>
167      */
XZInputStream(InputStream in, int memoryLimit)168     public XZInputStream(InputStream in, int memoryLimit) throws IOException {
169         this(in, memoryLimit, true);
170     }
171 
172     /**
173      * Creates a new XZ decompressor with an optional memory usage limit.
174      * <p>
175      * This is identical to <code>XZInputStream(InputStream)</code> except
176      * that this takes also the <code>memoryLimit</code> and
177      * <code>arrayCache</code> arguments.
178      *
179      * @param       in          input stream from which XZ-compressed
180      *                          data is read
181      *
182      * @param       memoryLimit memory usage limit in kibibytes (KiB)
183      *                          or <code>-1</code> to impose no
184      *                          memory usage limit
185      *
186      * @param       arrayCache  cache to be used for allocating large arrays
187      *
188      * @throws      XZFormatException
189      *                          input is not in the XZ format
190      *
191      * @throws      CorruptedInputException
192      *                          XZ header CRC32 doesn't match
193      *
194      * @throws      UnsupportedOptionsException
195      *                          XZ header is valid but specifies options
196      *                          not supported by this implementation
197      *
198      * @throws      EOFException
199      *                          less than 12 bytes of input was available
200      *                          from <code>in</code>
201      *
202      * @throws      IOException may be thrown by <code>in</code>
203      *
204      * @since 1.7
205      */
XZInputStream(InputStream in, int memoryLimit, ArrayCache arrayCache)206     public XZInputStream(InputStream in, int memoryLimit,
207                          ArrayCache arrayCache) throws IOException {
208         this(in, memoryLimit, true, arrayCache);
209     }
210 
211     /**
212      * Creates a new XZ decompressor with an optional memory usage limit
213      * and ability to disable verification of integrity checks.
214      * <p>
215      * This is identical to <code>XZInputStream(InputStream,int)</code> except
216      * that this takes also the <code>verifyCheck</code> argument.
217      * <p>
218      * Note that integrity check verification should almost never be disabled.
219      * Possible reasons to disable integrity check verification:
220      * <ul>
221      *   <li>Trying to recover data from a corrupt .xz file.</li>
222      *   <li>Speeding up decompression. This matters mostly with SHA-256
223      *   or with files that have compressed extremely well. It's recommended
224      *   that integrity checking isn't disabled for performance reasons
225      *   unless the file integrity is verified externally in some other
226      *   way.</li>
227      * </ul>
228      * <p>
229      * <code>verifyCheck</code> only affects the integrity check of
230      * the actual compressed data. The CRC32 fields in the headers
231      * are always verified.
232      *
233      * @param       in          input stream from which XZ-compressed
234      *                          data is read
235      *
236      * @param       memoryLimit memory usage limit in kibibytes (KiB)
237      *                          or <code>-1</code> to impose no
238      *                          memory usage limit
239      *
240      * @param       verifyCheck if <code>true</code>, the integrity checks
241      *                          will be verified; this should almost never
242      *                          be set to <code>false</code>
243      *
244      * @throws      XZFormatException
245      *                          input is not in the XZ format
246      *
247      * @throws      CorruptedInputException
248      *                          XZ header CRC32 doesn't match
249      *
250      * @throws      UnsupportedOptionsException
251      *                          XZ header is valid but specifies options
252      *                          not supported by this implementation
253      *
254      * @throws      EOFException
255      *                          less than 12 bytes of input was available
256      *                          from <code>in</code>
257      *
258      * @throws      IOException may be thrown by <code>in</code>
259      *
260      * @since 1.6
261      */
XZInputStream(InputStream in, int memoryLimit, boolean verifyCheck)262     public XZInputStream(InputStream in, int memoryLimit, boolean verifyCheck)
263             throws IOException {
264         this(in, memoryLimit, verifyCheck, ArrayCache.getDefaultCache());
265     }
266 
267     /**
268      * Creates a new XZ decompressor with an optional memory usage limit
269      * and ability to disable verification of integrity checks.
270      * <p>
271      * This is identical to <code>XZInputStream(InputStream,int,boolean)</code>
272      * except that this takes also the <code>arrayCache</code> argument.
273      *
274      * @param       in          input stream from which XZ-compressed
275      *                          data is read
276      *
277      * @param       memoryLimit memory usage limit in kibibytes (KiB)
278      *                          or <code>-1</code> to impose no
279      *                          memory usage limit
280      *
281      * @param       verifyCheck if <code>true</code>, the integrity checks
282      *                          will be verified; this should almost never
283      *                          be set to <code>false</code>
284      *
285      * @param       arrayCache  cache to be used for allocating large arrays
286      *
287      * @throws      XZFormatException
288      *                          input is not in the XZ format
289      *
290      * @throws      CorruptedInputException
291      *                          XZ header CRC32 doesn't match
292      *
293      * @throws      UnsupportedOptionsException
294      *                          XZ header is valid but specifies options
295      *                          not supported by this implementation
296      *
297      * @throws      EOFException
298      *                          less than 12 bytes of input was available
299      *                          from <code>in</code>
300      *
301      * @throws      IOException may be thrown by <code>in</code>
302      *
303      * @since 1.7
304      */
XZInputStream(InputStream in, int memoryLimit, boolean verifyCheck, ArrayCache arrayCache)305     public XZInputStream(InputStream in, int memoryLimit, boolean verifyCheck,
306                          ArrayCache arrayCache) throws IOException {
307         this.arrayCache = arrayCache;
308         this.in = in;
309         this.memoryLimit = memoryLimit;
310         this.verifyCheck = verifyCheck;
311         this.xzIn = new SingleXZInputStream(in, memoryLimit, verifyCheck,
312                                             arrayCache);
313     }
314 
315     /**
316      * Decompresses the next byte from this input stream.
317      * <p>
318      * Reading lots of data with <code>read()</code> from this input stream
319      * may be inefficient. Wrap it in {@link java.io.BufferedInputStream}
320      * if you need to read lots of data one byte at a time.
321      *
322      * @return      the next decompressed byte, or <code>-1</code>
323      *              to indicate the end of the compressed stream
324      *
325      * @throws      CorruptedInputException
326      * @throws      UnsupportedOptionsException
327      * @throws      MemoryLimitException
328      *
329      * @throws      XZIOException if the stream has been closed
330      *
331      * @throws      EOFException
332      *                          compressed input is truncated or corrupt
333      *
334      * @throws      IOException may be thrown by <code>in</code>
335      */
read()336     public int read() throws IOException {
337         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
338     }
339 
340     /**
341      * Decompresses into an array of bytes.
342      * <p>
343      * If <code>len</code> is zero, no bytes are read and <code>0</code>
344      * is returned. Otherwise this will try to decompress <code>len</code>
345      * bytes of uncompressed data. Less than <code>len</code> bytes may
346      * be read only in the following situations:
347      * <ul>
348      *   <li>The end of the compressed data was reached successfully.</li>
349      *   <li>An error is detected after at least one but less <code>len</code>
350      *       bytes have already been successfully decompressed.
351      *       The next call with non-zero <code>len</code> will immediately
352      *       throw the pending exception.</li>
353      *   <li>An exception is thrown.</li>
354      * </ul>
355      *
356      * @param       buf         target buffer for uncompressed data
357      * @param       off         start offset in <code>buf</code>
358      * @param       len         maximum number of uncompressed bytes to read
359      *
360      * @return      number of bytes read, or <code>-1</code> to indicate
361      *              the end of the compressed stream
362      *
363      * @throws      CorruptedInputException
364      * @throws      UnsupportedOptionsException
365      * @throws      MemoryLimitException
366      *
367      * @throws      XZIOException if the stream has been closed
368      *
369      * @throws      EOFException
370      *                          compressed input is truncated or corrupt
371      *
372      * @throws      IOException may be thrown by <code>in</code>
373      */
read(byte[] buf, int off, int len)374     public int read(byte[] buf, int off, int len) throws IOException {
375         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
376             throw new IndexOutOfBoundsException();
377 
378         if (len == 0)
379             return 0;
380 
381         if (in == null)
382             throw new XZIOException("Stream closed");
383 
384         if (exception != null)
385             throw exception;
386 
387         if (endReached)
388             return -1;
389 
390         int size = 0;
391 
392         try {
393             while (len > 0) {
394                 if (xzIn == null) {
395                     prepareNextStream();
396                     if (endReached)
397                         return size == 0 ? -1 : size;
398                 }
399 
400                 int ret = xzIn.read(buf, off, len);
401 
402                 if (ret > 0) {
403                     size += ret;
404                     off += ret;
405                     len -= ret;
406                 } else if (ret == -1) {
407                     xzIn = null;
408                 }
409             }
410         } catch (IOException e) {
411             exception = e;
412             if (size == 0)
413                 throw e;
414         }
415 
416         return size;
417     }
418 
prepareNextStream()419     private void prepareNextStream() throws IOException {
420         DataInputStream inData = new DataInputStream(in);
421         byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
422 
423         // The size of Stream Padding must be a multiple of four bytes,
424         // all bytes zero.
425         do {
426             // First try to read one byte to see if we have reached the end
427             // of the file.
428             int ret = inData.read(buf, 0, 1);
429             if (ret == -1) {
430                 endReached = true;
431                 return;
432             }
433 
434             // Since we got one byte of input, there must be at least
435             // three more available in a valid file.
436             inData.readFully(buf, 1, 3);
437 
438         } while (buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 0);
439 
440         // Not all bytes are zero. In a valid Stream it indicates the
441         // beginning of the next Stream. Read the rest of the Stream Header
442         // and initialize the XZ decoder.
443         inData.readFully(buf, 4, DecoderUtil.STREAM_HEADER_SIZE - 4);
444 
445         try {
446             xzIn = new SingleXZInputStream(in, memoryLimit, verifyCheck, buf,
447                                            arrayCache);
448         } catch (XZFormatException e) {
449             // Since this isn't the first .xz Stream, it is more
450             // logical to tell that the data is corrupt.
451             throw new CorruptedInputException(
452                     "Garbage after a valid XZ Stream");
453         }
454     }
455 
456     /**
457      * Returns the number of uncompressed bytes that can be read
458      * without blocking. The value is returned with an assumption
459      * that the compressed input data will be valid. If the compressed
460      * data is corrupt, <code>CorruptedInputException</code> may get
461      * thrown before the number of bytes claimed to be available have
462      * been read from this input stream.
463      *
464      * @return      the number of uncompressed bytes that can be read
465      *              without blocking
466      */
available()467     public int available() throws IOException {
468         if (in == null)
469             throw new XZIOException("Stream closed");
470 
471         if (exception != null)
472             throw exception;
473 
474         return xzIn == null ? 0 : xzIn.available();
475     }
476 
477     /**
478      * Closes the stream and calls <code>in.close()</code>.
479      * If the stream was already closed, this does nothing.
480      * <p>
481      * This is equivalent to <code>close(true)</code>.
482      *
483      * @throws  IOException if thrown by <code>in.close()</code>
484      */
close()485     public void close() throws IOException {
486         close(true);
487     }
488 
489     /**
490      * Closes the stream and optionally calls <code>in.close()</code>.
491      * If the stream was already closed, this does nothing.
492      * If <code>close(false)</code> has been called, a further
493      * call of <code>close(true)</code> does nothing (it doesn't call
494      * <code>in.close()</code>).
495      * <p>
496      * If you don't want to close the underlying <code>InputStream</code>,
497      * there is usually no need to worry about closing this stream either;
498      * it's fine to do nothing and let the garbage collector handle it.
499      * However, if you are using {@link ArrayCache}, <code>close(false)</code>
500      * can be useful to put the allocated arrays back to the cache without
501      * closing the underlying <code>InputStream</code>.
502      * <p>
503      * Note that if you successfully reach the end of the stream
504      * (<code>read</code> returns <code>-1</code>), the arrays are
505      * automatically put back to the cache by that <code>read</code> call. In
506      * this situation <code>close(false)</code> is redundant (but harmless).
507      *
508      * @throws  IOException if thrown by <code>in.close()</code>
509      *
510      * @since 1.7
511      */
close(boolean closeInput)512     public void close(boolean closeInput) throws IOException {
513         if (in != null) {
514             if (xzIn != null) {
515                 xzIn.close(false);
516                 xzIn = null;
517             }
518 
519             try {
520                 if (closeInput)
521                     in.close();
522             } finally {
523                 in = null;
524             }
525         }
526     }
527 }
528