1 /*
2  * LZMAInputStream
3  *
4  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5  *          Igor Pavlov <http://7-zip.org/>
6  *
7  * This file has been put into the public domain.
8  * You can do whatever you want with this file.
9  */
10 
11 package org.tukaani.xz;
12 
13 import java.io.InputStream;
14 import java.io.DataInputStream;
15 import java.io.IOException;
16 import org.tukaani.xz.lz.LZDecoder;
17 import org.tukaani.xz.rangecoder.RangeDecoderFromStream;
18 import org.tukaani.xz.lzma.LZMADecoder;
19 
20 /**
21  * Decompresses legacy .lzma files and raw LZMA streams (no .lzma header).
22  * <p>
23  * <b>IMPORTANT:</b> In contrast to other classes in this package, this class
24  * reads data from its input stream one byte at a time. If the input stream
25  * is for example {@link java.io.FileInputStream}, wrapping it into
26  * {@link java.io.BufferedInputStream} tends to improve performance a lot.
27  * This is not automatically done by this class because there may be use
28  * cases where it is desired that this class won't read any bytes past
29  * the end of the LZMA stream.
30  * <p>
31  * Even when using <code>BufferedInputStream</code>, the performance tends
32  * to be worse (maybe 10-20&nbsp;% slower) than with {@link LZMA2InputStream}
33  * or {@link XZInputStream} (when the .xz file contains LZMA2-compressed data).
34  *
35  * @since 1.4
36  */
37 public class LZMAInputStream extends InputStream {
38     /**
39      * Largest dictionary size supported by this implementation.
40      * <p>
41      * LZMA allows dictionaries up to one byte less than 4 GiB. This
42      * implementation supports only 16 bytes less than 2 GiB. This
43      * limitation is due to Java using signed 32-bit integers for array
44      * indexing. The limitation shouldn't matter much in practice since so
45      * huge dictionaries are not normally used.
46      */
47     public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
48 
49     private InputStream in;
50     private LZDecoder lz;
51     private RangeDecoderFromStream rc;
52     private LZMADecoder lzma;
53 
54     private boolean endReached = false;
55 
56     private final byte[] tempBuf = new byte[1];
57 
58     /**
59      * Number of uncompressed bytes left to be decompressed, or -1 if
60      * the end marker is used.
61      */
62     private long remainingSize;
63 
64     private IOException exception = null;
65 
66     /**
67      * Gets approximate decompressor memory requirements as kibibytes for
68      * the given dictionary size and LZMA properties byte (lc, lp, and pb).
69      *
70      * @param       dictSize    LZMA dictionary size as bytes, should be
71      *                          in the range [<code>0</code>,
72      *                          <code>DICT_SIZE_MAX</code>]
73      *
74      * @param       propsByte   LZMA properties byte that encodes the values
75      *                          of lc, lp, and pb
76      *
77      * @return      approximate memory requirements as kibibytes (KiB)
78      *
79      * @throws      UnsupportedOptionsException
80      *                          if <code>dictSize</code> is outside
81      *                          the range [<code>0</code>,
82      *                          <code>DICT_SIZE_MAX</code>]
83      *
84      * @throws      CorruptedInputException
85      *                          if <code>propsByte</code> is invalid
86      */
getMemoryUsage(int dictSize, byte propsByte)87     public static int getMemoryUsage(int dictSize, byte propsByte)
88             throws UnsupportedOptionsException, CorruptedInputException {
89         if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
90             throw new UnsupportedOptionsException(
91                     "LZMA dictionary is too big for this implementation");
92 
93         int props = propsByte & 0xFF;
94         if (props > (4 * 5 + 4) * 9 + 8)
95             throw new CorruptedInputException("Invalid LZMA properties byte");
96 
97         props %= 9 * 5;
98         int lp = props / 9;
99         int lc = props - lp * 9;
100 
101         return getMemoryUsage(dictSize, lc, lp);
102     }
103 
104     /**
105      * Gets approximate decompressor memory requirements as kibibytes for
106      * the given dictionary size, lc, and lp. Note that pb isn't needed.
107      *
108      * @param       dictSize    LZMA dictionary size as bytes, must be
109      *                          in the range [<code>0</code>,
110      *                          <code>DICT_SIZE_MAX</code>]
111      *
112      * @param       lc          number of literal context bits, must be
113      *                          in the range [0, 8]
114      *
115      * @param       lp          number of literal position bits, must be
116      *                          in the range [0, 4]
117      *
118      * @return      approximate memory requirements as kibibytes (KiB)
119      */
getMemoryUsage(int dictSize, int lc, int lp)120     public static int getMemoryUsage(int dictSize, int lc, int lp) {
121         if (lc < 0 || lc > 8 || lp < 0 || lp > 4)
122             throw new IllegalArgumentException("Invalid lc or lp");
123 
124         // Probability variables have the type "short". There are
125         // 0x300 (768) probability variables in each literal subcoder.
126         // The number of literal subcoders is 2^(lc + lp).
127         //
128         // Roughly 10 KiB for the base state + LZ decoder's dictionary buffer
129         // + sizeof(short) * number probability variables per literal subcoder
130         //   * number of literal subcoders
131         return 10 + getDictSize(dictSize) / 1024
132                + ((2 * 0x300) << (lc + lp)) / 1024;
133     }
134 
getDictSize(int dictSize)135     private static int getDictSize(int dictSize) {
136         if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
137             throw new IllegalArgumentException(
138                     "LZMA dictionary is too big for this implementation");
139 
140         // For performance reasons, use a 4 KiB dictionary if something
141         // smaller was requested. It's a rare situation and the performance
142         // difference isn't huge, and it starts to matter mostly when the
143         // dictionary is just a few bytes. But we need to handle the special
144         // case of dictSize == 0 anyway, which is an allowed value but in
145         // practice means one-byte dictionary.
146         //
147         // Note that using a dictionary bigger than specified in the headers
148         // can hide errors if there is a reference to data beyond the original
149         // dictionary size but is still within 4 KiB.
150         if (dictSize < 4096)
151             dictSize = 4096;
152 
153         // Round dictionary size upward to a multiple of 16. This way LZMA
154         // can use LZDecoder.getPos() for calculating LZMA's posMask.
155         return (dictSize + 15) & ~15;
156     }
157 
158     /**
159      * Creates a new .lzma file format decompressor without
160      * a memory usage limit.
161      *
162      * @param       in          input stream from which .lzma data is read;
163      *                          it might be a good idea to wrap it in
164      *                          <code>BufferedInputStream</code>, see the
165      *                          note at the top of this page
166      *
167      * @throws      CorruptedInputException
168      *                          file is corrupt or perhaps not in
169      *                          the .lzma format at all
170      *
171      * @throws      UnsupportedOptionsException
172      *                          dictionary size or uncompressed size is too
173      *                          big for this implementation
174      *
175      * @throws      EOFException
176      *                          file is truncated or perhaps not in
177      *                          the .lzma format at all
178      *
179      * @throws      IOException may be thrown by <code>in</code>
180      */
LZMAInputStream(InputStream in)181     public LZMAInputStream(InputStream in) throws IOException {
182         this(in, -1);
183     }
184 
185     /**
186      * Creates a new .lzma file format decompressor with an optional
187      * memory usage limit.
188      *
189      * @param       in          input stream from which .lzma data is read;
190      *                          it might be a good idea to wrap it in
191      *                          <code>BufferedInputStream</code>, see the
192      *                          note at the top of this page
193      *
194      * @param       memoryLimit memory usage limit in kibibytes (KiB)
195      *                          or <code>-1</code> to impose no
196      *                          memory usage limit
197      *
198      * @throws      CorruptedInputException
199      *                          file is corrupt or perhaps not in
200      *                          the .lzma format at all
201      *
202      * @throws      UnsupportedOptionsException
203      *                          dictionary size or uncompressed size is too
204      *                          big for this implementation
205      *
206      * @throws      MemoryLimitException
207      *                          memory usage limit was exceeded
208      *
209      * @throws      EOFException
210      *                          file is truncated or perhaps not in
211      *                          the .lzma format at all
212      *
213      * @throws      IOException may be thrown by <code>in</code>
214      */
LZMAInputStream(InputStream in, int memoryLimit)215     public LZMAInputStream(InputStream in, int memoryLimit)
216             throws IOException {
217         DataInputStream inData = new DataInputStream(in);
218 
219         // Properties byte (lc, lp, and pb)
220         byte propsByte = inData.readByte();
221 
222         // Dictionary size is an unsigned 32-bit little endian integer.
223         int dictSize = 0;
224         for (int i = 0; i < 4; ++i)
225             dictSize |= inData.readUnsignedByte() << (8 * i);
226 
227         // Uncompressed size is an unsigned 64-bit little endian integer.
228         // The maximum 64-bit value is a special case (becomes -1 here)
229         // which indicates that the end marker is used instead of knowing
230         // the uncompressed size beforehand.
231         long uncompSize = 0;
232         for (int i = 0; i < 8; ++i)
233             uncompSize |= (long)inData.readUnsignedByte() << (8 * i);
234 
235         // Check the memory usage limit.
236         int memoryNeeded = getMemoryUsage(dictSize, propsByte);
237         if (memoryLimit != -1 && memoryNeeded > memoryLimit)
238             throw new MemoryLimitException(memoryNeeded, memoryLimit);
239 
240         initialize(in, uncompSize, propsByte, dictSize, null);
241     }
242 
243     /**
244      * Creates a new input stream that decompresses raw LZMA data (no .lzma
245      * header) from <code>in</code>.
246      * <p>
247      * The caller needs to know if the "end of payload marker (EOPM)" alias
248      * "end of stream marker (EOS marker)" alias "end marker" present.
249      * If the end marker isn't used, the caller must know the exact
250      * uncompressed size of the stream.
251      * <p>
252      * The caller also needs to provide the LZMA properties byte that encodes
253      * the number of literal context bits (lc), literal position bits (lp),
254      * and position bits (pb).
255      * <p>
256      * The dictionary size used when compressing is also needed. Specifying
257      * a too small dictionary size will prevent decompressing the stream.
258      * Specifying a too big dictionary is waste of memory but decompression
259      * will work.
260      * <p>
261      * There is no need to specify a dictionary bigger than
262      * the uncompressed size of the data even if a bigger dictionary
263      * was used when compressing. If you know the uncompressed size
264      * of the data, this might allow saving some memory.
265      *
266      * @param       in          input stream from which compressed
267      *                          data is read
268      *
269      * @param       uncompSize  uncompressed size of the LZMA stream or -1
270      *                          if the end marker is used in the LZMA stream
271      *
272      * @param       propsByte   LZMA properties byte that has the encoded
273      *                          values for literal context bits (lc), literal
274      *                          position bits (lp), and position bits (pb)
275      *
276      * @param       dictSize    dictionary size as bytes, must be in the range
277      *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
278      *
279      * @throws      CorruptedInputException
280      *                          if <code>propsByte</code> is invalid or
281      *                          the first input byte is not 0x00
282      *
283      * @throws      UnsupportedOptionsException
284      *                          dictionary size or uncompressed size is too
285      *                          big for this implementation
286      *
287      *
288      */
LZMAInputStream(InputStream in, long uncompSize, byte propsByte, int dictSize)289     public LZMAInputStream(InputStream in, long uncompSize, byte propsByte,
290                            int dictSize) throws IOException {
291         initialize(in, uncompSize, propsByte, dictSize, null);
292     }
293 
294     /**
295      * Creates a new input stream that decompresses raw LZMA data (no .lzma
296      * header) from <code>in</code> optionally with a preset dictionary.
297      *
298      * @param       in          input stream from which LZMA-compressed
299      *                          data is read
300      *
301      * @param       uncompSize  uncompressed size of the LZMA stream or -1
302      *                          if the end marker is used in the LZMA stream
303      *
304      * @param       propsByte   LZMA properties byte that has the encoded
305      *                          values for literal context bits (lc), literal
306      *                          position bits (lp), and position bits (pb)
307      *
308      * @param       dictSize    dictionary size as bytes, must be in the range
309      *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
310      *
311      * @param       presetDict  preset dictionary or <code>null</code>
312      *                          to use no preset dictionary
313      *
314      * @throws      CorruptedInputException
315      *                          if <code>propsByte</code> is invalid or
316      *                          the first input byte is not 0x00
317      *
318      * @throws      UnsupportedOptionsException
319      *                          dictionary size or uncompressed size is too
320      *                          big for this implementation
321      *
322      * @throws      EOFException file is truncated or corrupt
323      *
324      * @throws      IOException may be thrown by <code>in</code>
325      */
LZMAInputStream(InputStream in, long uncompSize, byte propsByte, int dictSize, byte[] presetDict)326     public LZMAInputStream(InputStream in, long uncompSize, byte propsByte,
327                            int dictSize, byte[] presetDict)
328             throws IOException {
329         initialize(in, uncompSize, propsByte, dictSize, presetDict);
330     }
331 
332     /**
333      * Creates a new input stream that decompresses raw LZMA data (no .lzma
334      * header) from <code>in</code> optionally with a preset dictionary.
335      *
336      * @param       in          input stream from which LZMA-compressed
337      *                          data is read
338      *
339      * @param       uncompSize  uncompressed size of the LZMA stream or -1
340      *                          if the end marker is used in the LZMA stream
341      *
342      * @param       lc          number of literal context bits, must be
343      *                          in the range [0, 8]
344      *
345      * @param       lp          number of literal position bits, must be
346      *                          in the range [0, 4]
347      *
348      * @param       pb          number position bits, must be
349      *                          in the range [0, 4]
350      *
351      * @param       dictSize    dictionary size as bytes, must be in the range
352      *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
353      *
354      * @param       presetDict  preset dictionary or <code>null</code>
355      *                          to use no preset dictionary
356      *
357      * @throws      CorruptedInputException
358      *                          if the first input byte is not 0x00
359      *
360      * @throws      EOFException file is truncated or corrupt
361      *
362      * @throws      IOException may be thrown by <code>in</code>
363      */
LZMAInputStream(InputStream in, long uncompSize, int lc, int lp, int pb, int dictSize, byte[] presetDict)364     public LZMAInputStream(InputStream in, long uncompSize,
365                            int lc, int lp, int pb,
366                            int dictSize, byte[] presetDict)
367             throws IOException {
368         initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict);
369     }
370 
initialize(InputStream in, long uncompSize, byte propsByte, int dictSize, byte[] presetDict)371     private void initialize(InputStream in, long uncompSize, byte propsByte,
372                             int dictSize, byte[] presetDict)
373             throws IOException {
374         // Validate the uncompressed size since the other "initialize" throws
375         // IllegalArgumentException if uncompSize < -1.
376         if (uncompSize < -1)
377             throw new UnsupportedOptionsException(
378                     "Uncompressed size is too big");
379 
380         // Decode the properties byte. In contrast to LZMA2, there is no
381         // limit of lc + lp <= 4.
382         int props = propsByte & 0xFF;
383         if (props > (4 * 5 + 4) * 9 + 8)
384             throw new CorruptedInputException("Invalid LZMA properties byte");
385 
386         int pb = props / (9 * 5);
387         props -= pb * 9 * 5;
388         int lp = props / 9;
389         int lc = props - lp * 9;
390 
391         // Validate the dictionary size since the other "initialize" throws
392         // IllegalArgumentException if dictSize is not supported.
393         if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
394             throw new UnsupportedOptionsException(
395                     "LZMA dictionary is too big for this implementation");
396 
397         initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict);
398     }
399 
initialize(InputStream in, long uncompSize, int lc, int lp, int pb, int dictSize, byte[] presetDict)400     private void initialize(InputStream in, long uncompSize,
401                             int lc, int lp, int pb,
402                             int dictSize, byte[] presetDict)
403             throws IOException {
404         // getDictSize validates dictSize and gives a message in
405         // the exception too, so skip validating dictSize here.
406         if (uncompSize < -1 || lc < 0 || lc > 8 || lp < 0 || lp > 4
407                 || pb < 0 || pb > 4)
408             throw new IllegalArgumentException();
409 
410         this.in = in;
411 
412         // If uncompressed size is known, use it to avoid wasting memory for
413         // a uselessly large dictionary buffer.
414         dictSize = getDictSize(dictSize);
415         if (uncompSize >= 0 && dictSize > uncompSize)
416             dictSize = getDictSize((int)uncompSize);
417 
418         lz = new LZDecoder(getDictSize(dictSize), presetDict);
419         rc = new RangeDecoderFromStream(in);
420         lzma = new LZMADecoder(lz, rc, lc, lp, pb);
421         remainingSize = uncompSize;
422     }
423 
424     /**
425      * Decompresses the next byte from this input stream.
426      * <p>
427      * Reading lots of data with <code>read()</code> from this input stream
428      * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
429      * if you need to read lots of data one byte at a time.
430      *
431      * @return      the next decompressed byte, or <code>-1</code>
432      *              to indicate the end of the compressed stream
433      *
434      * @throws      CorruptedInputException
435      *
436      * @throws      XZIOException if the stream has been closed
437      *
438      * @throws      EOFException
439      *                          compressed input is truncated or corrupt
440      *
441      * @throws      IOException may be thrown by <code>in</code>
442      */
read()443     public int read() throws IOException {
444         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
445     }
446 
447     /**
448      * Decompresses into an array of bytes.
449      * <p>
450      * If <code>len</code> is zero, no bytes are read and <code>0</code>
451      * is returned. Otherwise this will block until <code>len</code>
452      * bytes have been decompressed, the end of the LZMA stream is reached,
453      * or an exception is thrown.
454      *
455      * @param       buf         target buffer for uncompressed data
456      * @param       off         start offset in <code>buf</code>
457      * @param       len         maximum number of uncompressed bytes to read
458      *
459      * @return      number of bytes read, or <code>-1</code> to indicate
460      *              the end of the compressed stream
461      *
462      * @throws      CorruptedInputException
463      *
464      * @throws      XZIOException if the stream has been closed
465      *
466      * @throws      EOFException compressed input is truncated or corrupt
467      *
468      * @throws      IOException may be thrown by <code>in</code>
469      */
read(byte[] buf, int off, int len)470     public int read(byte[] buf, int off, int len) throws IOException {
471         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
472             throw new IndexOutOfBoundsException();
473 
474         if (len == 0)
475             return 0;
476 
477         if (in == null)
478             throw new XZIOException("Stream closed");
479 
480         if (exception != null)
481             throw exception;
482 
483         if (endReached)
484             return -1;
485 
486         try {
487             int size = 0;
488 
489             while (len > 0) {
490                 // If uncompressed size is known and thus no end marker will
491                 // be present, set the limit so that the uncompressed size
492                 // won't be exceeded.
493                 int copySizeMax = len;
494                 if (remainingSize >= 0 && remainingSize < len)
495                     copySizeMax = (int)remainingSize;
496 
497                 lz.setLimit(copySizeMax);
498 
499                 // Decode into the dictionary buffer.
500                 try {
501                     lzma.decode();
502                 } catch (CorruptedInputException e) {
503                     // The end marker is encoded with a LZMA symbol that
504                     // indicates maximum match distance. This is larger
505                     // than any supported dictionary and thus causes
506                     // CorruptedInputException from LZDecoder.repeat.
507                     if (remainingSize != -1 || !lzma.endMarkerDetected())
508                         throw e;
509 
510                     endReached = true;
511 
512                     // The exception makes lzma.decode() miss the last range
513                     // decoder normalization, so do it here. This might
514                     // cause an IOException if it needs to read a byte
515                     // from the input stream.
516                     rc.normalize();
517                 }
518 
519                 // Copy from the dictionary to buf.
520                 int copiedSize = lz.flush(buf, off);
521                 off += copiedSize;
522                 len -= copiedSize;
523                 size += copiedSize;
524 
525                 if (remainingSize >= 0) {
526                     // Update the number of bytes left to be decompressed.
527                     remainingSize -= copiedSize;
528                     assert remainingSize >= 0;
529 
530                     if (remainingSize == 0)
531                         endReached = true;
532                 }
533 
534                 if (endReached) {
535                     // Checking these helps a lot when catching corrupt
536                     // or truncated .lzma files. LZMA Utils doesn't do
537                     // the first check and thus it accepts many invalid
538                     // files that this implementation and XZ Utils don't.
539                     if (!rc.isFinished() || lz.hasPending())
540                         throw new CorruptedInputException();
541 
542                     return size == 0 ? -1 : size;
543                 }
544             }
545 
546             return size;
547 
548         } catch (IOException e) {
549             exception = e;
550             throw e;
551         }
552     }
553 
554     /**
555      * Closes the stream and calls <code>in.close()</code>.
556      * If the stream was already closed, this does nothing.
557      *
558      * @throws  IOException if thrown by <code>in.close()</code>
559      */
close()560     public void close() throws IOException {
561         if (in != null) {
562             try {
563                 in.close();
564             } finally {
565                 in = null;
566             }
567         }
568     }
569 }
570