1 /*
2  * LZMA2InputStream
3  *
4  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5  *          Igor Pavlov <http://7-zip.org/>
6  *
7  * This file has been put into the public domain.
8  * You can do whatever you want with this file.
9  */
10 
11 package org.tukaani.xz;
12 
13 import java.io.InputStream;
14 import java.io.DataInputStream;
15 import java.io.IOException;
16 import org.tukaani.xz.lz.LZDecoder;
17 import org.tukaani.xz.rangecoder.RangeDecoderFromBuffer;
18 import org.tukaani.xz.lzma.LZMADecoder;
19 
20 /**
21  * Decompresses a raw LZMA2 stream (no XZ headers).
22  */
23 public class LZMA2InputStream extends InputStream {
24     /**
25      * Smallest valid LZMA2 dictionary size.
26      * <p>
27      * Very tiny dictionaries would be a performance problem, so
28      * the minimum is 4 KiB.
29      */
30     public static final int DICT_SIZE_MIN = 4096;
31 
32     /**
33      * Largest dictionary size supported by this implementation.
34      * <p>
35      * The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
36      * This implementation supports only 16 bytes less than 2 GiB for raw
37      * LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
38      * limitation is due to Java using signed 32-bit integers for array
39      * indexing. The limitation shouldn't matter much in practice since so
40      * huge dictionaries are not normally used.
41      */
42     public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
43 
44     private static final int COMPRESSED_SIZE_MAX = 1 << 16;
45 
46     private final ArrayCache arrayCache;
47     private DataInputStream in;
48 
49     private LZDecoder lz;
50     private RangeDecoderFromBuffer rc;
51     private LZMADecoder lzma;
52 
53     private int uncompressedSize = 0;
54     private boolean isLZMAChunk = false;
55 
56     private boolean needDictReset = true;
57     private boolean needProps = true;
58     private boolean endReached = false;
59 
60     private IOException exception = null;
61 
62     private final byte[] tempBuf = new byte[1];
63 
64     /**
65      * Gets approximate decompressor memory requirements as kibibytes for
66      * the given dictionary size.
67      *
68      * @param       dictSize    LZMA2 dictionary size as bytes, must be
69      *                          in the range [<code>DICT_SIZE_MIN</code>,
70      *                          <code>DICT_SIZE_MAX</code>]
71      *
72      * @return      approximate memory requirements as kibibytes (KiB)
73      */
getMemoryUsage(int dictSize)74     public static int getMemoryUsage(int dictSize) {
75         // The base state is around 30-40 KiB (probabilities etc.),
76         // range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
77         // and LZ decoder needs a dictionary buffer.
78         return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
79     }
80 
getDictSize(int dictSize)81     private static int getDictSize(int dictSize) {
82         if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX)
83             throw new IllegalArgumentException(
84                     "Unsupported dictionary size " + dictSize);
85 
86         // Round dictionary size upward to a multiple of 16. This way LZMA
87         // can use LZDecoder.getPos() for calculating LZMA's posMask.
88         // Note that this check is needed only for raw LZMA2 streams; it is
89         // redundant with .xz.
90         return (dictSize + 15) & ~15;
91     }
92 
93     /**
94      * Creates a new input stream that decompresses raw LZMA2 data
95      * from <code>in</code>.
96      * <p>
97      * The caller needs to know the dictionary size used when compressing;
98      * the dictionary size isn't stored as part of a raw LZMA2 stream.
99      * <p>
100      * Specifying a too small dictionary size will prevent decompressing
101      * the stream. Specifying a too big dictionary is waste of memory but
102      * decompression will work.
103      * <p>
104      * There is no need to specify a dictionary bigger than
105      * the uncompressed size of the data even if a bigger dictionary
106      * was used when compressing. If you know the uncompressed size
107      * of the data, this might allow saving some memory.
108      *
109      * @param       in          input stream from which LZMA2-compressed
110      *                          data is read
111      *
112      * @param       dictSize    LZMA2 dictionary size as bytes, must be
113      *                          in the range [<code>DICT_SIZE_MIN</code>,
114      *                          <code>DICT_SIZE_MAX</code>]
115      */
LZMA2InputStream(InputStream in, int dictSize)116     public LZMA2InputStream(InputStream in, int dictSize) {
117         this(in, dictSize, null);
118     }
119 
120     /**
121      * Creates a new LZMA2 decompressor using a preset dictionary.
122      * <p>
123      * This is like <code>LZMA2InputStream(InputStream, int)</code> except
124      * that the dictionary may be initialized using a preset dictionary.
125      * If a preset dictionary was used when compressing the data, the
126      * same preset dictionary must be provided when decompressing.
127      *
128      * @param       in          input stream from which LZMA2-compressed
129      *                          data is read
130      *
131      * @param       dictSize    LZMA2 dictionary size as bytes, must be
132      *                          in the range [<code>DICT_SIZE_MIN</code>,
133      *                          <code>DICT_SIZE_MAX</code>]
134      *
135      * @param       presetDict  preset dictionary or <code>null</code>
136      *                          to use no preset dictionary
137      */
LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict)138     public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
139         this(in, dictSize, presetDict, ArrayCache.getDefaultCache());
140     }
141 
142     /**
143      * Creates a new LZMA2 decompressor using a preset dictionary
144      * and array cache.
145      * <p>
146      * This is like <code>LZMA2InputStream(InputStream, int, byte[])</code>
147      * except that this also takes the <code>arrayCache</code> argument.
148      *
149      * @param       in          input stream from which LZMA2-compressed
150      *                          data is read
151      *
152      * @param       dictSize    LZMA2 dictionary size as bytes, must be
153      *                          in the range [<code>DICT_SIZE_MIN</code>,
154      *                          <code>DICT_SIZE_MAX</code>]
155      *
156      * @param       presetDict  preset dictionary or <code>null</code>
157      *                          to use no preset dictionary
158      *
159      * @param       arrayCache  cache to be used for allocating large arrays
160      *
161      * @since 1.7
162      */
LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict, ArrayCache arrayCache)163     LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict,
164                      ArrayCache arrayCache) {
165         // Check for null because otherwise null isn't detect
166         // in this constructor.
167         if (in == null)
168             throw new NullPointerException();
169 
170         this.arrayCache = arrayCache;
171         this.in = new DataInputStream(in);
172         this.rc = new RangeDecoderFromBuffer(COMPRESSED_SIZE_MAX, arrayCache);
173         this.lz = new LZDecoder(getDictSize(dictSize), presetDict, arrayCache);
174 
175         if (presetDict != null && presetDict.length > 0)
176             needDictReset = false;
177     }
178 
179     /**
180      * Decompresses the next byte from this input stream.
181      * <p>
182      * Reading lots of data with <code>read()</code> from this input stream
183      * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
184      * if you need to read lots of data one byte at a time.
185      *
186      * @return      the next decompressed byte, or <code>-1</code>
187      *              to indicate the end of the compressed stream
188      *
189      * @throws      CorruptedInputException
190      *
191      * @throws      XZIOException if the stream has been closed
192      *
193      * @throws      EOFException
194      *                          compressed input is truncated or corrupt
195      *
196      * @throws      IOException may be thrown by <code>in</code>
197      */
read()198     public int read() throws IOException {
199         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
200     }
201 
202     /**
203      * Decompresses into an array of bytes.
204      * <p>
205      * If <code>len</code> is zero, no bytes are read and <code>0</code>
206      * is returned. Otherwise this will block until <code>len</code>
207      * bytes have been decompressed, the end of the LZMA2 stream is reached,
208      * or an exception is thrown.
209      *
210      * @param       buf         target buffer for uncompressed data
211      * @param       off         start offset in <code>buf</code>
212      * @param       len         maximum number of uncompressed bytes to read
213      *
214      * @return      number of bytes read, or <code>-1</code> to indicate
215      *              the end of the compressed stream
216      *
217      * @throws      CorruptedInputException
218      *
219      * @throws      XZIOException if the stream has been closed
220      *
221      * @throws      EOFException
222      *                          compressed input is truncated or corrupt
223      *
224      * @throws      IOException may be thrown by <code>in</code>
225      */
read(byte[] buf, int off, int len)226     public int read(byte[] buf, int off, int len) throws IOException {
227         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
228             throw new IndexOutOfBoundsException();
229 
230         if (len == 0)
231             return 0;
232 
233         if (in == null)
234             throw new XZIOException("Stream closed");
235 
236         if (exception != null)
237             throw exception;
238 
239         if (endReached)
240             return -1;
241 
242         try {
243             int size = 0;
244 
245             while (len > 0) {
246                 if (uncompressedSize == 0) {
247                     decodeChunkHeader();
248                     if (endReached)
249                         return size == 0 ? -1 : size;
250                 }
251 
252                 int copySizeMax = Math.min(uncompressedSize, len);
253 
254                 if (!isLZMAChunk) {
255                     lz.copyUncompressed(in, copySizeMax);
256                 } else {
257                     lz.setLimit(copySizeMax);
258                     lzma.decode();
259                 }
260 
261                 int copiedSize = lz.flush(buf, off);
262                 off += copiedSize;
263                 len -= copiedSize;
264                 size += copiedSize;
265                 uncompressedSize -= copiedSize;
266 
267                 if (uncompressedSize == 0)
268                     if (!rc.isFinished() || lz.hasPending())
269                         throw new CorruptedInputException();
270             }
271 
272             return size;
273 
274         } catch (IOException e) {
275             exception = e;
276             throw e;
277         }
278     }
279 
decodeChunkHeader()280     private void decodeChunkHeader() throws IOException {
281         int control = in.readUnsignedByte();
282 
283         if (control == 0x00) {
284             endReached = true;
285             putArraysToCache();
286             return;
287         }
288 
289         if (control >= 0xE0 || control == 0x01) {
290             needProps = true;
291             needDictReset = false;
292             lz.reset();
293         } else if (needDictReset) {
294             throw new CorruptedInputException();
295         }
296 
297         if (control >= 0x80) {
298             isLZMAChunk = true;
299 
300             uncompressedSize = (control & 0x1F) << 16;
301             uncompressedSize += in.readUnsignedShort() + 1;
302 
303             int compressedSize = in.readUnsignedShort() + 1;
304 
305             if (control >= 0xC0) {
306                 needProps = false;
307                 decodeProps();
308 
309             } else if (needProps) {
310                 throw new CorruptedInputException();
311 
312             } else if (control >= 0xA0) {
313                 lzma.reset();
314             }
315 
316             rc.prepareInputBuffer(in, compressedSize);
317 
318         } else if (control > 0x02) {
319             throw new CorruptedInputException();
320 
321         } else {
322             isLZMAChunk = false;
323             uncompressedSize = in.readUnsignedShort() + 1;
324         }
325     }
326 
decodeProps()327     private void decodeProps() throws IOException {
328         int props = in.readUnsignedByte();
329 
330         if (props > (4 * 5 + 4) * 9 + 8)
331             throw new CorruptedInputException();
332 
333         int pb = props / (9 * 5);
334         props -= pb * 9 * 5;
335         int lp = props / 9;
336         int lc = props - lp * 9;
337 
338         if (lc + lp > 4)
339             throw new CorruptedInputException();
340 
341         lzma = new LZMADecoder(lz, rc, lc, lp, pb);
342     }
343 
344     /**
345      * Returns the number of uncompressed bytes that can be read
346      * without blocking. The value is returned with an assumption
347      * that the compressed input data will be valid. If the compressed
348      * data is corrupt, <code>CorruptedInputException</code> may get
349      * thrown before the number of bytes claimed to be available have
350      * been read from this input stream.
351      * <p>
352      * In LZMA2InputStream, the return value will be non-zero when the
353      * decompressor is in the middle of an LZMA2 chunk. The return value
354      * will then be the number of uncompressed bytes remaining from that
355      * chunk. The return value can also be non-zero in the middle of
356      * an uncompressed chunk, but then the return value depends also on
357      * the <code>available()</code> method of the underlying InputStream.
358      *
359      * @return      the number of uncompressed bytes that can be read
360      *              without blocking
361      */
available()362     public int available() throws IOException {
363         if (in == null)
364             throw new XZIOException("Stream closed");
365 
366         if (exception != null)
367             throw exception;
368 
369         return isLZMAChunk ? uncompressedSize
370                            : Math.min(uncompressedSize, in.available());
371     }
372 
putArraysToCache()373     private void putArraysToCache() {
374         if (lz != null) {
375             lz.putArraysToCache(arrayCache);
376             lz = null;
377 
378             rc.putArraysToCache(arrayCache);
379             rc = null;
380         }
381     }
382 
383     /**
384      * Closes the stream and calls <code>in.close()</code>.
385      * If the stream was already closed, this does nothing.
386      *
387      * @throws  IOException if thrown by <code>in.close()</code>
388      */
close()389     public void close() throws IOException {
390         if (in != null) {
391             putArraysToCache();
392 
393             try {
394                 in.close();
395             } finally {
396                 in = null;
397             }
398         }
399     }
400 }
401