1 /*
2  * LZMA2InputStream
3  *
4  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5  *          Igor Pavlov <http://7-zip.org/>
6  *
7  * This file has been put into the public domain.
8  * You can do whatever you want with this file.
9  */
10 
11 package org.tukaani.xz;
12 
13 import java.io.InputStream;
14 import java.io.DataInputStream;
15 import java.io.IOException;
16 import org.tukaani.xz.lz.LZDecoder;
17 import org.tukaani.xz.rangecoder.RangeDecoderFromBuffer;
18 import org.tukaani.xz.lzma.LZMADecoder;
19 
20 /**
21  * Decompresses a raw LZMA2 stream (no XZ headers).
22  */
23 public class LZMA2InputStream extends InputStream {
24     /**
25      * Smallest valid LZMA2 dictionary size.
26      * <p>
27      * Very tiny dictionaries would be a performance problem, so
28      * the minimum is 4 KiB.
29      */
30     public static final int DICT_SIZE_MIN = 4096;
31 
32     /**
33      * Largest dictionary size supported by this implementation.
34      * <p>
35      * The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
36      * This implementation supports only 16 bytes less than 2 GiB for raw
37      * LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
38      * limitation is due to Java using signed 32-bit integers for array
39      * indexing. The limitation shouldn't matter much in practice since so
40      * huge dictionaries are not normally used.
41      */
42     public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;
43 
44     private static final int COMPRESSED_SIZE_MAX = 1 << 16;
45 
46     private DataInputStream in;
47 
48     private final LZDecoder lz;
49     private final RangeDecoderFromBuffer rc
50             = new RangeDecoderFromBuffer(COMPRESSED_SIZE_MAX);
51     private LZMADecoder lzma;
52 
53     private int uncompressedSize = 0;
54     private boolean isLZMAChunk;
55 
56     private boolean needDictReset = true;
57     private boolean needProps = true;
58     private boolean endReached = false;
59 
60     private IOException exception = null;
61 
62     private final byte[] tempBuf = new byte[1];
63 
64     /**
65      * Gets approximate decompressor memory requirements as kibibytes for
66      * the given dictionary size.
67      *
68      * @param       dictSize    LZMA2 dictionary size as bytes, must be
69      *                          in the range [<code>DICT_SIZE_MIN</code>,
70      *                          <code>DICT_SIZE_MAX</code>]
71      *
72      * @return      approximate memory requirements as kibibytes (KiB)
73      */
getMemoryUsage(int dictSize)74     public static int getMemoryUsage(int dictSize) {
75         // The base state is around 30-40 KiB (probabilities etc.),
76         // range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
77         // and LZ decoder needs a dictionary buffer.
78         return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
79     }
80 
getDictSize(int dictSize)81     private static int getDictSize(int dictSize) {
82         if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX)
83             throw new IllegalArgumentException(
84                     "Unsupported dictionary size " + dictSize);
85 
86         // Round dictionary size upward to a multiple of 16. This way LZMA
87         // can use LZDecoder.getPos() for calculating LZMA's posMask.
88         // Note that this check is needed only for raw LZMA2 streams; it is
89         // redundant with .xz.
90         return (dictSize + 15) & ~15;
91     }
92 
93     /**
94      * Creates a new input stream that decompresses raw LZMA2 data
95      * from <code>in</code>.
96      * <p>
97      * The caller needs to know the dictionary size used when compressing;
98      * the dictionary size isn't stored as part of a raw LZMA2 stream.
99      * <p>
100      * Specifying a too small dictionary size will prevent decompressing
101      * the stream. Specifying a too big dictionary is waste of memory but
102      * decompression will work.
103      * <p>
104      * There is no need to specify a dictionary bigger than
105      * the uncompressed size of the data even if a bigger dictionary
106      * was used when compressing. If you know the uncompressed size
107      * of the data, this might allow saving some memory.
108      *
109      * @param       in          input stream from which LZMA2-compressed
110      *                          data is read
111      *
112      * @param       dictSize    LZMA2 dictionary size as bytes, must be
113      *                          in the range [<code>DICT_SIZE_MIN</code>,
114      *                          <code>DICT_SIZE_MAX</code>]
115      */
LZMA2InputStream(InputStream in, int dictSize)116     public LZMA2InputStream(InputStream in, int dictSize) {
117         this(in, dictSize, null);
118     }
119 
120     /**
121      * Creates a new LZMA2 decompressor using a preset dictionary.
122      * <p>
123      * This is like <code>LZMA2InputStream(InputStream, int)</code> except
124      * that the dictionary may be initialized using a preset dictionary.
125      * If a preset dictionary was used when compressing the data, the
126      * same preset dictionary must be provided when decompressing.
127      *
128      * @param       in          input stream from which LZMA2-compressed
129      *                          data is read
130      *
131      * @param       dictSize    LZMA2 dictionary size as bytes, must be
132      *                          in the range [<code>DICT_SIZE_MIN</code>,
133      *                          <code>DICT_SIZE_MAX</code>]
134      *
135      * @param       presetDict  preset dictionary or <code>null</code>
136      *                          to use no preset dictionary
137      */
LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict)138     public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
139         // Check for null because otherwise null isn't detect
140         // in this constructor.
141         if (in == null)
142             throw new NullPointerException();
143 
144         this.in = new DataInputStream(in);
145         this.lz = new LZDecoder(getDictSize(dictSize), presetDict);
146 
147         if (presetDict != null && presetDict.length > 0)
148             needDictReset = false;
149     }
150 
151     /**
152      * Decompresses the next byte from this input stream.
153      * <p>
154      * Reading lots of data with <code>read()</code> from this input stream
155      * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
156      * if you need to read lots of data one byte at a time.
157      *
158      * @return      the next decompressed byte, or <code>-1</code>
159      *              to indicate the end of the compressed stream
160      *
161      * @throws      CorruptedInputException
162      *
163      * @throws      XZIOException if the stream has been closed
164      *
165      * @throws      EOFException
166      *                          compressed input is truncated or corrupt
167      *
168      * @throws      IOException may be thrown by <code>in</code>
169      */
read()170     public int read() throws IOException {
171         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
172     }
173 
174     /**
175      * Decompresses into an array of bytes.
176      * <p>
177      * If <code>len</code> is zero, no bytes are read and <code>0</code>
178      * is returned. Otherwise this will block until <code>len</code>
179      * bytes have been decompressed, the end of the LZMA2 stream is reached,
180      * or an exception is thrown.
181      *
182      * @param       buf         target buffer for uncompressed data
183      * @param       off         start offset in <code>buf</code>
184      * @param       len         maximum number of uncompressed bytes to read
185      *
186      * @return      number of bytes read, or <code>-1</code> to indicate
187      *              the end of the compressed stream
188      *
189      * @throws      CorruptedInputException
190      *
191      * @throws      XZIOException if the stream has been closed
192      *
193      * @throws      EOFException
194      *                          compressed input is truncated or corrupt
195      *
196      * @throws      IOException may be thrown by <code>in</code>
197      */
read(byte[] buf, int off, int len)198     public int read(byte[] buf, int off, int len) throws IOException {
199         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
200             throw new IndexOutOfBoundsException();
201 
202         if (len == 0)
203             return 0;
204 
205         if (in == null)
206             throw new XZIOException("Stream closed");
207 
208         if (exception != null)
209             throw exception;
210 
211         if (endReached)
212             return -1;
213 
214         try {
215             int size = 0;
216 
217             while (len > 0) {
218                 if (uncompressedSize == 0) {
219                     decodeChunkHeader();
220                     if (endReached)
221                         return size == 0 ? -1 : size;
222                 }
223 
224                 int copySizeMax = Math.min(uncompressedSize, len);
225 
226                 if (!isLZMAChunk) {
227                     lz.copyUncompressed(in, copySizeMax);
228                 } else {
229                     lz.setLimit(copySizeMax);
230                     lzma.decode();
231                     if (!rc.isInBufferOK())
232                         throw new CorruptedInputException();
233                 }
234 
235                 int copiedSize = lz.flush(buf, off);
236                 off += copiedSize;
237                 len -= copiedSize;
238                 size += copiedSize;
239                 uncompressedSize -= copiedSize;
240 
241                 if (uncompressedSize == 0)
242                     if (!rc.isFinished() || lz.hasPending())
243                         throw new CorruptedInputException();
244             }
245 
246             return size;
247 
248         } catch (IOException e) {
249             exception = e;
250             throw e;
251         }
252     }
253 
decodeChunkHeader()254     private void decodeChunkHeader() throws IOException {
255         int control = in.readUnsignedByte();
256 
257         if (control == 0x00) {
258             endReached = true;
259             return;
260         }
261 
262         if (control >= 0xE0 || control == 0x01) {
263             needProps = true;
264             needDictReset = false;
265             lz.reset();
266         } else if (needDictReset) {
267             throw new CorruptedInputException();
268         }
269 
270         if (control >= 0x80) {
271             isLZMAChunk = true;
272 
273             uncompressedSize = (control & 0x1F) << 16;
274             uncompressedSize += in.readUnsignedShort() + 1;
275 
276             int compressedSize = in.readUnsignedShort() + 1;
277 
278             if (control >= 0xC0) {
279                 needProps = false;
280                 decodeProps();
281 
282             } else if (needProps) {
283                 throw new CorruptedInputException();
284 
285             } else if (control >= 0xA0) {
286                 lzma.reset();
287             }
288 
289             rc.prepareInputBuffer(in, compressedSize);
290 
291         } else if (control > 0x02) {
292             throw new CorruptedInputException();
293 
294         } else {
295             isLZMAChunk = false;
296             uncompressedSize = in.readUnsignedShort() + 1;
297         }
298     }
299 
decodeProps()300     private void decodeProps() throws IOException {
301         int props = in.readUnsignedByte();
302 
303         if (props > (4 * 5 + 4) * 9 + 8)
304             throw new CorruptedInputException();
305 
306         int pb = props / (9 * 5);
307         props -= pb * 9 * 5;
308         int lp = props / 9;
309         int lc = props - lp * 9;
310 
311         if (lc + lp > 4)
312             throw new CorruptedInputException();
313 
314         lzma = new LZMADecoder(lz, rc, lc, lp, pb);
315     }
316 
317     /**
318      * Returns the number of uncompressed bytes that can be read
319      * without blocking. The value is returned with an assumption
320      * that the compressed input data will be valid. If the compressed
321      * data is corrupt, <code>CorruptedInputException</code> may get
322      * thrown before the number of bytes claimed to be available have
323      * been read from this input stream.
324      * <p>
325      * In LZMA2InputStream, the return value will be non-zero when the
326      * decompressor is in the middle of an LZMA2 chunk. The return value
327      * will then be the number of uncompressed bytes remaining from that
328      * chunk.
329      *
330      * @return      the number of uncompressed bytes that can be read
331      *              without blocking
332      */
available()333     public int available() throws IOException {
334         if (in == null)
335             throw new XZIOException("Stream closed");
336 
337         if (exception != null)
338             throw exception;
339 
340         return uncompressedSize;
341     }
342 
343     /**
344      * Closes the stream and calls <code>in.close()</code>.
345      * If the stream was already closed, this does nothing.
346      *
347      * @throws  IOException if thrown by <code>in.close()</code>
348      */
close()349     public void close() throws IOException {
350         if (in != null) {
351             try {
352                 in.close();
353             } finally {
354                 in = null;
355             }
356         }
357     }
358 }
359