1 /*
2  * SeekableXZInputStream
3  *
4  * Author: Lasse Collin <lasse.collin@tukaani.org>
5  *
6  * This file has been put into the public domain.
7  * You can do whatever you want with this file.
8  */
9 
10 package org.tukaani.xz;
11 
12 import java.util.Arrays;
13 import java.util.ArrayList;
14 import java.io.DataInputStream;
15 import java.io.IOException;
16 import java.io.EOFException;
17 import org.tukaani.xz.common.DecoderUtil;
18 import org.tukaani.xz.common.StreamFlags;
19 import org.tukaani.xz.check.Check;
20 import org.tukaani.xz.index.IndexDecoder;
21 import org.tukaani.xz.index.BlockInfo;
22 
23 /**
24  * Decompresses a .xz file in random access mode.
25  * This supports decompressing concatenated .xz files.
26  * <p>
27  * Each .xz file consist of one or more Streams. Each Stream consist of zero
28  * or more Blocks. Each Stream contains an Index of Streams' Blocks.
29  * The Indexes from all Streams are loaded in RAM by a constructor of this
30  * class. A typical .xz file has only one Stream, and parsing its Index will
31  * need only three or four seeks.
32  * <p>
33  * To make random access possible, the data in a .xz file must be splitted
34  * into multiple Blocks of reasonable size. Decompression can only start at
35  * a Block boundary. When seeking to an uncompressed position that is not at
36  * a Block boundary, decompression starts at the beginning of the Block and
37  * throws away data until the target position is reached. Thus, smaller Blocks
38  * mean faster seeks to arbitrary uncompressed positions. On the other hand,
39  * smaller Blocks mean worse compression. So one has to make a compromise
40  * between random access speed and compression ratio.
41  * <p>
42  * Implementation note: This class uses linear search to locate the correct
43  * Stream from the data structures in RAM. It was the simplest to implement
44  * and should be fine as long as there aren't too many Streams. The correct
45  * Block inside a Stream is located using binary search and thus is fast
46  * even with a huge number of Blocks.
47  *
48  * <h4>Memory usage</h4>
49  * <p>
50  * The amount of memory needed for the Indexes is taken into account when
51  * checking the memory usage limit. Each Stream is calculated to need at
52  * least 1&nbsp;KiB of memory and each Block 16 bytes of memory, rounded up
53  * to the next kibibyte. So unless the file has a huge number of Streams or
54  * Blocks, these don't take significant amount of memory.
55  *
56  * <h4>Creating random-accessible .xz files</h4>
57  * <p>
58  * When using {@link XZOutputStream}, a new Block can be started by calling
59  * its {@link XZOutputStream#endBlock() endBlock} method. If you know
60  * that the decompressor will only need to seek to certain uncompressed
61  * positions, it can be a good idea to start a new Block at (some of) these
62  * positions (and only at these positions to get better compression ratio).
63  * <p>
64  * liblzma in XZ Utils supports starting a new Block with
65  * <code>LZMA_FULL_FLUSH</code>. XZ Utils 5.1.1alpha added threaded
66  * compression which creates multi-Block .xz files. XZ Utils 5.1.1alpha
67  * also added the option <code>--block-size=SIZE</code> to the xz command
68  * line tool. XZ Utils 5.1.2alpha added a partial implementation of
69  * <code>--block-list=SIZES</code> which allows specifying sizes of
70  * individual Blocks.
71  *
72  * @see SeekableFileInputStream
73  * @see XZInputStream
74  * @see XZOutputStream
75  */
76 public class SeekableXZInputStream extends SeekableInputStream {
77     /**
78      * Cache for big arrays.
79      */
80     private final ArrayCache arrayCache;
81 
82     /**
83      * The input stream containing XZ compressed data.
84      */
85     private SeekableInputStream in;
86 
87     /**
88      * Memory usage limit after the memory usage of the IndexDecoders have
89      * been substracted.
90      */
91     private final int memoryLimit;
92 
93     /**
94      * Memory usage of the IndexDecoders.
95      * <code>memoryLimit + indexMemoryUsage</code> equals the original
96      * memory usage limit that was passed to the constructor.
97      */
98     private int indexMemoryUsage = 0;
99 
100     /**
101      * List of IndexDecoders, one for each Stream in the file.
102      * The list is in reverse order: The first element is
103      * the last Stream in the file.
104      */
105     private final ArrayList<IndexDecoder> streams
106             = new ArrayList<IndexDecoder>();
107 
108     /**
109      * Bitmask of all Check IDs seen.
110      */
111     private int checkTypes = 0;
112 
113     /**
114      * Uncompressed size of the file (all Streams).
115      */
116     private long uncompressedSize = 0;
117 
118     /**
119      * Uncompressed size of the largest XZ Block in the file.
120      */
121     private long largestBlockSize = 0;
122 
123     /**
124      * Number of XZ Blocks in the file.
125      */
126     private int blockCount = 0;
127 
128     /**
129      * Size and position information about the current Block.
130      * If there are no Blocks, all values will be <code>-1</code>.
131      */
132     private final BlockInfo curBlockInfo;
133 
134     /**
135      * Temporary (and cached) information about the Block whose information
136      * is queried via <code>getBlockPos</code> and related functions.
137      */
138     private final BlockInfo queriedBlockInfo;
139 
140     /**
141      * Integrity Check in the current XZ Stream. The constructor leaves
142      * this to point to the Check of the first Stream.
143      */
144     private Check check;
145 
146     /**
147      * Flag indicating if the integrity checks will be verified.
148      */
149     private final boolean verifyCheck;
150 
151     /**
152      * Decoder of the current XZ Block, if any.
153      */
154     private BlockInputStream blockDecoder = null;
155 
156     /**
157      * Current uncompressed position.
158      */
159     private long curPos = 0;
160 
161     /**
162      * Target position for seeking.
163      */
164     private long seekPos;
165 
166     /**
167      * True when <code>seek(long)</code> has been called but the actual
168      * seeking hasn't been done yet.
169      */
170     private boolean seekNeeded = false;
171 
172     /**
173      * True when end of the file was reached. This can be cleared by
174      * calling <code>seek(long)</code>.
175      */
176     private boolean endReached = false;
177 
178     /**
179      * Pending exception from an earlier error.
180      */
181     private IOException exception = null;
182 
183     /**
184      * Temporary buffer for read(). This avoids reallocating memory
185      * on every read() call.
186      */
187     private final byte[] tempBuf = new byte[1];
188 
189     /**
190      * Creates a new seekable XZ decompressor without a memory usage limit.
191      *
192      * @param       in          seekable input stream containing one or more
193      *                          XZ Streams; the whole input stream is used
194      *
195      * @throws      XZFormatException
196      *                          input is not in the XZ format
197      *
198      * @throws      CorruptedInputException
199      *                          XZ data is corrupt or truncated
200      *
201      * @throws      UnsupportedOptionsException
202      *                          XZ headers seem valid but they specify
203      *                          options not supported by this implementation
204      *
205      * @throws      EOFException
206      *                          less than 6 bytes of input was available
207      *                          from <code>in</code>, or (unlikely) the size
208      *                          of the underlying stream got smaller while
209      *                          this was reading from it
210      *
211      * @throws      IOException may be thrown by <code>in</code>
212      */
SeekableXZInputStream(SeekableInputStream in)213     public SeekableXZInputStream(SeekableInputStream in)
214             throws IOException {
215         this(in, -1);
216     }
217 
218     /**
219      * Creates a new seekable XZ decompressor without a memory usage limit.
220      * <p>
221      * This is identical to
222      * <code>SeekableXZInputStream(SeekableInputStream)</code> except that
223      * this also takes the <code>arrayCache</code> argument.
224      *
225      * @param       in          seekable input stream containing one or more
226      *                          XZ Streams; the whole input stream is used
227      *
228      * @param       arrayCache  cache to be used for allocating large arrays
229      *
230      * @throws      XZFormatException
231      *                          input is not in the XZ format
232      *
233      * @throws      CorruptedInputException
234      *                          XZ data is corrupt or truncated
235      *
236      * @throws      UnsupportedOptionsException
237      *                          XZ headers seem valid but they specify
238      *                          options not supported by this implementation
239      *
240      * @throws      EOFException
241      *                          less than 6 bytes of input was available
242      *                          from <code>in</code>, or (unlikely) the size
243      *                          of the underlying stream got smaller while
244      *                          this was reading from it
245      *
246      * @throws      IOException may be thrown by <code>in</code>
247      *
248      * @since 1.7
249      */
SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache)250     public SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache)
251             throws IOException {
252         this(in, -1, arrayCache);
253     }
254 
255     /**
256      * Creates a new seekable XZ decomporessor with an optional
257      * memory usage limit.
258      *
259      * @param       in          seekable input stream containing one or more
260      *                          XZ Streams; the whole input stream is used
261      *
262      * @param       memoryLimit memory usage limit in kibibytes (KiB)
263      *                          or <code>-1</code> to impose no
264      *                          memory usage limit
265      *
266      * @throws      XZFormatException
267      *                          input is not in the XZ format
268      *
269      * @throws      CorruptedInputException
270      *                          XZ data is corrupt or truncated
271      *
272      * @throws      UnsupportedOptionsException
273      *                          XZ headers seem valid but they specify
274      *                          options not supported by this implementation
275      *
276      * @throws      MemoryLimitException
277      *                          decoded XZ Indexes would need more memory
278      *                          than allowed by the memory usage limit
279      *
280      * @throws      EOFException
281      *                          less than 6 bytes of input was available
282      *                          from <code>in</code>, or (unlikely) the size
283      *                          of the underlying stream got smaller while
284      *                          this was reading from it
285      *
286      * @throws      IOException may be thrown by <code>in</code>
287      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit)288     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit)
289             throws IOException {
290         this(in, memoryLimit, true);
291     }
292 
293     /**
294      * Creates a new seekable XZ decomporessor with an optional
295      * memory usage limit.
296      * <p>
297      * This is identical to
298      * <code>SeekableXZInputStream(SeekableInputStream,int)</code>
299      * except that this also takes the <code>arrayCache</code> argument.
300      *
301      * @param       in          seekable input stream containing one or more
302      *                          XZ Streams; the whole input stream is used
303      *
304      * @param       memoryLimit memory usage limit in kibibytes (KiB)
305      *                          or <code>-1</code> to impose no
306      *                          memory usage limit
307      *
308      * @param       arrayCache  cache to be used for allocating large arrays
309      *
310      * @throws      XZFormatException
311      *                          input is not in the XZ format
312      *
313      * @throws      CorruptedInputException
314      *                          XZ data is corrupt or truncated
315      *
316      * @throws      UnsupportedOptionsException
317      *                          XZ headers seem valid but they specify
318      *                          options not supported by this implementation
319      *
320      * @throws      MemoryLimitException
321      *                          decoded XZ Indexes would need more memory
322      *                          than allowed by the memory usage limit
323      *
324      * @throws      EOFException
325      *                          less than 6 bytes of input was available
326      *                          from <code>in</code>, or (unlikely) the size
327      *                          of the underlying stream got smaller while
328      *                          this was reading from it
329      *
330      * @throws      IOException may be thrown by <code>in</code>
331      *
332      * @since 1.7
333      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, ArrayCache arrayCache)334     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
335                                  ArrayCache arrayCache)
336             throws IOException {
337         this(in, memoryLimit, true, arrayCache);
338     }
339 
340     /**
341      * Creates a new seekable XZ decomporessor with an optional
342      * memory usage limit and ability to disable verification
343      * of integrity checks.
344      * <p>
345      * Note that integrity check verification should almost never be disabled.
346      * Possible reasons to disable integrity check verification:
347      * <ul>
348      *   <li>Trying to recover data from a corrupt .xz file.</li>
349      *   <li>Speeding up decompression. This matters mostly with SHA-256
350      *   or with files that have compressed extremely well. It's recommended
351      *   that integrity checking isn't disabled for performance reasons
352      *   unless the file integrity is verified externally in some other
353      *   way.</li>
354      * </ul>
355      * <p>
356      * <code>verifyCheck</code> only affects the integrity check of
357      * the actual compressed data. The CRC32 fields in the headers
358      * are always verified.
359      *
360      * @param       in          seekable input stream containing one or more
361      *                          XZ Streams; the whole input stream is used
362      *
363      * @param       memoryLimit memory usage limit in kibibytes (KiB)
364      *                          or <code>-1</code> to impose no
365      *                          memory usage limit
366      *
367      * @param       verifyCheck if <code>true</code>, the integrity checks
368      *                          will be verified; this should almost never
369      *                          be set to <code>false</code>
370      *
371      * @throws      XZFormatException
372      *                          input is not in the XZ format
373      *
374      * @throws      CorruptedInputException
375      *                          XZ data is corrupt or truncated
376      *
377      * @throws      UnsupportedOptionsException
378      *                          XZ headers seem valid but they specify
379      *                          options not supported by this implementation
380      *
381      * @throws      MemoryLimitException
382      *                          decoded XZ Indexes would need more memory
383      *                          than allowed by the memory usage limit
384      *
385      * @throws      EOFException
386      *                          less than 6 bytes of input was available
387      *                          from <code>in</code>, or (unlikely) the size
388      *                          of the underlying stream got smaller while
389      *                          this was reading from it
390      *
391      * @throws      IOException may be thrown by <code>in</code>
392      *
393      * @since 1.6
394      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck)395     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
396                                  boolean verifyCheck)
397             throws IOException {
398         this(in, memoryLimit, verifyCheck, ArrayCache.getDefaultCache());
399     }
400 
401     /**
402      * Creates a new seekable XZ decomporessor with an optional
403      * memory usage limit and ability to disable verification
404      * of integrity checks.
405      * <p>
406      * This is identical to
407      * <code>SeekableXZInputStream(SeekableInputStream,int,boolean)</code>
408      * except that this also takes the <code>arrayCache</code> argument.
409      *
410      * @param       in          seekable input stream containing one or more
411      *                          XZ Streams; the whole input stream is used
412      *
413      * @param       memoryLimit memory usage limit in kibibytes (KiB)
414      *                          or <code>-1</code> to impose no
415      *                          memory usage limit
416      *
417      * @param       verifyCheck if <code>true</code>, the integrity checks
418      *                          will be verified; this should almost never
419      *                          be set to <code>false</code>
420      *
421      * @param       arrayCache  cache to be used for allocating large arrays
422      *
423      * @throws      XZFormatException
424      *                          input is not in the XZ format
425      *
426      * @throws      CorruptedInputException
427      *                          XZ data is corrupt or truncated
428      *
429      * @throws      UnsupportedOptionsException
430      *                          XZ headers seem valid but they specify
431      *                          options not supported by this implementation
432      *
433      * @throws      MemoryLimitException
434      *                          decoded XZ Indexes would need more memory
435      *                          than allowed by the memory usage limit
436      *
437      * @throws      EOFException
438      *                          less than 6 bytes of input was available
439      *                          from <code>in</code>, or (unlikely) the size
440      *                          of the underlying stream got smaller while
441      *                          this was reading from it
442      *
443      * @throws      IOException may be thrown by <code>in</code>
444      *
445      * @since 1.7
446      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck, ArrayCache arrayCache)447     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
448                                  boolean verifyCheck, ArrayCache arrayCache)
449             throws IOException {
450         this.arrayCache = arrayCache;
451         this.verifyCheck = verifyCheck;
452         this.in = in;
453         DataInputStream inData = new DataInputStream(in);
454 
455         // Check the magic bytes in the beginning of the file.
456         {
457             in.seek(0);
458             byte[] buf = new byte[XZ.HEADER_MAGIC.length];
459             inData.readFully(buf);
460             if (!Arrays.equals(buf, XZ.HEADER_MAGIC))
461                 throw new XZFormatException();
462         }
463 
464         // Get the file size and verify that it is a multiple of 4 bytes.
465         long pos = in.length();
466         if ((pos & 3) != 0)
467             throw new CorruptedInputException(
468                     "XZ file size is not a multiple of 4 bytes");
469 
470         // Parse the headers starting from the end of the file.
471         byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
472         long streamPadding = 0;
473 
474         while (pos > 0) {
475             if (pos < DecoderUtil.STREAM_HEADER_SIZE)
476                 throw new CorruptedInputException();
477 
478             // Read the potential Stream Footer.
479             in.seek(pos - DecoderUtil.STREAM_HEADER_SIZE);
480             inData.readFully(buf);
481 
482             // Skip Stream Padding four bytes at a time.
483             // Skipping more at once would be faster,
484             // but usually there isn't much Stream Padding.
485             if (buf[8] == 0x00 && buf[9] == 0x00 && buf[10] == 0x00
486                     && buf[11] == 0x00) {
487                 streamPadding += 4;
488                 pos -= 4;
489                 continue;
490             }
491 
492             // It's not Stream Padding. Update pos.
493             pos -= DecoderUtil.STREAM_HEADER_SIZE;
494 
495             // Decode the Stream Footer and check if Backward Size
496             // looks reasonable.
497             StreamFlags streamFooter = DecoderUtil.decodeStreamFooter(buf);
498             if (streamFooter.backwardSize >= pos)
499                 throw new CorruptedInputException(
500                         "Backward Size in XZ Stream Footer is too big");
501 
502             // Check that the Check ID is supported. Store it in case this
503             // is the first Stream in the file.
504             check = Check.getInstance(streamFooter.checkType);
505 
506             // Remember which Check IDs have been seen.
507             checkTypes |= 1 << streamFooter.checkType;
508 
509             // Seek to the beginning of the Index.
510             in.seek(pos - streamFooter.backwardSize);
511 
512             // Decode the Index field.
513             IndexDecoder index;
514             try {
515                 index = new IndexDecoder(in, streamFooter, streamPadding,
516                                          memoryLimit);
517             } catch (MemoryLimitException e) {
518                 // IndexDecoder doesn't know how much memory we had
519                 // already needed so we need to recreate the exception.
520                 assert memoryLimit >= 0;
521                 throw new MemoryLimitException(
522                         e.getMemoryNeeded() + indexMemoryUsage,
523                         memoryLimit + indexMemoryUsage);
524             }
525 
526             // Update the memory usage and limit counters.
527             indexMemoryUsage += index.getMemoryUsage();
528             if (memoryLimit >= 0) {
529                 memoryLimit -= index.getMemoryUsage();
530                 assert memoryLimit >= 0;
531             }
532 
533             // Remember the uncompressed size of the largest Block.
534             if (largestBlockSize < index.getLargestBlockSize())
535                 largestBlockSize = index.getLargestBlockSize();
536 
537             // Calculate the offset to the beginning of this XZ Stream and
538             // check that it looks sane.
539             long off = index.getStreamSize() - DecoderUtil.STREAM_HEADER_SIZE;
540             if (pos < off)
541                 throw new CorruptedInputException("XZ Index indicates "
542                         + "too big compressed size for the XZ Stream");
543 
544             // Seek to the beginning of this Stream.
545             pos -= off;
546             in.seek(pos);
547 
548             // Decode the Stream Header.
549             inData.readFully(buf);
550             StreamFlags streamHeader = DecoderUtil.decodeStreamHeader(buf);
551 
552             // Verify that the Stream Header matches the Stream Footer.
553             if (!DecoderUtil.areStreamFlagsEqual(streamHeader, streamFooter))
554                 throw new CorruptedInputException(
555                         "XZ Stream Footer does not match Stream Header");
556 
557             // Update the total uncompressed size of the file and check that
558             // it doesn't overflow.
559             uncompressedSize += index.getUncompressedSize();
560             if (uncompressedSize < 0)
561                 throw new UnsupportedOptionsException("XZ file is too big");
562 
563             // Update the Block count and check that it fits into an int.
564             blockCount += index.getRecordCount();
565             if (blockCount < 0)
566                 throw new UnsupportedOptionsException(
567                         "XZ file has over " + Integer.MAX_VALUE + " Blocks");
568 
569             // Add this Stream to the list of Streams.
570             streams.add(index);
571 
572             // Reset to be ready to parse the next Stream.
573             streamPadding = 0;
574         }
575 
576         assert pos == 0;
577 
578         // Save it now that indexMemoryUsage has been substracted from it.
579         this.memoryLimit = memoryLimit;
580 
581         // Store the relative offsets of the Streams. This way we don't
582         // need to recalculate them in this class when seeking; the
583         // IndexDecoder instances will handle them.
584         IndexDecoder prev = streams.get(streams.size() - 1);
585         for (int i = streams.size() - 2; i >= 0; --i) {
586             IndexDecoder cur = streams.get(i);
587             cur.setOffsets(prev);
588             prev = cur;
589         }
590 
591         // Initialize curBlockInfo to point to the first Stream.
592         // The blockNumber will be left to -1 so that .hasNext()
593         // and .setNext() work to get the first Block when starting
594         // to decompress from the beginning of the file.
595         IndexDecoder first = streams.get(streams.size() - 1);
596         curBlockInfo = new BlockInfo(first);
597 
598         // queriedBlockInfo needs to be allocated too. The Stream used for
599         // initialization doesn't matter though.
600         queriedBlockInfo = new BlockInfo(first);
601     }
602 
603     /**
604      * Gets the types of integrity checks used in the .xz file.
605      * Multiple checks are possible only if there are multiple
606      * concatenated XZ Streams.
607      * <p>
608      * The returned value has a bit set for every check type that is present.
609      * For example, if CRC64 and SHA-256 were used, the return value is
610      * <code>(1&nbsp;&lt;&lt;&nbsp;XZ.CHECK_CRC64)
611      * | (1&nbsp;&lt;&lt;&nbsp;XZ.CHECK_SHA256)</code>.
612      */
getCheckTypes()613     public int getCheckTypes() {
614         return checkTypes;
615     }
616 
617     /**
618      * Gets the amount of memory in kibibytes (KiB) used by
619      * the data structures needed to locate the XZ Blocks.
620      * This is usually useless information but since it is calculated
621      * for memory usage limit anyway, it is nice to make it available to too.
622      */
getIndexMemoryUsage()623     public int getIndexMemoryUsage() {
624         return indexMemoryUsage;
625     }
626 
627     /**
628      * Gets the uncompressed size of the largest XZ Block in bytes.
629      * This can be useful if you want to check that the file doesn't
630      * have huge XZ Blocks which could make seeking to arbitrary offsets
631      * very slow. Note that huge Blocks don't automatically mean that
632      * seeking would be slow, for example, seeking to the beginning of
633      * any Block is always fast.
634      */
getLargestBlockSize()635     public long getLargestBlockSize() {
636         return largestBlockSize;
637     }
638 
639     /**
640      * Gets the number of Streams in the .xz file.
641      *
642      * @since 1.3
643      */
getStreamCount()644     public int getStreamCount() {
645         return streams.size();
646     }
647 
648     /**
649      * Gets the number of Blocks in the .xz file.
650      *
651      * @since 1.3
652      */
getBlockCount()653     public int getBlockCount() {
654         return blockCount;
655     }
656 
657     /**
658      * Gets the uncompressed start position of the given Block.
659      *
660      * @throws  IndexOutOfBoundsException if
661      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
662      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
663      *
664      * @since 1.3
665      */
getBlockPos(int blockNumber)666     public long getBlockPos(int blockNumber) {
667         locateBlockByNumber(queriedBlockInfo, blockNumber);
668         return queriedBlockInfo.uncompressedOffset;
669     }
670 
671     /**
672      * Gets the uncompressed size of the given Block.
673      *
674      * @throws  IndexOutOfBoundsException if
675      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
676      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
677      *
678      * @since 1.3
679      */
getBlockSize(int blockNumber)680     public long getBlockSize(int blockNumber) {
681         locateBlockByNumber(queriedBlockInfo, blockNumber);
682         return queriedBlockInfo.uncompressedSize;
683     }
684 
685     /**
686      * Gets the position where the given compressed Block starts in
687      * the underlying .xz file.
688      * This information is rarely useful to the users of this class.
689      *
690      * @throws  IndexOutOfBoundsException if
691      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
692      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
693      *
694      * @since 1.3
695      */
getBlockCompPos(int blockNumber)696     public long getBlockCompPos(int blockNumber) {
697         locateBlockByNumber(queriedBlockInfo, blockNumber);
698         return queriedBlockInfo.compressedOffset;
699     }
700 
701     /**
702      * Gets the compressed size of the given Block.
703      * This together with the uncompressed size can be used to calculate
704      * the compression ratio of the specific Block.
705      *
706      * @throws  IndexOutOfBoundsException if
707      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
708      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
709      *
710      * @since 1.3
711      */
getBlockCompSize(int blockNumber)712     public long getBlockCompSize(int blockNumber) {
713         locateBlockByNumber(queriedBlockInfo, blockNumber);
714         return (queriedBlockInfo.unpaddedSize + 3) & ~3;
715     }
716 
717     /**
718      * Gets integrity check type (Check ID) of the given Block.
719      *
720      * @throws  IndexOutOfBoundsException if
721      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
722      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
723      *
724      * @see #getCheckTypes()
725      *
726      * @since 1.3
727      */
getBlockCheckType(int blockNumber)728     public int getBlockCheckType(int blockNumber) {
729         locateBlockByNumber(queriedBlockInfo, blockNumber);
730         return queriedBlockInfo.getCheckType();
731     }
732 
733     /**
734      * Gets the number of the Block that contains the byte at the given
735      * uncompressed position.
736      *
737      * @throws  IndexOutOfBoundsException if
738      *          <code>pos&nbsp;&lt;&nbsp;0</code> or
739      *          <code>pos&nbsp;&gt;=&nbsp;length()</code>.
740      *
741      * @since 1.3
742      */
getBlockNumber(long pos)743     public int getBlockNumber(long pos) {
744         locateBlockByPos(queriedBlockInfo, pos);
745         return queriedBlockInfo.blockNumber;
746     }
747 
748     /**
749      * Decompresses the next byte from this input stream.
750      *
751      * @return      the next decompressed byte, or <code>-1</code>
752      *              to indicate the end of the compressed stream
753      *
754      * @throws      CorruptedInputException
755      * @throws      UnsupportedOptionsException
756      * @throws      MemoryLimitException
757      *
758      * @throws      XZIOException if the stream has been closed
759      *
760      * @throws      IOException may be thrown by <code>in</code>
761      */
read()762     public int read() throws IOException {
763         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
764     }
765 
766     /**
767      * Decompresses into an array of bytes.
768      * <p>
769      * If <code>len</code> is zero, no bytes are read and <code>0</code>
770      * is returned. Otherwise this will try to decompress <code>len</code>
771      * bytes of uncompressed data. Less than <code>len</code> bytes may
772      * be read only in the following situations:
773      * <ul>
774      *   <li>The end of the compressed data was reached successfully.</li>
775      *   <li>An error is detected after at least one but less than
776      *       <code>len</code> bytes have already been successfully
777      *       decompressed. The next call with non-zero <code>len</code>
778      *       will immediately throw the pending exception.</li>
779      *   <li>An exception is thrown.</li>
780      * </ul>
781      *
782      * @param       buf         target buffer for uncompressed data
783      * @param       off         start offset in <code>buf</code>
784      * @param       len         maximum number of uncompressed bytes to read
785      *
786      * @return      number of bytes read, or <code>-1</code> to indicate
787      *              the end of the compressed stream
788      *
789      * @throws      CorruptedInputException
790      * @throws      UnsupportedOptionsException
791      * @throws      MemoryLimitException
792      *
793      * @throws      XZIOException if the stream has been closed
794      *
795      * @throws      IOException may be thrown by <code>in</code>
796      */
read(byte[] buf, int off, int len)797     public int read(byte[] buf, int off, int len) throws IOException {
798         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
799             throw new IndexOutOfBoundsException();
800 
801         if (len == 0)
802             return 0;
803 
804         if (in == null)
805             throw new XZIOException("Stream closed");
806 
807         if (exception != null)
808             throw exception;
809 
810         int size = 0;
811 
812         try {
813             if (seekNeeded)
814                 seek();
815 
816             if (endReached)
817                 return -1;
818 
819             while (len > 0) {
820                 if (blockDecoder == null) {
821                     seek();
822                     if (endReached)
823                         break;
824                 }
825 
826                 int ret = blockDecoder.read(buf, off, len);
827 
828                 if (ret > 0) {
829                     curPos += ret;
830                     size += ret;
831                     off += ret;
832                     len -= ret;
833                 } else if (ret == -1) {
834                     blockDecoder = null;
835                 }
836             }
837         } catch (IOException e) {
838             // We know that the file isn't simply truncated because we could
839             // parse the Indexes in the constructor. So convert EOFException
840             // to CorruptedInputException.
841             if (e instanceof EOFException)
842                 e = new CorruptedInputException();
843 
844             exception = e;
845             if (size == 0)
846                 throw e;
847         }
848 
849         return size;
850     }
851 
852     /**
853      * Returns the number of uncompressed bytes that can be read
854      * without blocking. The value is returned with an assumption
855      * that the compressed input data will be valid. If the compressed
856      * data is corrupt, <code>CorruptedInputException</code> may get
857      * thrown before the number of bytes claimed to be available have
858      * been read from this input stream.
859      *
860      * @return      the number of uncompressed bytes that can be read
861      *              without blocking
862      */
available()863     public int available() throws IOException {
864         if (in == null)
865             throw new XZIOException("Stream closed");
866 
867         if (exception != null)
868             throw exception;
869 
870         if (endReached || seekNeeded || blockDecoder == null)
871             return 0;
872 
873         return blockDecoder.available();
874     }
875 
876     /**
877      * Closes the stream and calls <code>in.close()</code>.
878      * If the stream was already closed, this does nothing.
879      * <p>
880      * This is equivalent to <code>close(true)</code>.
881      *
882      * @throws  IOException if thrown by <code>in.close()</code>
883      */
close()884     public void close() throws IOException {
885         close(true);
886     }
887 
888     /**
889      * Closes the stream and optionally calls <code>in.close()</code>.
890      * If the stream was already closed, this does nothing.
891      * If <code>close(false)</code> has been called, a further
892      * call of <code>close(true)</code> does nothing (it doesn't call
893      * <code>in.close()</code>).
894      * <p>
895      * If you don't want to close the underlying <code>InputStream</code>,
896      * there is usually no need to worry about closing this stream either;
897      * it's fine to do nothing and let the garbage collector handle it.
898      * However, if you are using {@link ArrayCache}, <code>close(false)</code>
899      * can be useful to put the allocated arrays back to the cache without
900      * closing the underlying <code>InputStream</code>.
901      * <p>
902      * Note that if you successfully reach the end of the stream
903      * (<code>read</code> returns <code>-1</code>), the arrays are
904      * automatically put back to the cache by that <code>read</code> call. In
905      * this situation <code>close(false)</code> is redundant (but harmless).
906      *
907      * @throws  IOException if thrown by <code>in.close()</code>
908      *
909      * @since 1.7
910      */
close(boolean closeInput)911     public void close(boolean closeInput) throws IOException {
912         if (in != null) {
913             if (blockDecoder != null) {
914                 blockDecoder.close();
915                 blockDecoder = null;
916             }
917 
918             try {
919                 if (closeInput)
920                     in.close();
921             } finally {
922                 in = null;
923             }
924         }
925     }
926 
927     /**
928      * Gets the uncompressed size of this input stream. If there are multiple
929      * XZ Streams, the total uncompressed size of all XZ Streams is returned.
930      */
length()931     public long length() {
932         return uncompressedSize;
933     }
934 
935     /**
936      * Gets the current uncompressed position in this input stream.
937      *
938      * @throws      XZIOException if the stream has been closed
939      */
position()940     public long position() throws IOException {
941         if (in == null)
942             throw new XZIOException("Stream closed");
943 
944         return seekNeeded ? seekPos : curPos;
945     }
946 
947     /**
948      * Seeks to the specified absolute uncompressed position in the stream.
949      * This only stores the new position, so this function itself is always
950      * very fast. The actual seek is done when <code>read</code> is called
951      * to read at least one byte.
952      * <p>
953      * Seeking past the end of the stream is possible. In that case
954      * <code>read</code> will return <code>-1</code> to indicate
955      * the end of the stream.
956      *
957      * @param       pos         new uncompressed read position
958      *
959      * @throws      XZIOException
960      *                          if <code>pos</code> is negative, or
961      *                          if stream has been closed
962      */
seek(long pos)963     public void seek(long pos) throws IOException {
964         if (in == null)
965             throw new XZIOException("Stream closed");
966 
967         if (pos < 0)
968             throw new XZIOException("Negative seek position: " + pos);
969 
970         seekPos = pos;
971         seekNeeded = true;
972     }
973 
974     /**
975      * Seeks to the beginning of the given XZ Block.
976      *
977      * @throws      XZIOException
978      *              if <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
979      *              <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>,
980      *              or if stream has been closed
981      *
982      * @since 1.3
983      */
seekToBlock(int blockNumber)984     public void seekToBlock(int blockNumber) throws IOException {
985         if (in == null)
986             throw new XZIOException("Stream closed");
987 
988         if (blockNumber < 0 || blockNumber >= blockCount)
989             throw new XZIOException("Invalid XZ Block number: " + blockNumber);
990 
991         // This is a bit silly implementation. Here we locate the uncompressed
992         // offset of the specified Block, then when doing the actual seek in
993         // seek(), we need to find the Block number based on seekPos.
994         seekPos = getBlockPos(blockNumber);
995         seekNeeded = true;
996     }
997 
998     /**
999      * Does the actual seeking. This is also called when <code>read</code>
1000      * needs a new Block to decode.
1001      */
seek()1002     private void seek() throws IOException {
1003         // If seek(long) wasn't called, we simply need to get the next Block
1004         // from the same Stream. If there are no more Blocks in this Stream,
1005         // then we behave as if seek(long) had been called.
1006         if (!seekNeeded) {
1007             if (curBlockInfo.hasNext()) {
1008                 curBlockInfo.setNext();
1009                 initBlockDecoder();
1010                 return;
1011             }
1012 
1013             seekPos = curPos;
1014         }
1015 
1016         seekNeeded = false;
1017 
1018         // Check if we are seeking to or past the end of the file.
1019         if (seekPos >= uncompressedSize) {
1020             curPos = seekPos;
1021 
1022             if (blockDecoder != null) {
1023                 blockDecoder.close();
1024                 blockDecoder = null;
1025             }
1026 
1027             endReached = true;
1028             return;
1029         }
1030 
1031         endReached = false;
1032 
1033         // Locate the Block that contains the uncompressed target position.
1034         locateBlockByPos(curBlockInfo, seekPos);
1035 
1036         // Seek in the underlying stream and create a new Block decoder
1037         // only if really needed. We can skip it if the current position
1038         // is already in the correct Block and the target position hasn't
1039         // been decompressed yet.
1040         //
1041         // NOTE: If curPos points to the beginning of this Block, it's
1042         // because it was left there after decompressing an earlier Block.
1043         // In that case, decoding of the current Block hasn't been started
1044         // yet. (Decoding of a Block won't be started until at least one
1045         // byte will also be read from it.)
1046         if (!(curPos > curBlockInfo.uncompressedOffset && curPos <= seekPos)) {
1047             // Seek to the beginning of the Block.
1048             in.seek(curBlockInfo.compressedOffset);
1049 
1050             // Since it is possible that this Block is from a different
1051             // Stream than the previous Block, initialize a new Check.
1052             check = Check.getInstance(curBlockInfo.getCheckType());
1053 
1054             // Create a new Block decoder.
1055             initBlockDecoder();
1056             curPos = curBlockInfo.uncompressedOffset;
1057         }
1058 
1059         // If the target wasn't at a Block boundary, decompress and throw
1060         // away data to reach the target position.
1061         if (seekPos > curPos) {
1062             // NOTE: The "if" below is there just in case. In this situation,
1063             // blockDecoder.skip will always skip the requested amount
1064             // or throw an exception.
1065             long skipAmount = seekPos - curPos;
1066             if (blockDecoder.skip(skipAmount) != skipAmount)
1067                 throw new CorruptedInputException();
1068 
1069             curPos = seekPos;
1070         }
1071     }
1072 
1073     /**
1074      * Locates the Block that contains the given uncompressed position.
1075      */
locateBlockByPos(BlockInfo info, long pos)1076     private void locateBlockByPos(BlockInfo info, long pos) {
1077         if (pos < 0 || pos >= uncompressedSize)
1078             throw new IndexOutOfBoundsException(
1079                     "Invalid uncompressed position: " + pos);
1080 
1081         // Locate the Stream that contains the target position.
1082         IndexDecoder index;
1083         for (int i = 0; ; ++i) {
1084             index = streams.get(i);
1085             if (index.hasUncompressedOffset(pos))
1086                 break;
1087         }
1088 
1089         // Locate the Block from the Stream that contains the target position.
1090         index.locateBlock(info, pos);
1091 
1092         assert (info.compressedOffset & 3) == 0;
1093         assert info.uncompressedSize > 0;
1094         assert pos >= info.uncompressedOffset;
1095         assert pos < info.uncompressedOffset + info.uncompressedSize;
1096     }
1097 
1098     /**
1099      * Locates the given Block and stores information about it
1100      * to <code>info</code>.
1101      */
1102     private void locateBlockByNumber(BlockInfo info, int blockNumber) {
1103         // Validate.
1104         if (blockNumber < 0 || blockNumber >= blockCount)
1105             throw new IndexOutOfBoundsException(
1106                     "Invalid XZ Block number: " + blockNumber);
1107 
1108         // Skip the search if info already points to the correct Block.
1109         if (info.blockNumber == blockNumber)
1110             return;
1111 
1112         // Search the Stream that contains the given Block and then
1113         // search the Block from that Stream.
1114         for (int i = 0; ; ++i) {
1115             IndexDecoder index = streams.get(i);
1116             if (index.hasRecord(blockNumber)) {
1117                 index.setBlockInfo(info, blockNumber);
1118                 return;
1119             }
1120         }
1121     }
1122 
1123     /**
1124      * Initializes a new BlockInputStream. This is a helper function for
1125      * <code>seek()</code>.
1126      */
1127     private void initBlockDecoder() throws IOException {
1128         try {
1129             // Set it to null first so that GC can collect it if memory
1130             // runs tight when initializing a new BlockInputStream.
1131             if (blockDecoder != null) {
1132                 blockDecoder.close();
1133                 blockDecoder = null;
1134             }
1135 
1136             blockDecoder = new BlockInputStream(
1137                     in, check, verifyCheck, memoryLimit,
1138                     curBlockInfo.unpaddedSize, curBlockInfo.uncompressedSize,
1139                     arrayCache);
1140         } catch (MemoryLimitException e) {
1141             // BlockInputStream doesn't know how much memory we had
1142             // already needed so we need to recreate the exception.
1143             assert memoryLimit >= 0;
1144             throw new MemoryLimitException(
1145                     e.getMemoryNeeded() + indexMemoryUsage,
1146                     memoryLimit + indexMemoryUsage);
1147         } catch (IndexIndicatorException e) {
1148             // It cannot be Index so the file must be corrupt.
1149             throw new CorruptedInputException();
1150         }
1151     }
1152 }
1153