1 /*
2  * SeekableXZInputStream
3  *
4  * Author: Lasse Collin <lasse.collin@tukaani.org>
5  *
6  * This file has been put into the public domain.
7  * You can do whatever you want with this file.
8  */
9 
10 package org.tukaani.xz;
11 
12 import java.util.Arrays;
13 import java.util.ArrayList;
14 import java.io.DataInputStream;
15 import java.io.IOException;
16 import java.io.EOFException;
17 import org.tukaani.xz.common.DecoderUtil;
18 import org.tukaani.xz.common.StreamFlags;
19 import org.tukaani.xz.check.Check;
20 import org.tukaani.xz.index.IndexDecoder;
21 import org.tukaani.xz.index.BlockInfo;
22 
23 /**
24  * Decompresses a .xz file in random access mode.
25  * This supports decompressing concatenated .xz files.
26  * <p>
27  * Each .xz file consists of one or more Streams. Each Stream consists of zero
28  * or more Blocks. Each Stream contains an Index of that Stream's Blocks.
29  * The Indexes from all Streams are loaded in RAM by a constructor of this
30  * class. A typical .xz file has only one Stream, and parsing its Index will
31  * need only three or four seeks.
32  * <p>
33  * To make random access possible, the data in a .xz file must be split
34  * into multiple Blocks of reasonable size. Decompression can only start at
35  * a Block boundary. When seeking to an uncompressed position that is not at
36  * a Block boundary, decompression starts at the beginning of the Block and
37  * throws away data until the target position is reached. Thus, smaller Blocks
38  * mean faster seeks to arbitrary uncompressed positions. On the other hand,
39  * smaller Blocks mean worse compression. So one has to make a compromise
40  * between random access speed and compression ratio.
41  * <p>
42  * Implementation note: This class uses linear search to locate the correct
43  * Stream from the data structures in RAM. It was the simplest to implement
44  * and should be fine as long as there aren't too many Streams. The correct
45  * Block inside a Stream is located using binary search and thus is fast
46  * even with a huge number of Blocks.
47  *
48  * <h4>Memory usage</h4>
49  * <p>
50  * The amount of memory needed for the Indexes is taken into account when
51  * checking the memory usage limit. Each Stream is calculated to need at
52  * least 1&nbsp;KiB of memory and each Block 16 bytes of memory, rounded up
53  * to the next kibibyte. So unless the file has a huge number of Streams or
54  * Blocks, these don't take a significant amount of memory.
55  *
56  * <h4>Creating random-accessible .xz files</h4>
57  * <p>
58  * When using {@link XZOutputStream}, a new Block can be started by calling
59  * its {@link XZOutputStream#endBlock() endBlock} method. If you know
60  * that the decompressor will only need to seek to certain uncompressed
61  * positions, it can be a good idea to start a new Block at (some of) these
62  * positions (and only at these positions to get better compression ratio).
63  * <p>
64  * liblzma in XZ Utils supports starting a new Block with
65  * <code>LZMA_FULL_FLUSH</code>. XZ Utils 5.1.1alpha added threaded
66  * compression which creates multi-Block .xz files. XZ Utils 5.1.1alpha
67  * also added the option <code>--block-size=SIZE</code> to the xz command
68  * line tool. XZ Utils 5.1.2alpha added a partial implementation of
69  * <code>--block-list=SIZES</code> which allows specifying sizes of
70  * individual Blocks.
71  *
72  * @see SeekableFileInputStream
73  * @see XZInputStream
74  * @see XZOutputStream
75  */
76 public class SeekableXZInputStream extends SeekableInputStream {
77     /**
78      * The input stream containing XZ compressed data.
79      */
80     private SeekableInputStream in;
81 
82     /**
83      * Memory usage limit after the memory usage of the IndexDecoders have
84      * been substracted.
85      */
86     private final int memoryLimit;
87 
88     /**
89      * Memory usage of the IndexDecoders.
90      * <code>memoryLimit + indexMemoryUsage</code> equals the original
91      * memory usage limit that was passed to the constructor.
92      */
93     private int indexMemoryUsage = 0;
94 
95     /**
96      * List of IndexDecoders, one for each Stream in the file.
97      * The list is in reverse order: The first element is
98      * the last Stream in the file.
99      */
100     private final ArrayList streams = new ArrayList();
101 
102     /**
103      * Bitmask of all Check IDs seen.
104      */
105     private int checkTypes = 0;
106 
107     /**
108      * Uncompressed size of the file (all Streams).
109      */
110     private long uncompressedSize = 0;
111 
112     /**
113      * Uncompressed size of the largest XZ Block in the file.
114      */
115     private long largestBlockSize = 0;
116 
117     /**
118      * Number of XZ Blocks in the file.
119      */
120     private int blockCount = 0;
121 
122     /**
123      * Size and position information about the current Block.
124      * If there are no Blocks, all values will be <code>-1</code>.
125      */
126     private final BlockInfo curBlockInfo;
127 
128     /**
129      * Temporary (and cached) information about the Block whose information
130      * is queried via <code>getBlockPos</code> and related functions.
131      */
132     private final BlockInfo queriedBlockInfo;
133 
134     /**
135      * Integrity Check in the current XZ Stream. The constructor leaves
136      * this to point to the Check of the first Stream.
137      */
138     private Check check;
139 
140     /**
141      * Flag indicating if the integrity checks will be verified.
142      */
143     private final boolean verifyCheck;
144 
145     /**
146      * Decoder of the current XZ Block, if any.
147      */
148     private BlockInputStream blockDecoder = null;
149 
150     /**
151      * Current uncompressed position.
152      */
153     private long curPos = 0;
154 
155     /**
156      * Target position for seeking.
157      */
158     private long seekPos;
159 
160     /**
161      * True when <code>seek(long)</code> has been called but the actual
162      * seeking hasn't been done yet.
163      */
164     private boolean seekNeeded = false;
165 
166     /**
167      * True when end of the file was reached. This can be cleared by
168      * calling <code>seek(long)</code>.
169      */
170     private boolean endReached = false;
171 
172     /**
173      * Pending exception from an earlier error.
174      */
175     private IOException exception = null;
176 
177     /**
178      * Temporary buffer for read(). This avoids reallocating memory
179      * on every read() call.
180      */
181     private final byte[] tempBuf = new byte[1];
182 
183     /**
184      * Creates a new seekable XZ decompressor without a memory usage limit.
185      *
186      * @param       in          seekable input stream containing one or more
187      *                          XZ Streams; the whole input stream is used
188      *
189      * @throws      XZFormatException
190      *                          input is not in the XZ format
191      *
192      * @throws      CorruptedInputException
193      *                          XZ data is corrupt or truncated
194      *
195      * @throws      UnsupportedOptionsException
196      *                          XZ headers seem valid but they specify
197      *                          options not supported by this implementation
198      *
199      * @throws      EOFException
200      *                          less than 6 bytes of input was available
201      *                          from <code>in</code>, or (unlikely) the size
202      *                          of the underlying stream got smaller while
203      *                          this was reading from it
204      *
205      * @throws      IOException may be thrown by <code>in</code>
206      */
public SeekableXZInputStream(SeekableInputStream in) throws IOException {
    // A memory limit of -1 means "no limit".
    this(in, -1);
}
211 
212     /**
213      * Creates a new seekable XZ decompressor with an optional
214      * memory usage limit.
215      *
216      * @param       in          seekable input stream containing one or more
217      *                          XZ Streams; the whole input stream is used
218      *
219      * @param       memoryLimit memory usage limit in kibibytes (KiB)
220      *                          or <code>-1</code> to impose no
221      *                          memory usage limit
222      *
223      * @throws      XZFormatException
224      *                          input is not in the XZ format
225      *
226      * @throws      CorruptedInputException
227      *                          XZ data is corrupt or truncated
228      *
229      * @throws      UnsupportedOptionsException
230      *                          XZ headers seem valid but they specify
231      *                          options not supported by this implementation
232      *
233      * @throws      MemoryLimitException
234      *                          decoded XZ Indexes would need more memory
235      *                          than allowed by the memory usage limit
236      *
237      * @throws      EOFException
238      *                          less than 6 bytes of input was available
239      *                          from <code>in</code>, or (unlikely) the size
240      *                          of the underlying stream got smaller while
241      *                          this was reading from it
242      *
243      * @throws      IOException may be thrown by <code>in</code>
244      */
public SeekableXZInputStream(SeekableInputStream in, int memoryLimit)
        throws IOException {
    // Integrity check verification is enabled by default.
    this(in, memoryLimit, true);
}
249 
250     /**
251      * Creates a new seekable XZ decompressor with an optional
252      * memory usage limit and ability to disable verification
253      * of integrity checks.
254      * <p>
255      * Note that integrity check verification should almost never be disabled.
256      * Possible reasons to disable integrity check verification:
257      * <ul>
258      *   <li>Trying to recover data from a corrupt .xz file.</li>
259      *   <li>Speeding up decompression. This matters mostly with SHA-256
260      *   or with files that have compressed extremely well. It's recommended
261      *   that integrity checking isn't disabled for performance reasons
262      *   unless the file integrity is verified externally in some other
263      *   way.</li>
264      * </ul>
265      * <p>
266      * <code>verifyCheck</code> only affects the integrity check of
267      * the actual compressed data. The CRC32 fields in the headers
268      * are always verified.
269      *
270      * @param       in          seekable input stream containing one or more
271      *                          XZ Streams; the whole input stream is used
272      *
273      * @param       memoryLimit memory usage limit in kibibytes (KiB)
274      *                          or <code>-1</code> to impose no
275      *                          memory usage limit
276      *
277      * @param       verifyCheck if <code>true</code>, the integrity checks
278      *                          will be verified; this should almost never
279      *                          be set to <code>false</code>
280      *
281      * @throws      XZFormatException
282      *                          input is not in the XZ format
283      *
284      * @throws      CorruptedInputException
285      *                          XZ data is corrupt or truncated
286      *
287      * @throws      UnsupportedOptionsException
288      *                          XZ headers seem valid but they specify
289      *                          options not supported by this implementation
290      *
291      * @throws      MemoryLimitException
292      *                          decoded XZ Indexes would need more memory
293      *                          than allowed by the memory usage limit
294      *
295      * @throws      EOFException
296      *                          less than 6 bytes of input was available
297      *                          from <code>in</code>, or (unlikely) the size
298      *                          of the underlying stream got smaller while
299      *                          this was reading from it
300      *
301      * @throws      IOException may be thrown by <code>in</code>
302      *
303      * @since 1.6
304      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck)305     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
306                                  boolean verifyCheck)
307             throws IOException {
308         this.verifyCheck = verifyCheck;
309         this.in = in;
310         DataInputStream inData = new DataInputStream(in);
311 
312         // Check the magic bytes in the beginning of the file.
313         {
314             in.seek(0);
315             byte[] buf = new byte[XZ.HEADER_MAGIC.length];
316             inData.readFully(buf);
317             if (!Arrays.equals(buf, XZ.HEADER_MAGIC))
318                 throw new XZFormatException();
319         }
320 
321         // Get the file size and verify that it is a multiple of 4 bytes.
322         long pos = in.length();
323         if ((pos & 3) != 0)
324             throw new CorruptedInputException(
325                     "XZ file size is not a multiple of 4 bytes");
326 
327         // Parse the headers starting from the end of the file.
328         byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
329         long streamPadding = 0;
330 
331         while (pos > 0) {
332             if (pos < DecoderUtil.STREAM_HEADER_SIZE)
333                 throw new CorruptedInputException();
334 
335             // Read the potential Stream Footer.
336             in.seek(pos - DecoderUtil.STREAM_HEADER_SIZE);
337             inData.readFully(buf);
338 
339             // Skip Stream Padding four bytes at a time.
340             // Skipping more at once would be faster,
341             // but usually there isn't much Stream Padding.
342             if (buf[8] == 0x00 && buf[9] == 0x00 && buf[10] == 0x00
343                     && buf[11] == 0x00) {
344                 streamPadding += 4;
345                 pos -= 4;
346                 continue;
347             }
348 
349             // It's not Stream Padding. Update pos.
350             pos -= DecoderUtil.STREAM_HEADER_SIZE;
351 
352             // Decode the Stream Footer and check if Backward Size
353             // looks reasonable.
354             StreamFlags streamFooter = DecoderUtil.decodeStreamFooter(buf);
355             if (streamFooter.backwardSize >= pos)
356                 throw new CorruptedInputException(
357                         "Backward Size in XZ Stream Footer is too big");
358 
359             // Check that the Check ID is supported. Store it in case this
360             // is the first Stream in the file.
361             check = Check.getInstance(streamFooter.checkType);
362 
363             // Remember which Check IDs have been seen.
364             checkTypes |= 1 << streamFooter.checkType;
365 
366             // Seek to the beginning of the Index.
367             in.seek(pos - streamFooter.backwardSize);
368 
369             // Decode the Index field.
370             IndexDecoder index;
371             try {
372                 index = new IndexDecoder(in, streamFooter, streamPadding,
373                                          memoryLimit);
374             } catch (MemoryLimitException e) {
375                 // IndexDecoder doesn't know how much memory we had
376                 // already needed so we need to recreate the exception.
377                 assert memoryLimit >= 0;
378                 throw new MemoryLimitException(
379                         e.getMemoryNeeded() + indexMemoryUsage,
380                         memoryLimit + indexMemoryUsage);
381             }
382 
383             // Update the memory usage and limit counters.
384             indexMemoryUsage += index.getMemoryUsage();
385             if (memoryLimit >= 0) {
386                 memoryLimit -= index.getMemoryUsage();
387                 assert memoryLimit >= 0;
388             }
389 
390             // Remember the uncompressed size of the largest Block.
391             if (largestBlockSize < index.getLargestBlockSize())
392                 largestBlockSize = index.getLargestBlockSize();
393 
394             // Calculate the offset to the beginning of this XZ Stream and
395             // check that it looks sane.
396             long off = index.getStreamSize() - DecoderUtil.STREAM_HEADER_SIZE;
397             if (pos < off)
398                 throw new CorruptedInputException("XZ Index indicates "
399                         + "too big compressed size for the XZ Stream");
400 
401             // Seek to the beginning of this Stream.
402             pos -= off;
403             in.seek(pos);
404 
405             // Decode the Stream Header.
406             inData.readFully(buf);
407             StreamFlags streamHeader = DecoderUtil.decodeStreamHeader(buf);
408 
409             // Verify that the Stream Header matches the Stream Footer.
410             if (!DecoderUtil.areStreamFlagsEqual(streamHeader, streamFooter))
411                 throw new CorruptedInputException(
412                         "XZ Stream Footer does not match Stream Header");
413 
414             // Update the total uncompressed size of the file and check that
415             // it doesn't overflow.
416             uncompressedSize += index.getUncompressedSize();
417             if (uncompressedSize < 0)
418                 throw new UnsupportedOptionsException("XZ file is too big");
419 
420             // Update the Block count and check that it fits into an int.
421             blockCount += index.getRecordCount();
422             if (blockCount < 0)
423                 throw new UnsupportedOptionsException(
424                         "XZ file has over " + Integer.MAX_VALUE + " Blocks");
425 
426             // Add this Stream to the list of Streams.
427             streams.add(index);
428 
429             // Reset to be ready to parse the next Stream.
430             streamPadding = 0;
431         }
432 
433         assert pos == 0;
434 
435         // Save it now that indexMemoryUsage has been substracted from it.
436         this.memoryLimit = memoryLimit;
437 
438         // Store the relative offsets of the Streams. This way we don't
439         // need to recalculate them in this class when seeking; the
440         // IndexDecoder instances will handle them.
441         IndexDecoder prev = (IndexDecoder)streams.get(streams.size() - 1);
442         for (int i = streams.size() - 2; i >= 0; --i) {
443             IndexDecoder cur = (IndexDecoder)streams.get(i);
444             cur.setOffsets(prev);
445             prev = cur;
446         }
447 
448         // Initialize curBlockInfo to point to the first Stream.
449         // The blockNumber will be left to -1 so that .hasNext()
450         // and .setNext() work to get the first Block when starting
451         // to decompress from the beginning of the file.
452         IndexDecoder first = (IndexDecoder)streams.get(streams.size() - 1);
453         curBlockInfo = new BlockInfo(first);
454 
455         // queriedBlockInfo needs to be allocated too. The Stream used for
456         // initialization doesn't matter though.
457         queriedBlockInfo = new BlockInfo(first);
458     }
459 
460     /**
461      * Gets the types of integrity checks used in the .xz file.
462      * Multiple checks are possible only if there are multiple
463      * concatenated XZ Streams.
464      * <p>
465      * The returned value has a bit set for every check type that is present.
466      * For example, if CRC64 and SHA-256 were used, the return value is
467      * <code>(1&nbsp;&lt;&lt;&nbsp;XZ.CHECK_CRC64)
468      * | (1&nbsp;&lt;&lt;&nbsp;XZ.CHECK_SHA256)</code>.
469      */
getCheckTypes()470     public int getCheckTypes() {
471         return checkTypes;
472     }
473 
474     /**
475      * Gets the amount of memory in kibibytes (KiB) used by
476      * the data structures needed to locate the XZ Blocks.
477      * This is usually useless information but since it is calculated
478      * for memory usage limit anyway, it is nice to make it available too.
479      */
getIndexMemoryUsage()480     public int getIndexMemoryUsage() {
481         return indexMemoryUsage;
482     }
483 
484     /**
485      * Gets the uncompressed size of the largest XZ Block in bytes.
486      * This can be useful if you want to check that the file doesn't
487      * have huge XZ Blocks which could make seeking to arbitrary offsets
488      * very slow. Note that huge Blocks don't automatically mean that
489      * seeking would be slow, for example, seeking to the beginning of
490      * any Block is always fast.
491      */
getLargestBlockSize()492     public long getLargestBlockSize() {
493         return largestBlockSize;
494     }
495 
496     /**
497      * Gets the number of Streams in the .xz file.
498      *
499      * @since 1.3
500      */
getStreamCount()501     public int getStreamCount() {
502         return streams.size();
503     }
504 
505     /**
506      * Gets the number of Blocks in the .xz file.
507      *
508      * @since 1.3
509      */
getBlockCount()510     public int getBlockCount() {
511         return blockCount;
512     }
513 
514     /**
515      * Gets the uncompressed start position of the given Block.
516      *
517      * @throws  IndexOutOfBoundsException if
518      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
519      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
520      *
521      * @since 1.3
522      */
getBlockPos(int blockNumber)523     public long getBlockPos(int blockNumber) {
524         locateBlockByNumber(queriedBlockInfo, blockNumber);
525         return queriedBlockInfo.uncompressedOffset;
526     }
527 
528     /**
529      * Gets the uncompressed size of the given Block.
530      *
531      * @throws  IndexOutOfBoundsException if
532      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
533      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
534      *
535      * @since 1.3
536      */
getBlockSize(int blockNumber)537     public long getBlockSize(int blockNumber) {
538         locateBlockByNumber(queriedBlockInfo, blockNumber);
539         return queriedBlockInfo.uncompressedSize;
540     }
541 
542     /**
543      * Gets the position where the given compressed Block starts in
544      * the underlying .xz file.
545      * This information is rarely useful to the users of this class.
546      *
547      * @throws  IndexOutOfBoundsException if
548      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
549      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
550      *
551      * @since 1.3
552      */
getBlockCompPos(int blockNumber)553     public long getBlockCompPos(int blockNumber) {
554         locateBlockByNumber(queriedBlockInfo, blockNumber);
555         return queriedBlockInfo.compressedOffset;
556     }
557 
558     /**
559      * Gets the compressed size of the given Block.
560      * This together with the uncompressed size can be used to calculate
561      * the compression ratio of the specific Block.
562      *
563      * @throws  IndexOutOfBoundsException if
564      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
565      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
566      *
567      * @since 1.3
568      */
getBlockCompSize(int blockNumber)569     public long getBlockCompSize(int blockNumber) {
570         locateBlockByNumber(queriedBlockInfo, blockNumber);
571         return (queriedBlockInfo.unpaddedSize + 3) & ~3;
572     }
573 
574     /**
575      * Gets integrity check type (Check ID) of the given Block.
576      *
577      * @throws  IndexOutOfBoundsException if
578      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
579      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
580      *
581      * @see #getCheckTypes()
582      *
583      * @since 1.3
584      */
getBlockCheckType(int blockNumber)585     public int getBlockCheckType(int blockNumber) {
586         locateBlockByNumber(queriedBlockInfo, blockNumber);
587         return queriedBlockInfo.getCheckType();
588     }
589 
590     /**
591      * Gets the number of the Block that contains the byte at the given
592      * uncompressed position.
593      *
594      * @throws  IndexOutOfBoundsException if
595      *          <code>pos&nbsp;&lt;&nbsp;0</code> or
596      *          <code>pos&nbsp;&gt;=&nbsp;length()</code>.
597      *
598      * @since 1.3
599      */
getBlockNumber(long pos)600     public int getBlockNumber(long pos) {
601         locateBlockByPos(queriedBlockInfo, pos);
602         return queriedBlockInfo.blockNumber;
603     }
604 
605     /**
606      * Decompresses the next byte from this input stream.
607      *
608      * @return      the next decompressed byte, or <code>-1</code>
609      *              to indicate the end of the compressed stream
610      *
611      * @throws      CorruptedInputException
612      * @throws      UnsupportedOptionsException
613      * @throws      MemoryLimitException
614      *
615      * @throws      XZIOException if the stream has been closed
616      *
617      * @throws      IOException may be thrown by <code>in</code>
618      */
read()619     public int read() throws IOException {
620         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
621     }
622 
623     /**
624      * Decompresses into an array of bytes.
625      * <p>
626      * If <code>len</code> is zero, no bytes are read and <code>0</code>
627      * is returned. Otherwise this will try to decompress <code>len</code>
628      * bytes of uncompressed data. Less than <code>len</code> bytes may
629      * be read only in the following situations:
630      * <ul>
631      *   <li>The end of the compressed data was reached successfully.</li>
632      *   <li>An error is detected after at least one but less than
633      *       <code>len</code> bytes have already been successfully
634      *       decompressed. The next call with non-zero <code>len</code>
635      *       will immediately throw the pending exception.</li>
636      *   <li>An exception is thrown.</li>
637      * </ul>
638      *
639      * @param       buf         target buffer for uncompressed data
640      * @param       off         start offset in <code>buf</code>
641      * @param       len         maximum number of uncompressed bytes to read
642      *
643      * @return      number of bytes read, or <code>-1</code> to indicate
644      *              the end of the compressed stream
645      *
646      * @throws      CorruptedInputException
647      * @throws      UnsupportedOptionsException
648      * @throws      MemoryLimitException
649      *
650      * @throws      XZIOException if the stream has been closed
651      *
652      * @throws      IOException may be thrown by <code>in</code>
653      */
read(byte[] buf, int off, int len)654     public int read(byte[] buf, int off, int len) throws IOException {
655         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
656             throw new IndexOutOfBoundsException();
657 
658         if (len == 0)
659             return 0;
660 
661         if (in == null)
662             throw new XZIOException("Stream closed");
663 
664         if (exception != null)
665             throw exception;
666 
667         int size = 0;
668 
669         try {
670             if (seekNeeded)
671                 seek();
672 
673             if (endReached)
674                 return -1;
675 
676             while (len > 0) {
677                 if (blockDecoder == null) {
678                     seek();
679                     if (endReached)
680                         break;
681                 }
682 
683                 int ret = blockDecoder.read(buf, off, len);
684 
685                 if (ret > 0) {
686                     curPos += ret;
687                     size += ret;
688                     off += ret;
689                     len -= ret;
690                 } else if (ret == -1) {
691                     blockDecoder = null;
692                 }
693             }
694         } catch (IOException e) {
695             // We know that the file isn't simply truncated because we could
696             // parse the Indexes in the constructor. So convert EOFException
697             // to CorruptedInputException.
698             if (e instanceof EOFException)
699                 e = new CorruptedInputException();
700 
701             exception = e;
702             if (size == 0)
703                 throw e;
704         }
705 
706         return size;
707     }
708 
709     /**
710      * Returns the number of uncompressed bytes that can be read
711      * without blocking. The value is returned with an assumption
712      * that the compressed input data will be valid. If the compressed
713      * data is corrupt, <code>CorruptedInputException</code> may get
714      * thrown before the number of bytes claimed to be available have
715      * been read from this input stream.
716      *
717      * @return      the number of uncompressed bytes that can be read
718      *              without blocking
719      */
available()720     public int available() throws IOException {
721         if (in == null)
722             throw new XZIOException("Stream closed");
723 
724         if (exception != null)
725             throw exception;
726 
727         if (endReached || seekNeeded || blockDecoder == null)
728             return 0;
729 
730         return blockDecoder.available();
731     }
732 
733     /**
734      * Closes the stream and calls <code>in.close()</code>.
735      * If the stream was already closed, this does nothing.
736      *
737      * @throws  IOException if thrown by <code>in.close()</code>
738      */
close()739     public void close() throws IOException {
740         if (in != null) {
741             try {
742                 in.close();
743             } finally {
744                 in = null;
745             }
746         }
747     }
748 
749     /**
750      * Gets the uncompressed size of this input stream. If there are multiple
751      * XZ Streams, the total uncompressed size of all XZ Streams is returned.
752      */
length()753     public long length() {
754         return uncompressedSize;
755     }
756 
757     /**
758      * Gets the current uncompressed position in this input stream.
759      *
760      * @throws      XZIOException if the stream has been closed
761      */
position()762     public long position() throws IOException {
763         if (in == null)
764             throw new XZIOException("Stream closed");
765 
766         return seekNeeded ? seekPos : curPos;
767     }
768 
769     /**
770      * Seeks to the specified absolute uncompressed position in the stream.
771      * This only stores the new position, so this function itself is always
772      * very fast. The actual seek is done when <code>read</code> is called
773      * to read at least one byte.
774      * <p>
775      * Seeking past the end of the stream is possible. In that case
776      * <code>read</code> will return <code>-1</code> to indicate
777      * the end of the stream.
778      *
779      * @param       pos         new uncompressed read position
780      *
781      * @throws      XZIOException
782      *                          if <code>pos</code> is negative, or
783      *                          if stream has been closed
784      */
seek(long pos)785     public void seek(long pos) throws IOException {
786         if (in == null)
787             throw new XZIOException("Stream closed");
788 
789         if (pos < 0)
790             throw new XZIOException("Negative seek position: " + pos);
791 
792         seekPos = pos;
793         seekNeeded = true;
794     }
795 
796     /**
797      * Seeks to the beginning of the given XZ Block.
798      *
799      * @throws      XZIOException
800      *              if <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
801      *              <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>,
802      *              or if stream has been closed
803      *
804      * @since 1.3
805      */
seekToBlock(int blockNumber)806     public void seekToBlock(int blockNumber) throws IOException {
807         if (in == null)
808             throw new XZIOException("Stream closed");
809 
810         if (blockNumber < 0 || blockNumber >= blockCount)
811             throw new XZIOException("Invalid XZ Block number: " + blockNumber);
812 
813         // This is a bit silly implementation. Here we locate the uncompressed
814         // offset of the specified Block, then when doing the actual seek in
815         // seek(), we need to find the Block number based on seekPos.
816         seekPos = getBlockPos(blockNumber);
817         seekNeeded = true;
818     }
819 
820     /**
821      * Does the actual seeking. This is also called when <code>read</code>
822      * needs a new Block to decode.
823      */
seek()824     private void seek() throws IOException {
825         // If seek(long) wasn't called, we simply need to get the next Block
826         // from the same Stream. If there are no more Blocks in this Stream,
827         // then we behave as if seek(long) had been called.
828         if (!seekNeeded) {
829             if (curBlockInfo.hasNext()) {
830                 curBlockInfo.setNext();
831                 initBlockDecoder();
832                 return;
833             }
834 
835             seekPos = curPos;
836         }
837 
838         seekNeeded = false;
839 
840         // Check if we are seeking to or past the end of the file.
841         if (seekPos >= uncompressedSize) {
842             curPos = seekPos;
843             blockDecoder = null;
844             endReached = true;
845             return;
846         }
847 
848         endReached = false;
849 
850         // Locate the Block that contains the uncompressed target position.
851         locateBlockByPos(curBlockInfo, seekPos);
852 
853         // Seek in the underlying stream and create a new Block decoder
854         // only if really needed. We can skip it if the current position
855         // is already in the correct Block and the target position hasn't
856         // been decompressed yet.
857         //
858         // NOTE: If curPos points to the beginning of this Block, it's
859         // because it was left there after decompressing an earlier Block.
860         // In that case, decoding of the current Block hasn't been started
861         // yet. (Decoding of a Block won't be started until at least one
862         // byte will also be read from it.)
863         if (!(curPos > curBlockInfo.uncompressedOffset && curPos <= seekPos)) {
864             // Seek to the beginning of the Block.
865             in.seek(curBlockInfo.compressedOffset);
866 
867             // Since it is possible that this Block is from a different
868             // Stream than the previous Block, initialize a new Check.
869             check = Check.getInstance(curBlockInfo.getCheckType());
870 
871             // Create a new Block decoder.
872             initBlockDecoder();
873             curPos = curBlockInfo.uncompressedOffset;
874         }
875 
876         // If the target wasn't at a Block boundary, decompress and throw
877         // away data to reach the target position.
878         if (seekPos > curPos) {
879             // NOTE: The "if" below is there just in case. In this situation,
880             // blockDecoder.skip will always skip the requested amount
881             // or throw an exception.
882             long skipAmount = seekPos - curPos;
883             if (blockDecoder.skip(skipAmount) != skipAmount)
884                 throw new CorruptedInputException();
885 
886             curPos = seekPos;
887         }
888     }
889 
890     /**
891      * Locates the Block that contains the given uncompressed position.
892      */
locateBlockByPos(BlockInfo info, long pos)893     private void locateBlockByPos(BlockInfo info, long pos) {
894         if (pos < 0 || pos >= uncompressedSize)
895             throw new IndexOutOfBoundsException(
896                     "Invalid uncompressed position: " + pos);
897 
898         // Locate the Stream that contains the target position.
899         IndexDecoder index;
900         for (int i = 0; ; ++i) {
901             index = (IndexDecoder)streams.get(i);
902             if (index.hasUncompressedOffset(pos))
903                 break;
904         }
905 
906         // Locate the Block from the Stream that contains the target position.
907         index.locateBlock(info, pos);
908 
909         assert (info.compressedOffset & 3) == 0;
910         assert info.uncompressedSize > 0;
911         assert pos >= info.uncompressedOffset;
912         assert pos < info.uncompressedOffset + info.uncompressedSize;
913     }
914 
915     /**
916      * Locates the given Block and stores information about it
917      * to <code>info</code>.
918      */
919     private void locateBlockByNumber(BlockInfo info, int blockNumber) {
920         // Validate.
921         if (blockNumber < 0 || blockNumber >= blockCount)
922             throw new IndexOutOfBoundsException(
923                     "Invalid XZ Block number: " + blockNumber);
924 
925         // Skip the search if info already points to the correct Block.
926         if (info.blockNumber == blockNumber)
927             return;
928 
929         // Search the Stream that contains the given Block and then
930         // search the Block from that Stream.
931         for (int i = 0; ; ++i) {
932             IndexDecoder index = (IndexDecoder)streams.get(i);
933             if (index.hasRecord(blockNumber)) {
934                 index.setBlockInfo(info, blockNumber);
935                 return;
936             }
937         }
938     }
939 
940     /**
941      * Initializes a new BlockInputStream. This is a helper function for
942      * <code>seek()</code>.
943      */
944     private void initBlockDecoder() throws IOException {
945         try {
946             // Set it to null first so that GC can collect it if memory
947             // runs tight when initializing a new BlockInputStream.
948             blockDecoder = null;
949             blockDecoder = new BlockInputStream(
950                     in, check, verifyCheck, memoryLimit,
951                     curBlockInfo.unpaddedSize, curBlockInfo.uncompressedSize);
952         } catch (MemoryLimitException e) {
953             // BlockInputStream doesn't know how much memory we had
954             // already needed so we need to recreate the exception.
955             assert memoryLimit >= 0;
956             throw new MemoryLimitException(
957                     e.getMemoryNeeded() + indexMemoryUsage,
958                     memoryLimit + indexMemoryUsage);
959         } catch (IndexIndicatorException e) {
960             // It cannot be Index so the file must be corrupt.
961             throw new CorruptedInputException();
962         }
963     }
964 }
965