tukaani/xz/LZMAInputStream.java

/*
 * LZMAInputStream
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <http://7-zip.org/>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

package org.tukaani.xz;

import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import org.tukaani.xz.lz.LZDecoder;
import org.tukaani.xz.rangecoder.RangeDecoderFromStream;
import org.tukaani.xz.lzma.LZMADecoder;

/**
 * Decompresses legacy .lzma files and raw LZMA streams (no .lzma header).
 * <p>
 * <b>IMPORTANT:</b> In contrast to other classes in this package, this class
 * reads data from its input stream one byte at a time. If the input stream
 * is for example {@link java.io.FileInputStream}, wrapping it into
 * {@link java.io.BufferedInputStream} tends to improve performance a lot.
 * This is not automatically done by this class because there may be use
 * cases where it is desired that this class won't read any bytes past
 * the end of the LZMA stream.
 * <p>
 * Even when using <code>BufferedInputStream</code>, the performance tends
 * to be worse (maybe 10-20&nbsp;% slower) than with {@link LZMA2InputStream}
 * or {@link XZInputStream} (when the .xz file contains LZMA2-compressed data).
 *
 * @since 1.4
 */
public class LZMAInputStream extends InputStream {
    /**
     * Largest dictionary size supported by this implementation.
     * <p>
     * LZMA allows dictionaries up to one byte less than 4 GiB. This
     * implementation supports only 16 bytes less than 2 GiB. This
     * limitation is due to Java using signed 32-bit integers for array
     * indexing. The limitation shouldn't matter much in practice since so
     * huge dictionaries are not normally used.
     */
    public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;

    private InputStream in;
    private LZDecoder lz;
    private RangeDecoderFromStream rc;
    private LZMADecoder lzma;

    private boolean endReached = false;

    private final byte[] tempBuf = new byte[1];

    /**
     * Number of uncompressed bytes left to be decompressed, or -1 if
     * the end marker is used.
     */
    private long remainingSize;

    private IOException exception = null;

    /**
     * Gets approximate decompressor memory requirements as kibibytes for
     * the given dictionary size and LZMA properties byte (lc, lp, and pb).
     *
     * @param       dictSize    LZMA dictionary size as bytes, should be
     *                          in the range [<code>0</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @param       propsByte   LZMA properties byte that encodes the values
     *                          of lc, lp, and pb
     *
     * @return      approximate memory requirements as kibibytes (KiB)
     *
     * @throws      UnsupportedOptionsException
     *                          if <code>dictSize</code> is outside
     *                          the range [<code>0</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @throws      CorruptedInputException
     *                          if <code>propsByte</code> is invalid
     */
    public static int getMemoryUsage(int dictSize, byte propsByte)
            throws UnsupportedOptionsException, CorruptedInputException {
        if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
            throw new UnsupportedOptionsException(
                    "LZMA dictionary is too big for this implementation");

        int props = propsByte & 0xFF;
        if (props > (4 * 5 + 4) * 9 + 8)
            throw new CorruptedInputException("Invalid LZMA properties byte");

        props %= 9 * 5;
        int lp = props / 9;
        int lc = props - lp * 9;

        return getMemoryUsage(dictSize, lc, lp);
    }

    /**
     * Gets approximate decompressor memory requirements as kibibytes for
     * the given dictionary size, lc, and lp. Note that pb isn't needed.
     *
     * @param       dictSize    LZMA dictionary size as bytes, must be
     *                          in the range [<code>0</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @param       lc          number of literal context bits, must be
     *                          in the range [0, 8]
     *
     * @param       lp          number of literal position bits, must be
     *                          in the range [0, 4]
     *
     * @return      approximate memory requirements as kibibytes (KiB)
     */
    public static int getMemoryUsage(int dictSize, int lc, int lp) {
        if (lc < 0 || lc > 8 || lp < 0 || lp > 4)
            throw new IllegalArgumentException("Invalid lc or lp");

        // Probability variables have the type "short". There are
        // 0x300 (768) probability variables in each literal subcoder.
        // The number of literal subcoders is 2^(lc + lp).
        //
        // Roughly 10 KiB for the base state + LZ decoder's dictionary buffer
        // + sizeof(short) * number probability variables per literal subcoder
        //   * number of literal subcoders
        return 10 + getDictSize(dictSize) / 1024
               + ((2 * 0x300) << (lc + lp)) / 1024;
    }

    private static int getDictSize(int dictSize) {
        if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
            throw new IllegalArgumentException(
                    "LZMA dictionary is too big for this implementation");

        // For performance reasons, use a 4 KiB dictionary if something
        // smaller was requested. It's a rare situation and the performance
        // difference isn't huge, and it starts to matter mostly when the
        // dictionary is just a few bytes. But we need to handle the special
        // case of dictSize == 0 anyway, which is an allowed value but in
        // practice means one-byte dictionary.
        //
        // Note that using a dictionary bigger than specified in the headers
        // can hide errors if there is a reference to data beyond the original
        // dictionary size but is still within 4 KiB.
        if (dictSize < 4096)
            dictSize = 4096;

        // Round dictionary size upward to a multiple of 16. This way LZMA
        // can use LZDecoder.getPos() for calculating LZMA's posMask.
        return (dictSize + 15) & ~15;
    }

    /**
     * Creates a new .lzma file format decompressor without
     * a memory usage limit.
     *
     * @param       in          input stream from which .lzma data is read;
     *                          it might be a good idea to wrap it in
     *                          <code>BufferedInputStream</code>, see the
     *                          note at the top of this page
     *
     * @throws      CorruptedInputException
     *                          file is corrupt or perhaps not in
     *                          the .lzma format at all
     *
     * @throws      UnsupportedOptionsException
     *                          dictionary size or uncompressed size is too
     *                          big for this implementation
     *
     * @throws      EOFException
     *                          file is truncated or perhaps not in
     *                          the .lzma format at all
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public LZMAInputStream(InputStream in) throws IOException {
        this(in, -1);
    }

    /**
     * Creates a new .lzma file format decompressor with an optional
     * memory usage limit.
     *
     * @param       in          input stream from which .lzma data is read;
     *                          it might be a good idea to wrap it in
     *                          <code>BufferedInputStream</code>, see the
     *                          note at the top of this page
     *
     * @param       memoryLimit memory usage limit in kibibytes (KiB)
     *                          or <code>-1</code> to impose no
     *                          memory usage limit
     *
     * @throws      CorruptedInputException
     *                          file is corrupt or perhaps not in
     *                          the .lzma format at all
     *
     * @throws      UnsupportedOptionsException
     *                          dictionary size or uncompressed size is too
     *                          big for this implementation
     *
     * @throws      MemoryLimitException
     *                          memory usage limit was exceeded
     *
     * @throws      EOFException
     *                          file is truncated or perhaps not in
     *                          the .lzma format at all
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public LZMAInputStream(InputStream in, int memoryLimit)
            throws IOException {
        DataInputStream inData = new DataInputStream(in);

        // Properties byte (lc, lp, and pb)
        byte propsByte = inData.readByte();

        // Dictionary size is an unsigned 32-bit little endian integer.
        int dictSize = 0;
        for (int i = 0; i < 4; ++i)
            dictSize |= inData.readUnsignedByte() << (8 * i);

        // Uncompressed size is an unsigned 64-bit little endian integer.
        // The maximum 64-bit value is a special case (becomes -1 here)
        // which indicates that the end marker is used instead of knowing
        // the uncompressed size beforehand.
        long uncompSize = 0;
        for (int i = 0; i < 8; ++i)
            uncompSize |= (long)inData.readUnsignedByte() << (8 * i);

        // Check the memory usage limit.
        int memoryNeeded = getMemoryUsage(dictSize, propsByte);
        if (memoryLimit != -1 && memoryNeeded > memoryLimit)
            throw new MemoryLimitException(memoryNeeded, memoryLimit);

        initialize(in, uncompSize, propsByte, dictSize, null);
    }

    /**
     * Creates a new input stream that decompresses raw LZMA data (no .lzma
     * header) from <code>in</code>.
     * <p>
     * The caller needs to know if the "end of payload marker (EOPM)" alias
     * "end of stream marker (EOS marker)" alias "end marker" present.
     * If the end marker isn't used, the caller must know the exact
     * uncompressed size of the stream.
     * <p>
     * The caller also needs to provide the LZMA properties byte that encodes
     * the number of literal context bits (lc), literal position bits (lp),
     * and position bits (pb).
     * <p>
     * The dictionary size used when compressing is also needed. Specifying
     * a too small dictionary size will prevent decompressing the stream.
     * Specifying a too big dictionary is waste of memory but decompression
     * will work.
     * <p>
     * There is no need to specify a dictionary bigger than
     * the uncompressed size of the data even if a bigger dictionary
     * was used when compressing. If you know the uncompressed size
     * of the data, this might allow saving some memory.
     *
     * @param       in          input stream from which compressed
     *                          data is read
     *
     * @param       uncompSize  uncompressed size of the LZMA stream or -1
     *                          if the end marker is used in the LZMA stream
     *
     * @param       propsByte   LZMA properties byte that has the encoded
     *                          values for literal context bits (lc), literal
     *                          position bits (lp), and position bits (pb)
     *
     * @param       dictSize    dictionary size as bytes, must be in the range
     *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
     *
     * @throws      CorruptedInputException
     *                          if <code>propsByte</code> is invalid or
     *                          the first input byte is not 0x00
     *
     * @throws      UnsupportedOptionsException
     *                          dictionary size or uncompressed size is too
     *                          big for this implementation
     *
     *
     */
    public LZMAInputStream(InputStream in, long uncompSize, byte propsByte,
                           int dictSize) throws IOException {
        initialize(in, uncompSize, propsByte, dictSize, null);
    }

    /**
     * Creates a new input stream that decompresses raw LZMA data (no .lzma
     * header) from <code>in</code> optionally with a preset dictionary.
     *
     * @param       in          input stream from which LZMA-compressed
     *                          data is read
     *
     * @param       uncompSize  uncompressed size of the LZMA stream or -1
     *                          if the end marker is used in the LZMA stream
     *
     * @param       propsByte   LZMA properties byte that has the encoded
     *                          values for literal context bits (lc), literal
     *                          position bits (lp), and position bits (pb)
     *
     * @param       dictSize    dictionary size as bytes, must be in the range
     *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
     *
     * @param       presetDict  preset dictionary or <code>null</code>
     *                          to use no preset dictionary
     *
     * @throws      CorruptedInputException
     *                          if <code>propsByte</code> is invalid or
     *                          the first input byte is not 0x00
     *
     * @throws      UnsupportedOptionsException
     *                          dictionary size or uncompressed size is too
     *                          big for this implementation
     *
     * @throws      EOFException file is truncated or corrupt
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public LZMAInputStream(InputStream in, long uncompSize, byte propsByte,
                           int dictSize, byte[] presetDict)
            throws IOException {
        initialize(in, uncompSize, propsByte, dictSize, presetDict);
    }

    /**
     * Creates a new input stream that decompresses raw LZMA data (no .lzma
     * header) from <code>in</code> optionally with a preset dictionary.
     *
     * @param       in          input stream from which LZMA-compressed
     *                          data is read
     *
     * @param       uncompSize  uncompressed size of the LZMA stream or -1
     *                          if the end marker is used in the LZMA stream
     *
     * @param       lc          number of literal context bits, must be
     *                          in the range [0, 8]
     *
     * @param       lp          number of literal position bits, must be
     *                          in the range [0, 4]
     *
     * @param       pb          number position bits, must be
     *                          in the range [0, 4]
     *
     * @param       dictSize    dictionary size as bytes, must be in the range
     *                          [<code>0</code>, <code>DICT_SIZE_MAX</code>]
     *
     * @param       presetDict  preset dictionary or <code>null</code>
     *                          to use no preset dictionary
     *
     * @throws      CorruptedInputException
     *                          if the first input byte is not 0x00
     *
     * @throws      EOFException file is truncated or corrupt
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public LZMAInputStream(InputStream in, long uncompSize,
                           int lc, int lp, int pb,
                           int dictSize, byte[] presetDict)
            throws IOException {
        initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict);
    }

    private void initialize(InputStream in, long uncompSize, byte propsByte,
                            int dictSize, byte[] presetDict)
            throws IOException {
        // Validate the uncompressed size since the other "initialize" throws
        // IllegalArgumentException if uncompSize < -1.
        if (uncompSize < -1)
            throw new UnsupportedOptionsException(
                    "Uncompressed size is too big");

        // Decode the properties byte. In contrast to LZMA2, there is no
        // limit of lc + lp <= 4.
        int props = propsByte & 0xFF;
        if (props > (4 * 5 + 4) * 9 + 8)
            throw new CorruptedInputException("Invalid LZMA properties byte");

        int pb = props / (9 * 5);
        props -= pb * 9 * 5;
        int lp = props / 9;
        int lc = props - lp * 9;

        // Validate the dictionary size since the other "initialize" throws
        // IllegalArgumentException if dictSize is not supported.
        if (dictSize < 0 || dictSize > DICT_SIZE_MAX)
            throw new UnsupportedOptionsException(
                    "LZMA dictionary is too big for this implementation");

        initialize(in, uncompSize, lc, lp, pb, dictSize, presetDict);
    }

    private void initialize(InputStream in, long uncompSize,
                            int lc, int lp, int pb,
                            int dictSize, byte[] presetDict)
            throws IOException {
        // getDictSize validates dictSize and gives a message in
        // the exception too, so skip validating dictSize here.
        if (uncompSize < -1 || lc < 0 || lc > 8 || lp < 0 || lp > 4
                || pb < 0 || pb > 4)
            throw new IllegalArgumentException();

        this.in = in;

        // If uncompressed size is known, use it to avoid wasting memory for
        // a uselessly large dictionary buffer.
        dictSize = getDictSize(dictSize);
        if (uncompSize >= 0 && dictSize > uncompSize)
            dictSize = getDictSize((int)uncompSize);

        lz = new LZDecoder(getDictSize(dictSize), presetDict);
        rc = new RangeDecoderFromStream(in);
        lzma = new LZMADecoder(lz, rc, lc, lp, pb);
        remainingSize = uncompSize;
    }

    /**
     * Decompresses the next byte from this input stream.
     * <p>
     * Reading lots of data with <code>read()</code> from this input stream
     * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
     * if you need to read lots of data one byte at a time.
     *
     * @return      the next decompressed byte, or <code>-1</code>
     *              to indicate the end of the compressed stream
     *
     * @throws      CorruptedInputException
     *
     * @throws      XZIOException if the stream has been closed
     *
     * @throws      EOFException
     *                          compressed input is truncated or corrupt
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public int read() throws IOException {
        return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
    }

    /**
     * Decompresses into an array of bytes.
     * <p>
     * If <code>len</code> is zero, no bytes are read and <code>0</code>
     * is returned. Otherwise this will block until <code>len</code>
     * bytes have been decompressed, the end of the LZMA stream is reached,
     * or an exception is thrown.
     *
     * @param       buf         target buffer for uncompressed data
     * @param       off         start offset in <code>buf</code>
     * @param       len         maximum number of uncompressed bytes to read
     *
     * @return      number of bytes read, or <code>-1</code> to indicate
     *              the end of the compressed stream
     *
     * @throws      CorruptedInputException
     *
     * @throws      XZIOException if the stream has been closed
     *
     * @throws      EOFException compressed input is truncated or corrupt
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public int read(byte[] buf, int off, int len) throws IOException {
        if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
            throw new IndexOutOfBoundsException();

        if (len == 0)
            return 0;

        if (in == null)
            throw new XZIOException("Stream closed");

        if (exception != null)
            throw exception;

        if (endReached)
            return -1;

        try {
            int size = 0;

            while (len > 0) {
                // If uncompressed size is known and thus no end marker will
                // be present, set the limit so that the uncompressed size
                // won't be exceeded.
                int copySizeMax = len;
                if (remainingSize >= 0 && remainingSize < len)
                    copySizeMax = (int)remainingSize;

                lz.setLimit(copySizeMax);

                // Decode into the dictionary buffer.
                try {
                    lzma.decode();
                } catch (CorruptedInputException e) {
                    // The end marker is encoded with a LZMA symbol that
                    // indicates maximum match distance. This is larger
                    // than any supported dictionary and thus causes
                    // CorruptedInputException from LZDecoder.repeat.
                    if (remainingSize != -1 || !lzma.endMarkerDetected())
                        throw e;

                    endReached = true;

                    // The exception makes lzma.decode() miss the last range
                    // decoder normalization, so do it here. This might
                    // cause an IOException if it needs to read a byte
                    // from the input stream.
                    rc.normalize();
                }

                // Copy from the dictionary to buf.
                int copiedSize = lz.flush(buf, off);
                off += copiedSize;
                len -= copiedSize;
                size += copiedSize;

                if (remainingSize >= 0) {
                    // Update the number of bytes left to be decompressed.
                    remainingSize -= copiedSize;
                    assert remainingSize >= 0;

                    if (remainingSize == 0)
                        endReached = true;
                }

                if (endReached) {
                    // Checking these helps a lot when catching corrupt
                    // or truncated .lzma files. LZMA Utils doesn't do
                    // the first check and thus it accepts many invalid
                    // files that this implementation and XZ Utils don't.
                    if (!rc.isFinished() || lz.hasPending())
                        throw new CorruptedInputException();

                    return size == 0 ? -1 : size;
                }
            }

            return size;

        } catch (IOException e) {
            exception = e;
            throw e;
        }
    }

    /**
     * Closes the stream and calls <code>in.close()</code>.
     * If the stream was already closed, this does nothing.
     *
     * @throws  IOException if thrown by <code>in.close()</code>
     */
    public void close() throws IOException {
        if (in != null) {
            try {
                in.close();
            } finally {
                in = null;
            }
        }
    }
}