xz/lz/LZEncoder.java

/*
 * LZEncoder
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <http://7-zip.org/>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

package org.tukaani.xz.lz;

import java.io.OutputStream;
import java.io.IOException;
import org.tukaani.xz.ArrayCache;

public abstract class LZEncoder {
    public static final int MF_HC4 = 0x04;
    public static final int MF_BT4 = 0x14;

    /**
     * Number of bytes to keep available before the current byte
     * when moving the LZ window.
     */
    private final int keepSizeBefore;

    /**
     * Number of bytes that must be available, the current byte included,
     * to make hasEnoughData return true. Flushing and finishing are
     * naturally exceptions to this since there cannot be any data after
     * the end of the uncompressed input.
     */
    private final int keepSizeAfter;

    final int matchLenMax;
    final int niceLen;

    final byte[] buf;
    final int bufSize; // To avoid buf.length with an array-cached buf.

    int readPos = -1;
    private int readLimit = -1;
    private boolean finishing = false;
    private int writePos = 0;
    private int pendingSize = 0;

    static void normalize(int[] positions, int positionsCount,
                          int normalizationOffset) {
        for (int i = 0; i < positionsCount; ++i) {
            if (positions[i] <= normalizationOffset)
                positions[i] = 0;
            else
                positions[i] -= normalizationOffset;
        }
    }

    /**
     * Gets the size of the LZ window buffer that needs to be allocated.
     */
    private static int getBufSize(
            int dictSize, int extraSizeBefore, int extraSizeAfter,
            int matchLenMax) {
        int keepSizeBefore = extraSizeBefore + dictSize;
        int keepSizeAfter = extraSizeAfter + matchLenMax;
        int reserveSize = Math.min(dictSize / 2 + (256 << 10), 512 << 20);
        return keepSizeBefore + keepSizeAfter + reserveSize;
    }

    /**
     * Gets approximate memory usage of the LZEncoder base structure and
     * the match finder as kibibytes.
     */
    public static int getMemoryUsage(
            int dictSize, int extraSizeBefore, int extraSizeAfter,
            int matchLenMax, int mf) {
        // Buffer size + a little extra
        int m = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
                           matchLenMax) / 1024 + 10;

        switch (mf) {
            case MF_HC4:
                m += HC4.getMemoryUsage(dictSize);
                break;

            case MF_BT4:
                m += BT4.getMemoryUsage(dictSize);
                break;

            default:
                throw new IllegalArgumentException();
        }

        return m;
    }

    /**
     * Creates a new LZEncoder.
     * <p>
     * @param       dictSize    dictionary size
     *
     * @param       extraSizeBefore
     *                          number of bytes to keep available in the
     *                          history in addition to dictSize
     *
     * @param       extraSizeAfter
     *                          number of bytes that must be available
     *                          after current position + matchLenMax
     *
     * @param       niceLen     if a match of at least <code>niceLen</code>
     *                          bytes is found, be happy with it and don't
     *                          stop looking for longer matches
     *
     * @param       matchLenMax don't test for matches longer than
     *                          <code>matchLenMax</code> bytes
     *
     * @param       mf          match finder ID
     *
     * @param       depthLimit  match finder search depth limit
     */
    public static LZEncoder getInstance(
            int dictSize, int extraSizeBefore, int extraSizeAfter,
            int niceLen, int matchLenMax, int mf, int depthLimit,
            ArrayCache arrayCache) {
        switch (mf) {
            case MF_HC4:
                return new HC4(dictSize, extraSizeBefore, extraSizeAfter,
                               niceLen, matchLenMax, depthLimit, arrayCache);

            case MF_BT4:
                return new BT4(dictSize, extraSizeBefore, extraSizeAfter,
                               niceLen, matchLenMax, depthLimit, arrayCache);
        }

        throw new IllegalArgumentException();
    }

    /**
     * Creates a new LZEncoder. See <code>getInstance</code>.
     */
    LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter,
              int niceLen, int matchLenMax, ArrayCache arrayCache) {
        bufSize = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
                             matchLenMax);
        buf = arrayCache.getByteArray(bufSize, false);

        keepSizeBefore = extraSizeBefore + dictSize;
        keepSizeAfter = extraSizeAfter + matchLenMax;

        this.matchLenMax = matchLenMax;
        this.niceLen = niceLen;
    }

    public void putArraysToCache(ArrayCache arrayCache) {
        arrayCache.putArray(buf);
    }

    /**
     * Sets a preset dictionary. If a preset dictionary is wanted, this
     * function must be called immediately after creating the LZEncoder
     * before any data has been encoded.
     */
    public void setPresetDict(int dictSize, byte[] presetDict) {
        assert !isStarted();
        assert writePos == 0;

        if (presetDict != null) {
            // If the preset dictionary buffer is bigger than the dictionary
            // size, copy only the tail of the preset dictionary.
            int copySize = Math.min(presetDict.length, dictSize);
            int offset = presetDict.length - copySize;
            System.arraycopy(presetDict, offset, buf, 0, copySize);
            writePos += copySize;
            skip(copySize);
        }
    }

    /**
     * Moves data from the end of the buffer to the beginning, discarding
     * old data and making space for new input.
     */
    private void moveWindow() {
        // Align the move to a multiple of 16 bytes. LZMA2 needs this
        // because it uses the lowest bits from readPos to get the
        // alignment of the uncompressed data.
        int moveOffset = (readPos + 1 - keepSizeBefore) & ~15;
        int moveSize = writePos - moveOffset;
        System.arraycopy(buf, moveOffset, buf, 0, moveSize);

        readPos -= moveOffset;
        readLimit -= moveOffset;
        writePos -= moveOffset;
    }

    /**
     * Copies new data into the LZEncoder's buffer.
     */
    public int fillWindow(byte[] in, int off, int len) {
        assert !finishing;

        // Move the sliding window if needed.
        if (readPos >= bufSize - keepSizeAfter)
            moveWindow();

        // Try to fill the dictionary buffer. If it becomes full,
        // some of the input bytes may be left unused.
        if (len > bufSize - writePos)
            len = bufSize - writePos;

        System.arraycopy(in, off, buf, writePos, len);
        writePos += len;

        // Set the new readLimit but only if there's enough data to allow
        // encoding of at least one more byte.
        if (writePos >= keepSizeAfter)
            readLimit = writePos - keepSizeAfter;

        processPendingBytes();

        // Tell the caller how much input we actually copied into
        // the dictionary.
        return len;
    }

    /**
     * Process pending bytes remaining from preset dictionary initialization
     * or encoder flush operation.
     */
    private void processPendingBytes() {
        // After flushing or setting a preset dictionary there will be
        // pending data that hasn't been ran through the match finder yet.
        // Run it through the match finder now if there is enough new data
        // available (readPos < readLimit) that the encoder may encode at
        // least one more input byte. This way we don't waste any time
        // looping in the match finder (and marking the same bytes as
        // pending again) if the application provides very little new data
        // per write call.
        if (pendingSize > 0 && readPos < readLimit) {
            readPos -= pendingSize;
            int oldPendingSize = pendingSize;
            pendingSize = 0;
            skip(oldPendingSize);
            assert pendingSize < oldPendingSize;
        }
    }

    /**
     * Returns true if at least one byte has already been run through
     * the match finder.
     */
    public boolean isStarted() {
        return readPos != -1;
    }

    /**
     * Marks that all the input needs to be made available in
     * the encoded output.
     */
    public void setFlushing() {
        readLimit = writePos - 1;
        processPendingBytes();
    }

    /**
     * Marks that there is no more input remaining. The read position
     * can be advanced until the end of the data.
     */
    public void setFinishing() {
        readLimit = writePos - 1;
        finishing = true;
        processPendingBytes();
    }

    /**
     * Tests if there is enough input available to let the caller encode
     * at least one more byte.
     */
    public boolean hasEnoughData(int alreadyReadLen) {
        return readPos - alreadyReadLen < readLimit;
    }

    public void copyUncompressed(OutputStream out, int backward, int len)
            throws IOException {
        out.write(buf, readPos + 1 - backward, len);
    }

    /**
     * Get the number of bytes available, including the current byte.
     * <p>
     * Note that the result is undefined if <code>getMatches</code> or
     * <code>skip</code> hasn't been called yet and no preset dictionary
     * is being used.
     */
    public int getAvail() {
        assert isStarted();
        return writePos - readPos;
    }

    /**
     * Gets the lowest four bits of the absolute offset of the current byte.
     * Bits other than the lowest four are undefined.
     */
    public int getPos() {
        return readPos;
    }

    /**
     * Gets the byte from the given backward offset.
     * <p>
     * The current byte is at <code>0</code>, the previous byte
     * at <code>1</code> etc. To get a byte at zero-based distance,
     * use <code>getByte(dist + 1)<code>.
     * <p>
     * This function is equivalent to <code>getByte(0, backward)</code>.
     */
    public int getByte(int backward) {
        return buf[readPos - backward] & 0xFF;
    }

    /**
     * Gets the byte from the given forward minus backward offset.
     * The forward offset is added to the current position. This lets
     * one read bytes ahead of the current byte.
     */
    public int getByte(int forward, int backward) {
        return buf[readPos + forward - backward] & 0xFF;
    }

    /**
     * Get the length of a match at the given distance.
     *
     * @param       dist        zero-based distance of the match to test
     * @param       lenLimit    don't test for a match longer than this
     *
     * @return      length of the match; it is in the range [0, lenLimit]
     */
    public int getMatchLen(int dist, int lenLimit) {
        int backPos = readPos - dist - 1;
        int len = 0;

        while (len < lenLimit && buf[readPos + len] == buf[backPos + len])
            ++len;

        return len;
    }

    /**
     * Get the length of a match at the given distance and forward offset.
     *
     * @param       forward     forward offset
     * @param       dist        zero-based distance of the match to test
     * @param       lenLimit    don't test for a match longer than this
     *
     * @return      length of the match; it is in the range [0, lenLimit]
     */
    public int getMatchLen(int forward, int dist, int lenLimit) {
        int curPos = readPos + forward;
        int backPos = curPos - dist - 1;
        int len = 0;

        while (len < lenLimit && buf[curPos + len] == buf[backPos + len])
            ++len;

        return len;
    }

    /**
     * Verifies that the matches returned by the match finder are valid.
     * This is meant to be used in an assert statement. This is totally
     * useless for actual encoding since match finder's results should
     * naturally always be valid if it isn't broken.
     *
     * @param       matches     return value from <code>getMatches</code>
     *
     * @return      true if matches are valid, false if match finder is broken
     */
    public boolean verifyMatches(Matches matches) {
        int lenLimit = Math.min(getAvail(), matchLenMax);

        for (int i = 0; i < matches.count; ++i)
            if (getMatchLen(matches.dist[i], lenLimit) != matches.len[i])
                return false;

        return true;
    }

    /**
     * Moves to the next byte, checks if there is enough input available,
     * and returns the amount of input available.
     *
     * @param       requiredForFlushing
     *                          minimum number of available bytes when
     *                          flushing; encoding may be continued with
     *                          new input after flushing
     * @param       requiredForFinishing
     *                          minimum number of available bytes when
     *                          finishing; encoding must not be continued
     *                          after finishing or the match finder state
     *                          may be corrupt
     *
     * @return      the number of bytes available or zero if there
     *              is not enough input available
     */
    int movePos(int requiredForFlushing, int requiredForFinishing) {
        assert requiredForFlushing >= requiredForFinishing;

        ++readPos;
        int avail = writePos - readPos;

        if (avail < requiredForFlushing) {
            if (avail < requiredForFinishing || !finishing) {
                ++pendingSize;
                avail = 0;
            }
        }

        return avail;
    }

    /**
     * Runs match finder for the next byte and returns the matches found.
     */
    public abstract Matches getMatches();

    /**
     * Skips the given number of bytes in the match finder.
     */
    public abstract void skip(int len);
}