1 /*
2  * LZEncoder
3  *
4  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5  *          Igor Pavlov <http://7-zip.org/>
6  *
7  * This file has been put into the public domain.
8  * You can do whatever you want with this file.
9  */
10 
11 package org.tukaani.xz.lz;
12 
13 import java.io.OutputStream;
14 import java.io.IOException;
15 import org.tukaani.xz.ArrayCache;
16 
17 public abstract class LZEncoder {
18     public static final int MF_HC4 = 0x04;
19     public static final int MF_BT4 = 0x14;
20 
21     /**
22      * Number of bytes to keep available before the current byte
23      * when moving the LZ window.
24      */
25     private final int keepSizeBefore;
26 
27     /**
28      * Number of bytes that must be available, the current byte included,
29      * to make hasEnoughData return true. Flushing and finishing are
30      * naturally exceptions to this since there cannot be any data after
31      * the end of the uncompressed input.
32      */
33     private final int keepSizeAfter;
34 
35     final int matchLenMax;
36     final int niceLen;
37 
38     final byte[] buf;
39     final int bufSize; // To avoid buf.length with an array-cached buf.
40 
41     int readPos = -1;
42     private int readLimit = -1;
43     private boolean finishing = false;
44     private int writePos = 0;
45     private int pendingSize = 0;
46 
normalize(int[] positions, int positionsCount, int normalizationOffset)47     static void normalize(int[] positions, int positionsCount,
48                           int normalizationOffset) {
49         for (int i = 0; i < positionsCount; ++i) {
50             if (positions[i] <= normalizationOffset)
51                 positions[i] = 0;
52             else
53                 positions[i] -= normalizationOffset;
54         }
55     }
56 
57     /**
58      * Gets the size of the LZ window buffer that needs to be allocated.
59      */
getBufSize( int dictSize, int extraSizeBefore, int extraSizeAfter, int matchLenMax)60     private static int getBufSize(
61             int dictSize, int extraSizeBefore, int extraSizeAfter,
62             int matchLenMax) {
63         int keepSizeBefore = extraSizeBefore + dictSize;
64         int keepSizeAfter = extraSizeAfter + matchLenMax;
65         int reserveSize = Math.min(dictSize / 2 + (256 << 10), 512 << 20);
66         return keepSizeBefore + keepSizeAfter + reserveSize;
67     }
68 
69     /**
70      * Gets approximate memory usage of the LZEncoder base structure and
71      * the match finder as kibibytes.
72      */
getMemoryUsage( int dictSize, int extraSizeBefore, int extraSizeAfter, int matchLenMax, int mf)73     public static int getMemoryUsage(
74             int dictSize, int extraSizeBefore, int extraSizeAfter,
75             int matchLenMax, int mf) {
76         // Buffer size + a little extra
77         int m = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
78                            matchLenMax) / 1024 + 10;
79 
80         switch (mf) {
81             case MF_HC4:
82                 m += HC4.getMemoryUsage(dictSize);
83                 break;
84 
85             case MF_BT4:
86                 m += BT4.getMemoryUsage(dictSize);
87                 break;
88 
89             default:
90                 throw new IllegalArgumentException();
91         }
92 
93         return m;
94     }
95 
96     /**
97      * Creates a new LZEncoder.
98      * <p>
99      * @param       dictSize    dictionary size
100      *
101      * @param       extraSizeBefore
102      *                          number of bytes to keep available in the
103      *                          history in addition to dictSize
104      *
105      * @param       extraSizeAfter
106      *                          number of bytes that must be available
107      *                          after current position + matchLenMax
108      *
109      * @param       niceLen     if a match of at least <code>niceLen</code>
110      *                          bytes is found, be happy with it and don't
111      *                          stop looking for longer matches
112      *
113      * @param       matchLenMax don't test for matches longer than
114      *                          <code>matchLenMax</code> bytes
115      *
116      * @param       mf          match finder ID
117      *
118      * @param       depthLimit  match finder search depth limit
119      */
getInstance( int dictSize, int extraSizeBefore, int extraSizeAfter, int niceLen, int matchLenMax, int mf, int depthLimit, ArrayCache arrayCache)120     public static LZEncoder getInstance(
121             int dictSize, int extraSizeBefore, int extraSizeAfter,
122             int niceLen, int matchLenMax, int mf, int depthLimit,
123             ArrayCache arrayCache) {
124         switch (mf) {
125             case MF_HC4:
126                 return new HC4(dictSize, extraSizeBefore, extraSizeAfter,
127                                niceLen, matchLenMax, depthLimit, arrayCache);
128 
129             case MF_BT4:
130                 return new BT4(dictSize, extraSizeBefore, extraSizeAfter,
131                                niceLen, matchLenMax, depthLimit, arrayCache);
132         }
133 
134         throw new IllegalArgumentException();
135     }
136 
137     /**
138      * Creates a new LZEncoder. See <code>getInstance</code>.
139      */
LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter, int niceLen, int matchLenMax, ArrayCache arrayCache)140     LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter,
141               int niceLen, int matchLenMax, ArrayCache arrayCache) {
142         bufSize = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
143                              matchLenMax);
144         buf = arrayCache.getByteArray(bufSize, false);
145 
146         keepSizeBefore = extraSizeBefore + dictSize;
147         keepSizeAfter = extraSizeAfter + matchLenMax;
148 
149         this.matchLenMax = matchLenMax;
150         this.niceLen = niceLen;
151     }
152 
putArraysToCache(ArrayCache arrayCache)153     public void putArraysToCache(ArrayCache arrayCache) {
154         arrayCache.putArray(buf);
155     }
156 
157     /**
158      * Sets a preset dictionary. If a preset dictionary is wanted, this
159      * function must be called immediately after creating the LZEncoder
160      * before any data has been encoded.
161      */
setPresetDict(int dictSize, byte[] presetDict)162     public void setPresetDict(int dictSize, byte[] presetDict) {
163         assert !isStarted();
164         assert writePos == 0;
165 
166         if (presetDict != null) {
167             // If the preset dictionary buffer is bigger than the dictionary
168             // size, copy only the tail of the preset dictionary.
169             int copySize = Math.min(presetDict.length, dictSize);
170             int offset = presetDict.length - copySize;
171             System.arraycopy(presetDict, offset, buf, 0, copySize);
172             writePos += copySize;
173             skip(copySize);
174         }
175     }
176 
177     /**
178      * Moves data from the end of the buffer to the beginning, discarding
179      * old data and making space for new input.
180      */
moveWindow()181     private void moveWindow() {
182         // Align the move to a multiple of 16 bytes. LZMA2 needs this
183         // because it uses the lowest bits from readPos to get the
184         // alignment of the uncompressed data.
185         int moveOffset = (readPos + 1 - keepSizeBefore) & ~15;
186         int moveSize = writePos - moveOffset;
187         System.arraycopy(buf, moveOffset, buf, 0, moveSize);
188 
189         readPos -= moveOffset;
190         readLimit -= moveOffset;
191         writePos -= moveOffset;
192     }
193 
194     /**
195      * Copies new data into the LZEncoder's buffer.
196      */
fillWindow(byte[] in, int off, int len)197     public int fillWindow(byte[] in, int off, int len) {
198         assert !finishing;
199 
200         // Move the sliding window if needed.
201         if (readPos >= bufSize - keepSizeAfter)
202             moveWindow();
203 
204         // Try to fill the dictionary buffer. If it becomes full,
205         // some of the input bytes may be left unused.
206         if (len > bufSize - writePos)
207             len = bufSize - writePos;
208 
209         System.arraycopy(in, off, buf, writePos, len);
210         writePos += len;
211 
212         // Set the new readLimit but only if there's enough data to allow
213         // encoding of at least one more byte.
214         if (writePos >= keepSizeAfter)
215             readLimit = writePos - keepSizeAfter;
216 
217         processPendingBytes();
218 
219         // Tell the caller how much input we actually copied into
220         // the dictionary.
221         return len;
222     }
223 
224     /**
225      * Process pending bytes remaining from preset dictionary initialization
226      * or encoder flush operation.
227      */
processPendingBytes()228     private void processPendingBytes() {
229         // After flushing or setting a preset dictionary there will be
230         // pending data that hasn't been ran through the match finder yet.
231         // Run it through the match finder now if there is enough new data
232         // available (readPos < readLimit) that the encoder may encode at
233         // least one more input byte. This way we don't waste any time
234         // looping in the match finder (and marking the same bytes as
235         // pending again) if the application provides very little new data
236         // per write call.
237         if (pendingSize > 0 && readPos < readLimit) {
238             readPos -= pendingSize;
239             int oldPendingSize = pendingSize;
240             pendingSize = 0;
241             skip(oldPendingSize);
242             assert pendingSize < oldPendingSize;
243         }
244     }
245 
246     /**
247      * Returns true if at least one byte has already been run through
248      * the match finder.
249      */
250     public boolean isStarted() {
251         return readPos != -1;
252     }
253 
254     /**
255      * Marks that all the input needs to be made available in
256      * the encoded output.
257      */
258     public void setFlushing() {
259         readLimit = writePos - 1;
260         processPendingBytes();
261     }
262 
263     /**
264      * Marks that there is no more input remaining. The read position
265      * can be advanced until the end of the data.
266      */
267     public void setFinishing() {
268         readLimit = writePos - 1;
269         finishing = true;
270         processPendingBytes();
271     }
272 
273     /**
274      * Tests if there is enough input available to let the caller encode
275      * at least one more byte.
276      */
277     public boolean hasEnoughData(int alreadyReadLen) {
278         return readPos - alreadyReadLen < readLimit;
279     }
280 
281     public void copyUncompressed(OutputStream out, int backward, int len)
282             throws IOException {
283         out.write(buf, readPos + 1 - backward, len);
284     }
285 
286     /**
287      * Get the number of bytes available, including the current byte.
288      * <p>
289      * Note that the result is undefined if <code>getMatches</code> or
290      * <code>skip</code> hasn't been called yet and no preset dictionary
291      * is being used.
292      */
293     public int getAvail() {
294         assert isStarted();
295         return writePos - readPos;
296     }
297 
298     /**
299      * Gets the lowest four bits of the absolute offset of the current byte.
300      * Bits other than the lowest four are undefined.
301      */
302     public int getPos() {
303         return readPos;
304     }
305 
306     /**
307      * Gets the byte from the given backward offset.
308      * <p>
309      * The current byte is at <code>0</code>, the previous byte
310      * at <code>1</code> etc. To get a byte at zero-based distance,
311      * use <code>getByte(dist + 1)<code>.
312      * <p>
313      * This function is equivalent to <code>getByte(0, backward)</code>.
314      */
315     public int getByte(int backward) {
316         return buf[readPos - backward] & 0xFF;
317     }
318 
319     /**
320      * Gets the byte from the given forward minus backward offset.
321      * The forward offset is added to the current position. This lets
322      * one read bytes ahead of the current byte.
323      */
324     public int getByte(int forward, int backward) {
325         return buf[readPos + forward - backward] & 0xFF;
326     }
327 
328     /**
329      * Get the length of a match at the given distance.
330      *
331      * @param       dist        zero-based distance of the match to test
332      * @param       lenLimit    don't test for a match longer than this
333      *
334      * @return      length of the match; it is in the range [0, lenLimit]
335      */
336     public int getMatchLen(int dist, int lenLimit) {
337         int backPos = readPos - dist - 1;
338         int len = 0;
339 
340         while (len < lenLimit && buf[readPos + len] == buf[backPos + len])
341             ++len;
342 
343         return len;
344     }
345 
346     /**
347      * Get the length of a match at the given distance and forward offset.
348      *
349      * @param       forward     forward offset
350      * @param       dist        zero-based distance of the match to test
351      * @param       lenLimit    don't test for a match longer than this
352      *
353      * @return      length of the match; it is in the range [0, lenLimit]
354      */
355     public int getMatchLen(int forward, int dist, int lenLimit) {
356         int curPos = readPos + forward;
357         int backPos = curPos - dist - 1;
358         int len = 0;
359 
360         while (len < lenLimit && buf[curPos + len] == buf[backPos + len])
361             ++len;
362 
363         return len;
364     }
365 
366     /**
367      * Verifies that the matches returned by the match finder are valid.
368      * This is meant to be used in an assert statement. This is totally
369      * useless for actual encoding since match finder's results should
370      * naturally always be valid if it isn't broken.
371      *
372      * @param       matches     return value from <code>getMatches</code>
373      *
374      * @return      true if matches are valid, false if match finder is broken
375      */
376     public boolean verifyMatches(Matches matches) {
377         int lenLimit = Math.min(getAvail(), matchLenMax);
378 
379         for (int i = 0; i < matches.count; ++i)
380             if (getMatchLen(matches.dist[i], lenLimit) != matches.len[i])
381                 return false;
382 
383         return true;
384     }
385 
386     /**
387      * Moves to the next byte, checks if there is enough input available,
388      * and returns the amount of input available.
389      *
390      * @param       requiredForFlushing
391      *                          minimum number of available bytes when
392      *                          flushing; encoding may be continued with
393      *                          new input after flushing
394      * @param       requiredForFinishing
395      *                          minimum number of available bytes when
396      *                          finishing; encoding must not be continued
397      *                          after finishing or the match finder state
398      *                          may be corrupt
399      *
400      * @return      the number of bytes available or zero if there
401      *              is not enough input available
402      */
403     int movePos(int requiredForFlushing, int requiredForFinishing) {
404         assert requiredForFlushing >= requiredForFinishing;
405 
406         ++readPos;
407         int avail = writePos - readPos;
408 
409         if (avail < requiredForFlushing) {
410             if (avail < requiredForFinishing || !finishing) {
411                 ++pendingSize;
412                 avail = 0;
413             }
414         }
415 
416         return avail;
417     }
418 
419     /**
420      * Runs match finder for the next byte and returns the matches found.
421      */
422     public abstract Matches getMatches();
423 
424     /**
425      * Skips the given number of bytes in the match finder.
426      */
427     public abstract void skip(int len);
428 }
429