1 /*
2  * LZEncoder
3  *
4  * Authors: Lasse Collin <lasse.collin@tukaani.org>
5  *          Igor Pavlov <http://7-zip.org/>
6  *
7  * This file has been put into the public domain.
8  * You can do whatever you want with this file.
9  */
10 
11 package org.tukaani.xz.lz;
12 
13 import java.io.OutputStream;
14 import java.io.IOException;
15 
16 public abstract class LZEncoder {
17     public static final int MF_HC4 = 0x04;
18     public static final int MF_BT4 = 0x14;
19 
20     /**
21      * Number of bytes to keep available before the current byte
22      * when moving the LZ window.
23      */
24     private final int keepSizeBefore;
25 
26     /**
27      * Number of bytes that must be available, the current byte included,
28      * to make hasEnoughData return true. Flushing and finishing are
29      * naturally exceptions to this since there cannot be any data after
30      * the end of the uncompressed input.
31      */
32     private final int keepSizeAfter;
33 
34     final int matchLenMax;
35     final int niceLen;
36 
37     final byte[] buf;
38 
39     int readPos = -1;
40     private int readLimit = -1;
41     private boolean finishing = false;
42     private int writePos = 0;
43     private int pendingSize = 0;
44 
normalize(int[] positions, int normalizationOffset)45     static void normalize(int[] positions, int normalizationOffset) {
46         for (int i = 0; i < positions.length; ++i) {
47             if (positions[i] <= normalizationOffset)
48                 positions[i] = 0;
49             else
50                 positions[i] -= normalizationOffset;
51         }
52     }
53 
54     /**
55      * Gets the size of the LZ window buffer that needs to be allocated.
56      */
getBufSize( int dictSize, int extraSizeBefore, int extraSizeAfter, int matchLenMax)57     private static int getBufSize(
58             int dictSize, int extraSizeBefore, int extraSizeAfter,
59             int matchLenMax) {
60         int keepSizeBefore = extraSizeBefore + dictSize;
61         int keepSizeAfter = extraSizeAfter + matchLenMax;
62         int reserveSize = Math.min(dictSize / 2 + (256 << 10), 512 << 20);
63         return keepSizeBefore + keepSizeAfter + reserveSize;
64     }
65 
66     /**
67      * Gets approximate memory usage of the LZEncoder base structure and
68      * the match finder as kibibytes.
69      */
getMemoryUsage( int dictSize, int extraSizeBefore, int extraSizeAfter, int matchLenMax, int mf)70     public static int getMemoryUsage(
71             int dictSize, int extraSizeBefore, int extraSizeAfter,
72             int matchLenMax, int mf) {
73         // Buffer size + a little extra
74         int m = getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
75                            matchLenMax) / 1024 + 10;
76 
77         switch (mf) {
78             case MF_HC4:
79                 m += HC4.getMemoryUsage(dictSize);
80                 break;
81 
82             case MF_BT4:
83                 m += BT4.getMemoryUsage(dictSize);
84                 break;
85 
86             default:
87                 throw new IllegalArgumentException();
88         }
89 
90         return m;
91     }
92 
93     /**
94      * Creates a new LZEncoder.
95      * <p>
96      * @param       dictSize    dictionary size
97      *
98      * @param       extraSizeBefore
99      *                          number of bytes to keep available in the
100      *                          history in addition to dictSize
101      *
102      * @param       extraSizeAfter
103      *                          number of bytes that must be available
104      *                          after current position + matchLenMax
105      *
106      * @param       niceLen     if a match of at least <code>niceLen</code>
107      *                          bytes is found, be happy with it and don't
108      *                          stop looking for longer matches
109      *
110      * @param       matchLenMax don't test for matches longer than
111      *                          <code>matchLenMax</code> bytes
112      *
113      * @param       mf          match finder ID
114      *
115      * @param       depthLimit  match finder search depth limit
116      */
getInstance( int dictSize, int extraSizeBefore, int extraSizeAfter, int niceLen, int matchLenMax, int mf, int depthLimit)117     public static LZEncoder getInstance(
118             int dictSize, int extraSizeBefore, int extraSizeAfter,
119             int niceLen, int matchLenMax, int mf, int depthLimit) {
120         switch (mf) {
121             case MF_HC4:
122                 return new HC4(dictSize, extraSizeBefore, extraSizeAfter,
123                                niceLen, matchLenMax, depthLimit);
124 
125             case MF_BT4:
126                 return new BT4(dictSize, extraSizeBefore, extraSizeAfter,
127                                niceLen, matchLenMax, depthLimit);
128         }
129 
130         throw new IllegalArgumentException();
131     }
132 
133     /**
134      * Creates a new LZEncoder. See <code>getInstance</code>.
135      */
LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter, int niceLen, int matchLenMax)136     LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter,
137               int niceLen, int matchLenMax) {
138         buf = new byte[getBufSize(dictSize, extraSizeBefore, extraSizeAfter,
139                                   matchLenMax)];
140 
141         keepSizeBefore = extraSizeBefore + dictSize;
142         keepSizeAfter = extraSizeAfter + matchLenMax;
143 
144         this.matchLenMax = matchLenMax;
145         this.niceLen = niceLen;
146     }
147 
148     /**
149      * Sets a preset dictionary. If a preset dictionary is wanted, this
150      * function must be called immediately after creating the LZEncoder
151      * before any data has been encoded.
152      */
setPresetDict(int dictSize, byte[] presetDict)153     public void setPresetDict(int dictSize, byte[] presetDict) {
154         assert !isStarted();
155         assert writePos == 0;
156 
157         if (presetDict != null) {
158             // If the preset dictionary buffer is bigger than the dictionary
159             // size, copy only the tail of the preset dictionary.
160             int copySize = Math.min(presetDict.length, dictSize);
161             int offset = presetDict.length - copySize;
162             System.arraycopy(presetDict, offset, buf, 0, copySize);
163             writePos += copySize;
164             skip(copySize);
165         }
166     }
167 
168     /**
169      * Moves data from the end of the buffer to the beginning, discarding
170      * old data and making space for new input.
171      */
moveWindow()172     private void moveWindow() {
173         // Align the move to a multiple of 16 bytes. LZMA2 needs this
174         // because it uses the lowest bits from readPos to get the
175         // alignment of the uncompressed data.
176         int moveOffset = (readPos + 1 - keepSizeBefore) & ~15;
177         int moveSize = writePos - moveOffset;
178         System.arraycopy(buf, moveOffset, buf, 0, moveSize);
179 
180         readPos -= moveOffset;
181         readLimit -= moveOffset;
182         writePos -= moveOffset;
183     }
184 
185     /**
186      * Copies new data into the LZEncoder's buffer.
187      */
fillWindow(byte[] in, int off, int len)188     public int fillWindow(byte[] in, int off, int len) {
189         assert !finishing;
190 
191         // Move the sliding window if needed.
192         if (readPos >= buf.length - keepSizeAfter)
193             moveWindow();
194 
195         // Try to fill the dictionary buffer. If it becomes full,
196         // some of the input bytes may be left unused.
197         if (len > buf.length - writePos)
198             len = buf.length - writePos;
199 
200         System.arraycopy(in, off, buf, writePos, len);
201         writePos += len;
202 
203         // Set the new readLimit but only if there's enough data to allow
204         // encoding of at least one more byte.
205         if (writePos >= keepSizeAfter)
206             readLimit = writePos - keepSizeAfter;
207 
208         processPendingBytes();
209 
210         // Tell the caller how much input we actually copied into
211         // the dictionary.
212         return len;
213     }
214 
215     /**
216      * Process pending bytes remaining from preset dictionary initialization
217      * or encoder flush operation.
218      */
processPendingBytes()219     private void processPendingBytes() {
220         // After flushing or setting a preset dictionary there will be
221         // pending data that hasn't been ran through the match finder yet.
222         // Run it through the match finder now if there is enough new data
223         // available (readPos < readLimit) that the encoder may encode at
224         // least one more input byte. This way we don't waste any time
225         // looping in the match finder (and marking the same bytes as
226         // pending again) if the application provides very little new data
227         // per write call.
228         if (pendingSize > 0 && readPos < readLimit) {
229             readPos -= pendingSize;
230             int oldPendingSize = pendingSize;
231             pendingSize = 0;
232             skip(oldPendingSize);
233             assert pendingSize < oldPendingSize;
234         }
235     }
236 
237     /**
238      * Returns true if at least one byte has already been run through
239      * the match finder.
240      */
241     public boolean isStarted() {
242         return readPos != -1;
243     }
244 
245     /**
246      * Marks that all the input needs to be made available in
247      * the encoded output.
248      */
249     public void setFlushing() {
250         readLimit = writePos - 1;
251         processPendingBytes();
252     }
253 
254     /**
255      * Marks that there is no more input remaining. The read position
256      * can be advanced until the end of the data.
257      */
258     public void setFinishing() {
259         readLimit = writePos - 1;
260         finishing = true;
261         processPendingBytes();
262     }
263 
264     /**
265      * Tests if there is enough input available to let the caller encode
266      * at least one more byte.
267      */
268     public boolean hasEnoughData(int alreadyReadLen) {
269         return readPos - alreadyReadLen < readLimit;
270     }
271 
272     public void copyUncompressed(OutputStream out, int backward, int len)
273             throws IOException {
274         out.write(buf, readPos + 1 - backward, len);
275     }
276 
277     /**
278      * Get the number of bytes available, including the current byte.
279      * <p>
280      * Note that the result is undefined if <code>getMatches</code> or
281      * <code>skip</code> hasn't been called yet and no preset dictionary
282      * is being used.
283      */
284     public int getAvail() {
285         assert isStarted();
286         return writePos - readPos;
287     }
288 
289     /**
290      * Gets the lowest four bits of the absolute offset of the current byte.
291      * Bits other than the lowest four are undefined.
292      */
293     public int getPos() {
294         return readPos;
295     }
296 
297     /**
298      * Gets the byte from the given backward offset.
299      * <p>
300      * The current byte is at <code>0</code>, the previous byte
301      * at <code>1</code> etc. To get a byte at zero-based distance,
302      * use <code>getByte(dist + 1)<code>.
303      * <p>
304      * This function is equivalent to <code>getByte(0, backward)</code>.
305      */
306     public int getByte(int backward) {
307         return buf[readPos - backward] & 0xFF;
308     }
309 
310     /**
311      * Gets the byte from the given forward minus backward offset.
312      * The forward offset is added to the current position. This lets
313      * one read bytes ahead of the current byte.
314      */
315     public int getByte(int forward, int backward) {
316         return buf[readPos + forward - backward] & 0xFF;
317     }
318 
319     /**
320      * Get the length of a match at the given distance.
321      *
322      * @param       dist        zero-based distance of the match to test
323      * @param       lenLimit    don't test for a match longer than this
324      *
325      * @return      length of the match; it is in the range [0, lenLimit]
326      */
327     public int getMatchLen(int dist, int lenLimit) {
328         int backPos = readPos - dist - 1;
329         int len = 0;
330 
331         while (len < lenLimit && buf[readPos + len] == buf[backPos + len])
332             ++len;
333 
334         return len;
335     }
336 
337     /**
338      * Get the length of a match at the given distance and forward offset.
339      *
340      * @param       forward     forward offset
341      * @param       dist        zero-based distance of the match to test
342      * @param       lenLimit    don't test for a match longer than this
343      *
344      * @return      length of the match; it is in the range [0, lenLimit]
345      */
346     public int getMatchLen(int forward, int dist, int lenLimit) {
347         int curPos = readPos + forward;
348         int backPos = curPos - dist - 1;
349         int len = 0;
350 
351         while (len < lenLimit && buf[curPos + len] == buf[backPos + len])
352             ++len;
353 
354         return len;
355     }
356 
357     /**
358      * Verifies that the matches returned by the match finder are valid.
359      * This is meant to be used in an assert statement. This is totally
360      * useless for actual encoding since match finder's results should
361      * naturally always be valid if it isn't broken.
362      *
363      * @param       matches     return value from <code>getMatches</code>
364      *
365      * @return      true if matches are valid, false if match finder is broken
366      */
367     public boolean verifyMatches(Matches matches) {
368         int lenLimit = Math.min(getAvail(), matchLenMax);
369 
370         for (int i = 0; i < matches.count; ++i)
371             if (getMatchLen(matches.dist[i], lenLimit) != matches.len[i])
372                 return false;
373 
374         return true;
375     }
376 
377     /**
378      * Moves to the next byte, checks if there is enough input available,
379      * and returns the amount of input available.
380      *
381      * @param       requiredForFlushing
382      *                          minimum number of available bytes when
383      *                          flushing; encoding may be continued with
384      *                          new input after flushing
385      * @param       requiredForFinishing
386      *                          minimum number of available bytes when
387      *                          finishing; encoding must not be continued
388      *                          after finishing or the match finder state
389      *                          may be corrupt
390      *
391      * @return      the number of bytes available or zero if there
392      *              is not enough input available
393      */
394     int movePos(int requiredForFlushing, int requiredForFinishing) {
395         assert requiredForFlushing >= requiredForFinishing;
396 
397         ++readPos;
398         int avail = writePos - readPos;
399 
400         if (avail < requiredForFlushing) {
401             if (avail < requiredForFinishing || !finishing) {
402                 ++pendingSize;
403                 avail = 0;
404             }
405         }
406 
407         return avail;
408     }
409 
410     /**
411      * Runs match finder for the next byte and returns the matches found.
412      */
413     public abstract Matches getMatches();
414 
415     /**
416      * Skips the given number of bytes in the match finder.
417      */
418     public abstract void skip(int len);
419 }
420