1 /*
2  * LZMA2Options
3  *
4  * Author: Lasse Collin <lasse.collin@tukaani.org>
5  *
6  * This file has been put into the public domain.
7  * You can do whatever you want with this file.
8  */
9 
10 package org.tukaani.xz;
11 
12 import java.io.InputStream;
13 import java.io.IOException;
14 import org.tukaani.xz.lz.LZEncoder;
15 import org.tukaani.xz.lzma.LZMAEncoder;
16 
17 /**
18  * LZMA2 compression options.
19  * <p>
20  * While this allows setting the LZMA2 compression options in detail,
21  * often you only need <code>LZMA2Options()</code> or
22  * <code>LZMA2Options(int)</code>.
23  */
24 public class LZMA2Options extends FilterOptions {
25     /**
26      * Minimum valid compression preset level is 0.
27      */
28     public static final int PRESET_MIN = 0;
29 
30     /**
31      * Maximum valid compression preset level is 9.
32      */
33     public static final int PRESET_MAX = 9;
34 
35     /**
36      * Default compression preset level is 6.
37      */
38     public static final int PRESET_DEFAULT = 6;
39 
40     /**
41      * Minimum dictionary size is 4 KiB.
42      */
43     public static final int DICT_SIZE_MIN = 4096;
44 
45     /**
46      * Maximum dictionary size for compression is 768 MiB.
47      * <p>
48      * The decompressor supports bigger dictionaries, up to almost 2 GiB.
49      * With HC4 the encoder would support dictionaries bigger than 768 MiB.
50      * The 768 MiB limit comes from the current implementation of BT4 where
51      * we would otherwise hit the limits of signed ints in array indexing.
52      * <p>
53      * If you really need bigger dictionary for decompression,
54      * use {@link LZMA2InputStream} directly.
55      */
56     public static final int DICT_SIZE_MAX = 768 << 20;
57 
58     /**
59      * The default dictionary size is 8 MiB.
60      */
61     public static final int DICT_SIZE_DEFAULT = 8 << 20;
62 
63     /**
64      * Maximum value for lc + lp is 4.
65      */
66     public static final int LC_LP_MAX = 4;
67 
68     /**
69      * The default number of literal context bits is 3.
70      */
71     public static final int LC_DEFAULT = 3;
72 
73     /**
74      * The default number of literal position bits is 0.
75      */
76     public static final int LP_DEFAULT = 0;
77 
78     /**
79      * Maximum value for pb is 4.
80      */
81     public static final int PB_MAX = 4;
82 
83     /**
84      * The default number of position bits is 2.
85      */
86     public static final int PB_DEFAULT = 2;
87 
88     /**
89      * Compression mode: uncompressed.
90      * The data is wrapped into a LZMA2 stream without compression.
91      */
92     public static final int MODE_UNCOMPRESSED = 0;
93 
94     /**
95      * Compression mode: fast.
96      * This is usually combined with a hash chain match finder.
97      */
98     public static final int MODE_FAST = LZMAEncoder.MODE_FAST;
99 
100     /**
101      * Compression mode: normal.
102      * This is usually combined with a binary tree match finder.
103      */
104     public static final int MODE_NORMAL = LZMAEncoder.MODE_NORMAL;
105 
106     /**
107      * Minimum value for <code>niceLen</code> is 8.
108      */
109     public static final int NICE_LEN_MIN = 8;
110 
111     /**
112      * Maximum value for <code>niceLen</code> is 273.
113      */
114     public static final int NICE_LEN_MAX = 273;
115 
116     /**
117      * Match finder: Hash Chain 2-3-4
118      */
119     public static final int MF_HC4 = LZEncoder.MF_HC4;
120 
121     /**
122      * Match finder: Binary tree 2-3-4
123      */
124     public static final int MF_BT4 = LZEncoder.MF_BT4;
125 
126     private static final int[] presetToDictSize = {
127             1 << 18, 1 << 20, 1 << 21, 1 << 22, 1 << 22,
128             1 << 23, 1 << 23, 1 << 24, 1 << 25, 1 << 26 };
129 
130     private static final int[] presetToDepthLimit = { 4, 8, 24, 48 };
131 
132     private int dictSize;
133     private byte[] presetDict = null;
134     private int lc;
135     private int lp;
136     private int pb;
137     private int mode;
138     private int niceLen;
139     private int mf;
140     private int depthLimit;
141 
142     /**
143      * Creates new LZMA2 options and sets them to the default values.
144      * This is equivalent to <code>LZMA2Options(PRESET_DEFAULT)</code>.
145      */
LZMA2Options()146     public LZMA2Options() {
147         try {
148             setPreset(PRESET_DEFAULT);
149         } catch (UnsupportedOptionsException e) {
150             assert false;
151             throw new RuntimeException();
152         }
153     }
154 
155     /**
156      * Creates new LZMA2 options and sets them to the given preset.
157      *
158      * @throws      UnsupportedOptionsException
159      *                          <code>preset</code> is not supported
160      */
LZMA2Options(int preset)161     public LZMA2Options(int preset) throws UnsupportedOptionsException {
162         setPreset(preset);
163     }
164 
165     /**
166      * Creates new LZMA2 options and sets them to the given custom values.
167      *
168      * @throws      UnsupportedOptionsException
169      *                          unsupported options were specified
170      */
LZMA2Options(int dictSize, int lc, int lp, int pb, int mode, int niceLen, int mf, int depthLimit)171     public LZMA2Options(int dictSize, int lc, int lp, int pb, int mode,
172                         int niceLen, int mf, int depthLimit)
173             throws UnsupportedOptionsException {
174         setDictSize(dictSize);
175         setLcLp(lc, lp);
176         setPb(pb);
177         setMode(mode);
178         setNiceLen(niceLen);
179         setMatchFinder(mf);
180         setDepthLimit(depthLimit);
181     }
182 
183     /**
184      * Sets the compression options to the given preset.
185      * <p>
186      * The presets 0-3 are fast presets with medium compression.
187      * The presets 4-6 are fairly slow presets with high compression.
188      * The default preset (<code>PRESET_DEFAULT</code>) is 6.
189      * <p>
190      * The presets 7-9 are like the preset 6 but use bigger dictionaries
191      * and have higher compressor and decompressor memory requirements.
192      * Unless the uncompressed size of the file exceeds 8&nbsp;MiB,
193      * 16&nbsp;MiB, or 32&nbsp;MiB, it is waste of memory to use the
194      * presets 7, 8, or 9, respectively.
195      *
196      * @throws      UnsupportedOptionsException
197      *                          <code>preset</code> is not supported
198      */
setPreset(int preset)199     public void setPreset(int preset) throws UnsupportedOptionsException {
200         if (preset < 0 || preset > 9)
201             throw new UnsupportedOptionsException(
202                     "Unsupported preset: " + preset);
203 
204         lc = LC_DEFAULT;
205         lp = LP_DEFAULT;
206         pb = PB_DEFAULT;
207         dictSize = presetToDictSize[preset];
208 
209         if (preset <= 3) {
210             mode = MODE_FAST;
211             mf = MF_HC4;
212             niceLen = preset <= 1 ? 128 : NICE_LEN_MAX;
213             depthLimit = presetToDepthLimit[preset];
214         } else {
215             mode = MODE_NORMAL;
216             mf = MF_BT4;
217             niceLen = (preset == 4) ? 16 : (preset == 5) ? 32 : 64;
218             depthLimit = 0;
219         }
220     }
221 
222     /**
223      * Sets the dictionary size in bytes.
224      * <p>
225      * The dictionary (or history buffer) holds the most recently seen
226      * uncompressed data. Bigger dictionary usually means better compression.
227      * However, using a dictioanary bigger than the size of the uncompressed
228      * data is waste of memory.
229      * <p>
230      * Any value in the range [DICT_SIZE_MIN, DICT_SIZE_MAX] is valid,
231      * but sizes of 2^n and 2^n&nbsp;+&nbsp;2^(n-1) bytes are somewhat
232      * recommended.
233      *
234      * @throws      UnsupportedOptionsException
235      *                          <code>dictSize</code> is not supported
236      */
setDictSize(int dictSize)237     public void setDictSize(int dictSize) throws UnsupportedOptionsException {
238         if (dictSize < DICT_SIZE_MIN)
239             throw new UnsupportedOptionsException(
240                     "LZMA2 dictionary size must be at least 4 KiB: "
241                     + dictSize + " B");
242 
243         if (dictSize > DICT_SIZE_MAX)
244             throw new UnsupportedOptionsException(
245                     "LZMA2 dictionary size must not exceed "
246                     + (DICT_SIZE_MAX >> 20) + " MiB: " + dictSize + " B");
247 
248         this.dictSize = dictSize;
249     }
250 
251     /**
252      * Gets the dictionary size in bytes.
253      */
getDictSize()254     public int getDictSize() {
255         return dictSize;
256     }
257 
258     /**
259      * Sets a preset dictionary. Use null to disable the use of
260      * a preset dictionary. By default there is no preset dictionary.
261      * <p>
262      * <b>The .xz format doesn't support a preset dictionary for now.
263      * Do not set a preset dictionary unless you use raw LZMA2.</b>
264      * <p>
265      * Preset dictionary can be useful when compressing many similar,
266      * relatively small chunks of data independently from each other.
267      * A preset dictionary should contain typical strings that occur in
268      * the files being compressed. The most probable strings should be
269      * near the end of the preset dictionary. The preset dictionary used
270      * for compression is also needed for decompression.
271      */
setPresetDict(byte[] presetDict)272     public void setPresetDict(byte[] presetDict) {
273         this.presetDict = presetDict;
274     }
275 
276     /**
277      * Gets the preset dictionary.
278      */
getPresetDict()279     public byte[] getPresetDict() {
280         return presetDict;
281     }
282 
283     /**
284      * Sets the number of literal context bits and literal position bits.
285      * <p>
286      * The sum of <code>lc</code> and <code>lp</code> is limited to 4.
287      * Trying to exceed it will throw an exception. This function lets
288      * you change both at the same time.
289      *
290      * @throws      UnsupportedOptionsException
291      *                          <code>lc</code> and <code>lp</code>
292      *                          are invalid
293      */
setLcLp(int lc, int lp)294     public void setLcLp(int lc, int lp) throws UnsupportedOptionsException {
295         if (lc < 0 || lp < 0 || lc > LC_LP_MAX || lp > LC_LP_MAX
296                 || lc + lp > LC_LP_MAX)
297             throw new UnsupportedOptionsException(
298                     "lc + lp must not exceed " + LC_LP_MAX + ": "
299                     + lc + " + " + lp);
300 
301         this.lc = lc;
302         this.lp = lp;
303     }
304 
305     /**
306      * Sets the number of literal context bits.
307      * <p>
308      * All bytes that cannot be encoded as matches are encoded as literals.
309      * That is, literals are simply 8-bit bytes that are encoded one at
310      * a time.
311      * <p>
312      * The literal coding makes an assumption that the highest <code>lc</code>
313      * bits of the previous uncompressed byte correlate with the next byte.
314      * For example, in typical English text, an upper-case letter is often
315      * followed by a lower-case letter, and a lower-case letter is usually
316      * followed by another lower-case letter. In the US-ASCII character set,
317      * the highest three bits are 010 for upper-case letters and 011 for
318      * lower-case letters. When <code>lc</code> is at least 3, the literal
319      * coding can take advantage of this property in the  uncompressed data.
320      * <p>
321      * The default value (3) is usually good. If you want maximum compression,
322      * try <code>setLc(4)</code>. Sometimes it helps a little, and sometimes it
323      * makes compression worse. If it makes it worse, test for example
324      * <code>setLc(2)</code> too.
325      *
326      * @throws      UnsupportedOptionsException
327      *                          <code>lc</code> is invalid, or the sum
328      *                          of <code>lc</code> and <code>lp</code>
329      *                          exceed LC_LP_MAX
330      */
setLc(int lc)331     public void setLc(int lc) throws UnsupportedOptionsException {
332         setLcLp(lc, lp);
333     }
334 
335     /**
336      * Sets the number of literal position bits.
337      * <p>
338      * This affets what kind of alignment in the uncompressed data is
339      * assumed when encoding literals. See {@link #setPb(int) setPb} for
340      * more information about alignment.
341      *
342      * @throws      UnsupportedOptionsException
343      *                          <code>lp</code> is invalid, or the sum
344      *                          of <code>lc</code> and <code>lp</code>
345      *                          exceed LC_LP_MAX
346      */
setLp(int lp)347     public void setLp(int lp) throws UnsupportedOptionsException {
348         setLcLp(lc, lp);
349     }
350 
351     /**
352      * Gets the number of literal context bits.
353      */
getLc()354     public int getLc() {
355         return lc;
356     }
357 
358     /**
359      * Gets the number of literal position bits.
360      */
getLp()361     public int getLp() {
362         return lp;
363     }
364 
365     /**
366      * Sets the number of position bits.
367      * <p>
368      * This affects what kind of alignment in the uncompressed data is
369      * assumed in general. The default (2) means four-byte alignment
370      * (2^<code>pb</code> = 2^2 = 4), which is often a good choice when
371      * there's no better guess.
372      * <p>
373      * When the alignment is known, setting the number of position bits
374      * accordingly may reduce the file size a little. For example with text
375      * files having one-byte alignment (US-ASCII, ISO-8859-*, UTF-8), using
376      * <code>setPb(0)</code> can improve compression slightly. For UTF-16
377      * text, <code>setPb(1)</code> is a good choice. If the alignment is
378      * an odd number like 3 bytes, <code>setPb(0)</code> might be the best
379      * choice.
380      * <p>
381      * Even though the assumed alignment can be adjusted with
382      * <code>setPb</code> and <code>setLp</code>, LZMA2 still slightly favors
383      * 16-byte alignment. It might be worth taking into account when designing
384      * file formats that are likely to be often compressed with LZMA2.
385      *
386      * @throws      UnsupportedOptionsException
387      *                          <code>pb</code> is invalid
388      */
setPb(int pb)389     public void setPb(int pb) throws UnsupportedOptionsException {
390         if (pb < 0 || pb > PB_MAX)
391             throw new UnsupportedOptionsException(
392                     "pb must not exceed " + PB_MAX + ": " + pb);
393 
394         this.pb = pb;
395     }
396 
397     /**
398      * Gets the number of position bits.
399      */
getPb()400     public int getPb() {
401         return pb;
402     }
403 
404     /**
405      * Sets the compression mode.
406      * <p>
407      * This specifies the method to analyze the data produced by
408      * a match finder. The default is <code>MODE_FAST</code> for presets
409      * 0-3 and <code>MODE_NORMAL</code> for presets 4-9.
410      * <p>
411      * Usually <code>MODE_FAST</code> is used with Hash Chain match finders
412      * and <code>MODE_NORMAL</code> with Binary Tree match finders. This is
413      * also what the presets do.
414      * <p>
415      * The special mode <code>MODE_UNCOMPRESSED</code> doesn't try to
416      * compress the data at all (and doesn't use a match finder) and will
417      * simply wrap it in uncompressed LZMA2 chunks.
418      *
419      * @throws      UnsupportedOptionsException
420      *                          <code>mode</code> is not supported
421      */
setMode(int mode)422     public void setMode(int mode) throws UnsupportedOptionsException {
423         if (mode < MODE_UNCOMPRESSED || mode > MODE_NORMAL)
424             throw new UnsupportedOptionsException(
425                     "Unsupported compression mode: " + mode);
426 
427         this.mode = mode;
428     }
429 
430     /**
431      * Gets the compression mode.
432      */
getMode()433     public int getMode() {
434         return mode;
435     }
436 
437     /**
438      * Sets the nice length of matches.
439      * Once a match of at least <code>niceLen</code> bytes is found,
440      * the algorithm stops looking for better matches. Higher values tend
441      * to give better compression at the expense of speed. The default
442      * depends on the preset.
443      *
444      * @throws      UnsupportedOptionsException
445      *                          <code>niceLen</code> is invalid
446      */
setNiceLen(int niceLen)447     public void setNiceLen(int niceLen) throws UnsupportedOptionsException {
448         if (niceLen < NICE_LEN_MIN)
449             throw new UnsupportedOptionsException(
450                     "Minimum nice length of matches is "
451                     + NICE_LEN_MIN + " bytes: " + niceLen);
452 
453         if (niceLen > NICE_LEN_MAX)
454             throw new UnsupportedOptionsException(
455                     "Maximum nice length of matches is " + NICE_LEN_MAX
456                     + ": " + niceLen);
457 
458         this.niceLen = niceLen;
459     }
460 
461     /**
462      * Gets the nice length of matches.
463      */
getNiceLen()464     public int getNiceLen() {
465         return niceLen;
466     }
467 
468     /**
469      * Sets the match finder type.
470      * <p>
471      * Match finder has a major effect on compression speed, memory usage,
472      * and compression ratio. Usually Hash Chain match finders are faster
473      * than Binary Tree match finders. The default depends on the preset:
474      * 0-3 use <code>MF_HC4</code> and 4-9 use <code>MF_BT4</code>.
475      *
476      * @throws      UnsupportedOptionsException
477      *                          <code>mf</code> is not supported
478      */
setMatchFinder(int mf)479     public void setMatchFinder(int mf) throws UnsupportedOptionsException {
480         if (mf != MF_HC4 && mf != MF_BT4)
481             throw new UnsupportedOptionsException(
482                     "Unsupported match finder: " + mf);
483 
484         this.mf = mf;
485     }
486 
487     /**
488      * Gets the match finder type.
489      */
getMatchFinder()490     public int getMatchFinder() {
491         return mf;
492     }
493 
494     /**
495      * Sets the match finder search depth limit.
496      * <p>
497      * The default is a special value of <code>0</code> which indicates that
498      * the depth limit should be automatically calculated by the selected
499      * match finder from the nice length of matches.
500      * <p>
501      * Reasonable depth limit for Hash Chain match finders is 4-100 and
502      * 16-1000 for Binary Tree match finders. Using very high values can
503      * make the compressor extremely slow with some files. Avoid settings
504      * higher than 1000 unless you are prepared to interrupt the compression
505      * in case it is taking far too long.
506      *
507      * @throws      UnsupportedOptionsException
508      *                          <code>depthLimit</code> is invalid
509      */
setDepthLimit(int depthLimit)510     public void setDepthLimit(int depthLimit)
511             throws UnsupportedOptionsException {
512         if (depthLimit < 0)
513             throw new UnsupportedOptionsException(
514                     "Depth limit cannot be negative: " + depthLimit);
515 
516         this.depthLimit = depthLimit;
517     }
518 
519     /**
520      * Gets the match finder search depth limit.
521      */
getDepthLimit()522     public int getDepthLimit() {
523         return depthLimit;
524     }
525 
getEncoderMemoryUsage()526     public int getEncoderMemoryUsage() {
527         return (mode == MODE_UNCOMPRESSED)
528                ? UncompressedLZMA2OutputStream.getMemoryUsage()
529                : LZMA2OutputStream.getMemoryUsage(this);
530     }
531 
getOutputStream(FinishableOutputStream out)532     public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
533         if (mode == MODE_UNCOMPRESSED)
534             return new UncompressedLZMA2OutputStream(out);
535 
536         return new LZMA2OutputStream(out, this);
537     }
538 
539     /**
540      * Gets how much memory the LZMA2 decoder will need to decompress the data
541      * that was encoded with these options and stored in a .xz file.
542      * <p>
543      * The returned value may bigger than the value returned by a direct call
544      * to {@link LZMA2InputStream#getMemoryUsage(int)} if the dictionary size
545      * is not 2^n or 2^n&nbsp;+&nbsp;2^(n-1) bytes. This is because the .xz
546      * headers store the dictionary size in such a format and other values
547      * are rounded up to the next such value. Such rounding is harmess except
548      * it might waste some memory if an unsual dictionary size is used.
549      * <p>
550      * If you use raw LZMA2 streams and unusual dictioanary size, call
551      * {@link LZMA2InputStream#getMemoryUsage} directly to get raw decoder
552      * memory requirements.
553      */
getDecoderMemoryUsage()554     public int getDecoderMemoryUsage() {
555         // Round the dictionary size up to the next 2^n or 2^n + 2^(n-1).
556         int d = dictSize - 1;
557         d |= d >>> 2;
558         d |= d >>> 3;
559         d |= d >>> 4;
560         d |= d >>> 8;
561         d |= d >>> 16;
562         return LZMA2InputStream.getMemoryUsage(d + 1);
563     }
564 
getInputStream(InputStream in)565     public InputStream getInputStream(InputStream in) throws IOException {
566         return new LZMA2InputStream(in, dictSize);
567     }
568 
getFilterEncoder()569     FilterEncoder getFilterEncoder() {
570         return new LZMA2Encoder(this);
571     }
572 
clone()573     public Object clone() {
574         try {
575             return super.clone();
576         } catch (CloneNotSupportedException e) {
577             assert false;
578             throw new RuntimeException();
579         }
580     }
581 }
582