1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * The inflate/deflate code lives here, so the various things that use it
6  * either live here or call these commands to pipe data through them.
7  *
8  * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9  * (gzip already replaces "uncompress".)
10  *
11  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12  * LSB 4.1 has gzip, gunzip, and zcat
13  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14 
15 // Accept many different kinds of command line argument.
16 // Leave Lrg at end so flag values line up.
17 
18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19 USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
20 USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
21 USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
22 
23 //zip unzip gzip gunzip zcat
24 
25 config COMPRESS
26   bool "compress"
27   default n
28   help
29     usage: compress [-zgLR19] [FILE]
30 
31     Compress or decompress file (or stdin) using "deflate" algorithm.
32 
33     -1	min compression
34     -9	max compression (default)
35     -g	gzip (default)
36     -L	zlib
37     -R	raw
38     -z	zip
39 
40 config GZIP
41   bool "gzip"
42   default y
43   depends on COMPRESS
44   help
45     usage: gzip [-19cfqStvzgLR] [FILE...]
46 
47     Compess (deflate) file(s). With no files, compress stdin to stdout.
48 
49     On successful decompression, compressed files are replaced with the
50     uncompressed version. The input file is removed and replaced with
51     a new file without the .gz extension (with same ownership/permissions).
52 
53     -1	Minimal compression (fastest)
54     -9	Max compression (default)
55     -c	cat to stdout (act as zcat)
56     -f	force (if output file exists, input is tty, unrecognized extension)
57     -q	quiet (no warnings)
58     -S	specify exension (default .*)
59     -t	test compressed file(s)
60     -v	verbose (like -l, but compress files)
61 
62     Compression type:
63     -g gzip (default)    -L zlib    -R raw    -z zip
64 
65 config GZIP_D
66   bool
67   default y
68   depends on GZIP && DECOMPRESS
69   help
70     usage: gzip [-d]
71 
72     -d	decompress (act as gunzip)
73 
74 config DECOMPRESS
75   bool "decompress"
76   default n
77   help
78     usage: compress [-zglrcd9] [FILE]
79 
80     Compress or decompress file (or stdin) using "deflate" algorithm.
81 
82     -c	compress with -g gzip (default)  -l zlib  -r raw  -z zip
83     -d	decompress (autodetects type)
84 
85 
86 config ZCAT
87   bool "zcat"
88   default y
89   depends on DECOMPRESS
90   help
91     usage: zcat [FILE...]
92 
93     Decompress deflated file(s) to stdout
94 
95 config GUNZIP
96   bool "gunzip"
97   default y
98   depends on DECOMPRESS
99   help
100     usage: gunzip [-cflqStv] [FILE...]
101 
102     Decompess (deflate) file(s). With no files, compress stdin to stdout.
103 
104     On successful decompression, compressed files are replaced with the
105     uncompressed version. The input file is removed and replaced with
106     a new file without the .gz extension (with same ownership/permissions).
107 
108     -c	cat to stdout (act as zcat)
109     -f	force (output file exists, input is tty, unrecognized extension)
110     -l	list compressed/uncompressed/ratio/name for each input file.
111     -q	quiet (no warnings)
112     -S	specify exension (default .*)
113     -t	test compressed file(s)
114     -v	verbose (like -l, but decompress files)
115 */
116 
117 #define FOR_compress
118 #include "toys.h"
119 
120 GLOBALS(
121   // Huffman codes: base offset and extra bits tables (length and distance)
122   char lenbits[29], distbits[30];
123   unsigned short lenbase[29], distbase[30];
124   void *fixdisthuff, *fixlithuff;
125 
126   // CRC
127   void (*crcfunc)(char *data, int len);
128   unsigned crc;
129 
130   // Compressed data buffer
131   char *data;
132   unsigned pos, len;
133   int infd, outfd;
134 
135   // Tables only used for deflation
136   unsigned short *hashhead, *hashchain;
137 )
138 
139 // little endian bit buffer
140 struct bitbuf {
141   int fd, bitpos, len, max;
142   char buf[];
143 };
144 
145 // malloc a struct bitbuf
bitbuf_init(int fd,int size)146 struct bitbuf *bitbuf_init(int fd, int size)
147 {
148   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
149 
150   bb->max = size;
151   bb->fd = fd;
152 
153   return bb;
154 }
155 
156 // Advance bitpos without the overhead of recording bits
bitbuf_skip(struct bitbuf * bb,int bits)157 void bitbuf_skip(struct bitbuf *bb, int bits)
158 {
159   int pos = bb->bitpos + bits, len = bb->len << 3;
160 
161   while (pos >= len) {
162     pos -= len;
163     len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
164     if (bb->len < 1) perror_exit("inflate EOF");
165   }
166   bb->bitpos = pos;
167 }
168 
169 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)170 static inline int bitbuf_bit(struct bitbuf *bb)
171 {
172   int bufpos = bb->bitpos>>3;
173 
174   if (bufpos == bb->len) {
175     bitbuf_skip(bb, 0);
176     bufpos = 0;
177   }
178 
179   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
180 }
181 
182 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)183 unsigned bitbuf_get(struct bitbuf *bb, int bits)
184 {
185   int result = 0, offset = 0;
186 
187   while (bits) {
188     int click = bb->bitpos >> 3, blow, blen;
189 
190     // Load more data if buffer empty
191     if (click == bb->len) bitbuf_skip(bb, click = 0);
192 
193     // grab bits from next byte
194     blow = bb->bitpos & 7;
195     blen = 8-blow;
196     if (blen > bits) blen = bits;
197     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
198     offset += blen;
199     bits -= blen;
200     bb->bitpos += blen;
201   }
202 
203   return result;
204 }
205 
bitbuf_flush(struct bitbuf * bb)206 void bitbuf_flush(struct bitbuf *bb)
207 {
208   if (!bb->bitpos) return;
209 
210   xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
211   memset(bb->buf, 0, bb->max);
212   bb->bitpos = 0;
213 }
214 
bitbuf_put(struct bitbuf * bb,int data,int len)215 void bitbuf_put(struct bitbuf *bb, int data, int len)
216 {
217   while (len) {
218     int click = bb->bitpos >> 3, blow, blen;
219 
220     // Flush buffer if necessary
221     if (click == bb->max) {
222       bitbuf_flush(bb);
223       click = 0;
224     }
225     blow = bb->bitpos & 7;
226     blen = 8-blow;
227     if (blen > len) blen = len;
228     bb->buf[click] |= data << blow;
229     bb->bitpos += blen;
230     data >>= blen;
231     len -= blen;
232   }
233 }
234 
output_byte(char sym)235 static void output_byte(char sym)
236 {
237   int pos = TT.pos++ & 32767;
238 
239   TT.data[pos] = sym;
240 
241   if (!pos) {
242     xwrite(TT.outfd, TT.data, 32768);
243     if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
244   }
245 }
246 
247 // Huffman coding uses bits to traverse a binary tree to a leaf node,
248 // By placing frequently occurring symbols at shorter paths, frequently
249 // used symbols may be represented in fewer bits than uncommon symbols.
250 
251 struct huff {
252   unsigned short length[16];
253   unsigned short symbol[288];
254 };
255 
256 // Create simple huffman tree from array of bit lengths.
257 
258 // The symbols in the huffman trees are sorted (first by bit length
259 // of the code to reach them, then by symbol number). This means that given
260 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)261 static void len2huff(struct huff *huff, char bitlen[], int len)
262 {
263   int offset[16];
264   int i;
265 
266   // Count number of codes at each bit length
267   memset(huff, 0, sizeof(struct huff));
268   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
269 
270   // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
271   *huff->length = *offset = 0;
272   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
273 
274   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
275 }
276 
277 // Fetch and decode next huffman coded symbol from bitbuf.
278 // This takes advantage of the sorting to navigate the tree as an array:
279 // each time we fetch a bit we have all the codes at that bit level in
280 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)281 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
282 {
283   unsigned short *length = huff->length;
284   int start = 0, offset = 0;
285 
286   // Traverse through the bit lengths until our code is in this range
287   for (;;) {
288     offset = (offset << 1) | bitbuf_bit(bb);
289     start += *++length;
290     if ((offset -= *length) < 0) break;
291     if ((length - huff->length) & 16) error_exit("bad symbol");
292   }
293 
294   return huff->symbol[start + offset];
295 }
296 
297 // Decompress deflated data from bitbuf to TT.outfd.
inflate(struct bitbuf * bb)298 static void inflate(struct bitbuf *bb)
299 {
300   TT.crc = ~0;
301   // repeat until spanked
302   for (;;) {
303     int final, type;
304 
305     final = bitbuf_get(bb, 1);
306     type = bitbuf_get(bb, 2);
307 
308     if (type == 3) error_exit("bad type");
309 
310     // Uncompressed block?
311     if (!type) {
312       int len, nlen;
313 
314       // Align to byte, read length
315       bitbuf_skip(bb, (8-bb->bitpos)&7);
316       len = bitbuf_get(bb, 16);
317       nlen = bitbuf_get(bb, 16);
318       if (len != (0xffff & ~nlen)) error_exit("bad len");
319 
320       // Dump literal output data
321       while (len) {
322         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
323         char *p = bb->buf+pos;
324 
325         // dump bytes until done or end of current bitbuf contents
326         if (bblen > len) bblen = len;
327         pos = bblen;
328         while (pos--) output_byte(*(p++));
329         bitbuf_skip(bb, bblen << 3);
330         len -= bblen;
331       }
332 
333     // Compressed block
334     } else {
335       struct huff *disthuff, *lithuff;
336 
337       // Dynamic huffman codes?
338       if (type == 2) {
339         struct huff *h2 = ((struct huff *)toybuf)+1;
340         int i, litlen, distlen, hufflen;
341         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
342                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
343 
344         // The huffman trees are stored as a series of bit lengths
345         litlen = bitbuf_get(bb, 5)+257;  // max 288
346         distlen = bitbuf_get(bb, 5)+1;   // max 32
347         hufflen = bitbuf_get(bb, 4)+4;   // max 19
348 
349         // The literal and distance codes are themselves compressed, in
350         // a complicated way: an array of bit lengths (hufflen many
351         // entries, each 3 bits) is used to fill out an array of 19 entries
352         // in a magic order, leaving the rest 0. Then make a tree out of it:
353         memset(bits = toybuf+1, 0, 19);
354         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
355         len2huff(h2, bits, 19);
356 
357         // Use that tree to read in the literal and distance bit lengths
358         for (i = 0; i < litlen + distlen;) {
359           int sym = huff_and_puff(bb, h2);
360 
361           // 0-15 are literals, 16 = repeat previous code 3-6 times,
362           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
363           if (sym < 16) bits[i++] = sym;
364           else {
365             int len = sym & 2;
366 
367             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
368             memset(bits+i, bits[i-1] * !(sym&3), len);
369             i += len;
370           }
371         }
372         if (i > litlen+distlen) error_exit("bad tree");
373 
374         len2huff(lithuff = h2, bits, litlen);
375         len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
376 
377       // Static huffman codes
378       } else {
379         lithuff = TT.fixlithuff;
380         disthuff = TT.fixdisthuff;
381       }
382 
383       // Use huffman tables to decode block of compressed symbols
384       for (;;) {
385         int sym = huff_and_puff(bb, lithuff);
386 
387         // Literal?
388         if (sym < 256) output_byte(sym);
389 
390         // Copy range?
391         else if (sym > 256) {
392           int len, dist;
393 
394           sym -= 257;
395           len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
396           sym = huff_and_puff(bb, disthuff);
397           dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
398           sym = TT.pos & 32767;
399 
400           while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
401 
402         // End of block
403         } else break;
404       }
405     }
406 
407     // Was that the last block?
408     if (final) break;
409   }
410 
411   if (TT.pos & 32767) {
412     xwrite(TT.outfd, TT.data, TT.pos & 32767);
413     if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
414   }
415 }
416 
417 // Deflate from TT.infd to bitbuf
418 // For deflate, TT.len = input read, TT.pos = input consumed
deflate(struct bitbuf * bb)419 static void deflate(struct bitbuf *bb)
420 {
421   char *data = TT.data;
422   int len, final = 0;
423 
424   TT.crc = ~0;
425 
426   while (!final) {
427     // Read next half-window of data if we haven't hit EOF yet.
428     len = readall(TT.infd, data+(TT.len&32768), 32768);
429     if (len < 0) perror_exit("read"); // todo: add filename
430     if (len != 32768) final++;
431     if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
432     // TT.len += len;  crcfunc advances len
433 
434     // store block as literal
435     bitbuf_put(bb, final, 1);
436     bitbuf_put(bb, 0, 1);
437 
438     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
439     bitbuf_put(bb, len, 16);
440     bitbuf_put(bb, 0xffff & ~len, 16);
441 
442     // repeat until spanked
443     while (TT.pos != TT.len) {
444       unsigned pos = TT.pos & 65535;
445 
446       bitbuf_put(bb, data[pos], 8);
447 
448       // need to refill buffer?
449       if (!(32767 & ++TT.pos) && !final) break;
450     }
451   }
452   bitbuf_flush(bb);
453 }
454 
455 // Allocate memory for deflate/inflate.
init_deflate(int compress)456 static void init_deflate(int compress)
457 {
458   int i, n = 1;
459 
460   // compress needs 64k data and 32k each for hashhead and hashchain.
461   // decompress just needs 32k data.
462   TT.data = xmalloc(32768*(compress ? 4 : 1));
463   if (compress) {
464     TT.hashhead = (unsigned short *)(TT.data + 65536);
465     TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
466   }
467 
468   // Calculate lenbits, lenbase, distbits, distbase
469   *TT.lenbase = 3;
470   for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
471     if (i>4) {
472       if (!(i&3)) {
473         TT.lenbits[i]++;
474         n <<= 1;
475       }
476       if (i == 27) n--;
477       else TT.lenbits[i+1] = TT.lenbits[i];
478     }
479     TT.lenbase[i+1] = n + TT.lenbase[i];
480   }
481   n = 0;
482   for (i = 0; i<sizeof(TT.distbits); i++) {
483     TT.distbase[i] = 1<<n;
484     if (i) TT.distbase[i] += TT.distbase[i-1];
485     if (i>3 && !(i&1)) n++;
486     TT.distbits[i] = n;
487   }
488 
489   // Init fixed huffman tables
490   for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
491   len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
492   memset(toybuf, 5, 30);
493   len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
494 }
495 
496 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)497 static int is_gzip(struct bitbuf *bb)
498 {
499   int flags;
500 
501   // Confirm signature
502   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
503     return 0;
504   bitbuf_skip(bb, 6*8);
505 
506   // Skip extra, name, comment, header CRC fields
507   if (flags & 4) bitbuf_skip(bb, 16);
508   if (flags & 8) while (bitbuf_get(bb, 8));
509   if (flags & 16) while (bitbuf_get(bb, 8));
510   if (flags & 2) bitbuf_skip(bb, 16);
511 
512   return 1;
513 }
514 
gzip_crc(char * data,int len)515 void gzip_crc(char *data, int len)
516 {
517   int i;
518   unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
519 
520   crc = TT.crc;
521   for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
522   TT.crc = crc;
523   TT.len += len;
524 }
525 
do_gzip(int fd,char * name)526 static void do_gzip(int fd, char *name)
527 {
528   struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
529 
530   // Header from RFC 1952 section 2.2:
531   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
532   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
533   // Operating System (FF=unknown)
534 
535   TT.infd = fd;
536   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
537 
538   // Use last 1k of toybuf for little endian crc table
539   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
540   TT.crcfunc = gzip_crc;
541 
542   deflate(bb);
543 
544   // tail: crc32, len32
545 
546   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
547   bitbuf_put(bb, ~TT.crc, 32);
548   bitbuf_put(bb, TT.len, 32);
549 
550   bitbuf_flush(bb);
551   free(bb);
552 }
553 
do_zcat(int fd,char * name)554 static void do_zcat(int fd, char *name)
555 {
556   struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
557 
558   if (!is_gzip(bb)) error_exit("not gzip");
559   TT.outfd = 1;
560 
561   // Use last 1k of toybuf for little endian crc table
562   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
563   TT.crcfunc = gzip_crc;
564 
565   inflate(bb);
566 
567   // tail: crc32, len32
568 
569   bitbuf_skip(bb, (8-bb->bitpos)&7);
570   if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
571     error_exit("bad crc");
572   free(bb);
573 }
574 
575 // Parse many different kinds of command line argument:
576 
compress_main(void)577 void compress_main(void)
578 {
579   // todo: this
580   printf("hello world");
581 }
582 
583 //#define CLEANUP_compress
584 //#define FOR_zcat
585 //#include "generated/flags.h"
586 
zcat_main(void)587 void zcat_main(void)
588 {
589   init_deflate(0);
590 
591   loopfiles(toys.optargs, do_zcat);
592 }
593 
gunzip_main(void)594 void gunzip_main(void)
595 {
596   init_deflate(0);
597 
598   loopfiles(toys.optargs, do_zcat);
599 }
600 
gzip_main(void)601 void gzip_main(void)
602 {
603   init_deflate(1);
604 
605   loopfiles(toys.optargs, do_gzip);
606 }
607