1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * The inflate/deflate code lives here, so the various things that use it
6 * either live here or call these commands to pipe data through them.
7 *
8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9 * (gzip already replaces "uncompress".)
10 *
11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12 * LSB 4.1 has gzip, gunzip, and zcat
13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14
15 // Accept many different kinds of command line argument.
16 // Leave Lrg at end so flag values line up.
17
18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19 USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
20 USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
21 USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
22
23 //zip unzip gzip gunzip zcat
24
25 config COMPRESS
26 bool "compress"
27 default n
28 help
29 usage: compress [-zgLR19] [FILE]
30
31 Compress or decompress file (or stdin) using "deflate" algorithm.
32
33 -1 min compression
34 -9 max compression (default)
35 -g gzip (default)
36 -L zlib
37 -R raw
38 -z zip
39
40 config GZIP
41 bool "gzip"
42 default y
43 depends on COMPRESS
44 help
45 usage: gzip [-19cfqStvzgLR] [FILE...]
46
47 Compess (deflate) file(s). With no files, compress stdin to stdout.
48
49 On successful decompression, compressed files are replaced with the
50 uncompressed version. The input file is removed and replaced with
51 a new file without the .gz extension (with same ownership/permissions).
52
53 -1 Minimal compression (fastest)
54 -9 Max compression (default)
55 -c cat to stdout (act as zcat)
56 -f force (if output file exists, input is tty, unrecognized extension)
57 -q quiet (no warnings)
58 -S specify exension (default .*)
59 -t test compressed file(s)
60 -v verbose (like -l, but compress files)
61
62 Compression type:
63 -g gzip (default) -L zlib -R raw -z zip
64
65 config GZIP_D
66 bool
67 default y
68 depends on GZIP && DECOMPRESS
69 help
70 usage: gzip [-d]
71
72 -d decompress (act as gunzip)
73
74 config DECOMPRESS
75 bool "decompress"
76 default n
77 help
78 usage: compress [-zglrcd9] [FILE]
79
80 Compress or decompress file (or stdin) using "deflate" algorithm.
81
82 -c compress with -g gzip (default) -l zlib -r raw -z zip
83 -d decompress (autodetects type)
84
85
86 config ZCAT
87 bool "zcat"
88 default y
89 depends on DECOMPRESS
90 help
91 usage: zcat [FILE...]
92
93 Decompress deflated file(s) to stdout
94
95 config GUNZIP
96 bool "gunzip"
97 default y
98 depends on DECOMPRESS
99 help
100 usage: gunzip [-cflqStv] [FILE...]
101
102 Decompess (deflate) file(s). With no files, compress stdin to stdout.
103
104 On successful decompression, compressed files are replaced with the
105 uncompressed version. The input file is removed and replaced with
106 a new file without the .gz extension (with same ownership/permissions).
107
108 -c cat to stdout (act as zcat)
109 -f force (output file exists, input is tty, unrecognized extension)
110 -l list compressed/uncompressed/ratio/name for each input file.
111 -q quiet (no warnings)
112 -S specify exension (default .*)
113 -t test compressed file(s)
114 -v verbose (like -l, but decompress files)
115 */
116
117 #define FOR_compress
118 #include "toys.h"
119
120 GLOBALS(
121 // Huffman codes: base offset and extra bits tables (length and distance)
122 char lenbits[29], distbits[30];
123 unsigned short lenbase[29], distbase[30];
124 void *fixdisthuff, *fixlithuff;
125
126 // CRC
127 void (*crcfunc)(char *data, int len);
128 unsigned crc;
129
130 // Compressed data buffer
131 char *data;
132 unsigned pos, len;
133 int infd, outfd;
134
135 // Tables only used for deflation
136 unsigned short *hashhead, *hashchain;
137 )
138
139 // little endian bit buffer
140 struct bitbuf {
141 int fd, bitpos, len, max;
142 char buf[];
143 };
144
145 // malloc a struct bitbuf
bitbuf_init(int fd,int size)146 struct bitbuf *bitbuf_init(int fd, int size)
147 {
148 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
149
150 bb->max = size;
151 bb->fd = fd;
152
153 return bb;
154 }
155
156 // Advance bitpos without the overhead of recording bits
bitbuf_skip(struct bitbuf * bb,int bits)157 void bitbuf_skip(struct bitbuf *bb, int bits)
158 {
159 int pos = bb->bitpos + bits, len = bb->len << 3;
160
161 while (pos >= len) {
162 pos -= len;
163 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
164 if (bb->len < 1) perror_exit("inflate EOF");
165 }
166 bb->bitpos = pos;
167 }
168
169 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)170 static inline int bitbuf_bit(struct bitbuf *bb)
171 {
172 int bufpos = bb->bitpos>>3;
173
174 if (bufpos == bb->len) {
175 bitbuf_skip(bb, 0);
176 bufpos = 0;
177 }
178
179 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
180 }
181
182 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)183 unsigned bitbuf_get(struct bitbuf *bb, int bits)
184 {
185 int result = 0, offset = 0;
186
187 while (bits) {
188 int click = bb->bitpos >> 3, blow, blen;
189
190 // Load more data if buffer empty
191 if (click == bb->len) bitbuf_skip(bb, click = 0);
192
193 // grab bits from next byte
194 blow = bb->bitpos & 7;
195 blen = 8-blow;
196 if (blen > bits) blen = bits;
197 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
198 offset += blen;
199 bits -= blen;
200 bb->bitpos += blen;
201 }
202
203 return result;
204 }
205
bitbuf_flush(struct bitbuf * bb)206 void bitbuf_flush(struct bitbuf *bb)
207 {
208 if (!bb->bitpos) return;
209
210 xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
211 memset(bb->buf, 0, bb->max);
212 bb->bitpos = 0;
213 }
214
bitbuf_put(struct bitbuf * bb,int data,int len)215 void bitbuf_put(struct bitbuf *bb, int data, int len)
216 {
217 while (len) {
218 int click = bb->bitpos >> 3, blow, blen;
219
220 // Flush buffer if necessary
221 if (click == bb->max) {
222 bitbuf_flush(bb);
223 click = 0;
224 }
225 blow = bb->bitpos & 7;
226 blen = 8-blow;
227 if (blen > len) blen = len;
228 bb->buf[click] |= data << blow;
229 bb->bitpos += blen;
230 data >>= blen;
231 len -= blen;
232 }
233 }
234
output_byte(char sym)235 static void output_byte(char sym)
236 {
237 int pos = TT.pos++ & 32767;
238
239 TT.data[pos] = sym;
240
241 if (!pos) {
242 xwrite(TT.outfd, TT.data, 32768);
243 if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
244 }
245 }
246
247 // Huffman coding uses bits to traverse a binary tree to a leaf node,
248 // By placing frequently occurring symbols at shorter paths, frequently
249 // used symbols may be represented in fewer bits than uncommon symbols.
250
251 struct huff {
252 unsigned short length[16];
253 unsigned short symbol[288];
254 };
255
256 // Create simple huffman tree from array of bit lengths.
257
258 // The symbols in the huffman trees are sorted (first by bit length
259 // of the code to reach them, then by symbol number). This means that given
260 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)261 static void len2huff(struct huff *huff, char bitlen[], int len)
262 {
263 int offset[16];
264 int i;
265
266 // Count number of codes at each bit length
267 memset(huff, 0, sizeof(struct huff));
268 for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
269
270 // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
271 *huff->length = *offset = 0;
272 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
273
274 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
275 }
276
277 // Fetch and decode next huffman coded symbol from bitbuf.
278 // This takes advantage of the sorting to navigate the tree as an array:
279 // each time we fetch a bit we have all the codes at that bit level in
280 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)281 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
282 {
283 unsigned short *length = huff->length;
284 int start = 0, offset = 0;
285
286 // Traverse through the bit lengths until our code is in this range
287 for (;;) {
288 offset = (offset << 1) | bitbuf_bit(bb);
289 start += *++length;
290 if ((offset -= *length) < 0) break;
291 if ((length - huff->length) & 16) error_exit("bad symbol");
292 }
293
294 return huff->symbol[start + offset];
295 }
296
297 // Decompress deflated data from bitbuf to TT.outfd.
inflate(struct bitbuf * bb)298 static void inflate(struct bitbuf *bb)
299 {
300 TT.crc = ~0;
301 // repeat until spanked
302 for (;;) {
303 int final, type;
304
305 final = bitbuf_get(bb, 1);
306 type = bitbuf_get(bb, 2);
307
308 if (type == 3) error_exit("bad type");
309
310 // Uncompressed block?
311 if (!type) {
312 int len, nlen;
313
314 // Align to byte, read length
315 bitbuf_skip(bb, (8-bb->bitpos)&7);
316 len = bitbuf_get(bb, 16);
317 nlen = bitbuf_get(bb, 16);
318 if (len != (0xffff & ~nlen)) error_exit("bad len");
319
320 // Dump literal output data
321 while (len) {
322 int pos = bb->bitpos >> 3, bblen = bb->len - pos;
323 char *p = bb->buf+pos;
324
325 // dump bytes until done or end of current bitbuf contents
326 if (bblen > len) bblen = len;
327 pos = bblen;
328 while (pos--) output_byte(*(p++));
329 bitbuf_skip(bb, bblen << 3);
330 len -= bblen;
331 }
332
333 // Compressed block
334 } else {
335 struct huff *disthuff, *lithuff;
336
337 // Dynamic huffman codes?
338 if (type == 2) {
339 struct huff *h2 = ((struct huff *)toybuf)+1;
340 int i, litlen, distlen, hufflen;
341 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
342 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
343
344 // The huffman trees are stored as a series of bit lengths
345 litlen = bitbuf_get(bb, 5)+257; // max 288
346 distlen = bitbuf_get(bb, 5)+1; // max 32
347 hufflen = bitbuf_get(bb, 4)+4; // max 19
348
349 // The literal and distance codes are themselves compressed, in
350 // a complicated way: an array of bit lengths (hufflen many
351 // entries, each 3 bits) is used to fill out an array of 19 entries
352 // in a magic order, leaving the rest 0. Then make a tree out of it:
353 memset(bits = toybuf+1, 0, 19);
354 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
355 len2huff(h2, bits, 19);
356
357 // Use that tree to read in the literal and distance bit lengths
358 for (i = 0; i < litlen + distlen;) {
359 int sym = huff_and_puff(bb, h2);
360
361 // 0-15 are literals, 16 = repeat previous code 3-6 times,
362 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
363 if (sym < 16) bits[i++] = sym;
364 else {
365 int len = sym & 2;
366
367 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
368 memset(bits+i, bits[i-1] * !(sym&3), len);
369 i += len;
370 }
371 }
372 if (i > litlen+distlen) error_exit("bad tree");
373
374 len2huff(lithuff = h2, bits, litlen);
375 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
376
377 // Static huffman codes
378 } else {
379 lithuff = TT.fixlithuff;
380 disthuff = TT.fixdisthuff;
381 }
382
383 // Use huffman tables to decode block of compressed symbols
384 for (;;) {
385 int sym = huff_and_puff(bb, lithuff);
386
387 // Literal?
388 if (sym < 256) output_byte(sym);
389
390 // Copy range?
391 else if (sym > 256) {
392 int len, dist;
393
394 sym -= 257;
395 len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
396 sym = huff_and_puff(bb, disthuff);
397 dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
398 sym = TT.pos & 32767;
399
400 while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
401
402 // End of block
403 } else break;
404 }
405 }
406
407 // Was that the last block?
408 if (final) break;
409 }
410
411 if (TT.pos & 32767) {
412 xwrite(TT.outfd, TT.data, TT.pos & 32767);
413 if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
414 }
415 }
416
417 // Deflate from TT.infd to bitbuf
418 // For deflate, TT.len = input read, TT.pos = input consumed
deflate(struct bitbuf * bb)419 static void deflate(struct bitbuf *bb)
420 {
421 char *data = TT.data;
422 int len, final = 0;
423
424 TT.crc = ~0;
425
426 while (!final) {
427 // Read next half-window of data if we haven't hit EOF yet.
428 len = readall(TT.infd, data+(TT.len&32768), 32768);
429 if (len < 0) perror_exit("read"); // todo: add filename
430 if (len != 32768) final++;
431 if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
432 // TT.len += len; crcfunc advances len
433
434 // store block as literal
435 bitbuf_put(bb, final, 1);
436 bitbuf_put(bb, 0, 1);
437
438 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
439 bitbuf_put(bb, len, 16);
440 bitbuf_put(bb, 0xffff & ~len, 16);
441
442 // repeat until spanked
443 while (TT.pos != TT.len) {
444 unsigned pos = TT.pos & 65535;
445
446 bitbuf_put(bb, data[pos], 8);
447
448 // need to refill buffer?
449 if (!(32767 & ++TT.pos) && !final) break;
450 }
451 }
452 bitbuf_flush(bb);
453 }
454
455 // Allocate memory for deflate/inflate.
init_deflate(int compress)456 static void init_deflate(int compress)
457 {
458 int i, n = 1;
459
460 // compress needs 64k data and 32k each for hashhead and hashchain.
461 // decompress just needs 32k data.
462 TT.data = xmalloc(32768*(compress ? 4 : 1));
463 if (compress) {
464 TT.hashhead = (unsigned short *)(TT.data + 65536);
465 TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
466 }
467
468 // Calculate lenbits, lenbase, distbits, distbase
469 *TT.lenbase = 3;
470 for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
471 if (i>4) {
472 if (!(i&3)) {
473 TT.lenbits[i]++;
474 n <<= 1;
475 }
476 if (i == 27) n--;
477 else TT.lenbits[i+1] = TT.lenbits[i];
478 }
479 TT.lenbase[i+1] = n + TT.lenbase[i];
480 }
481 n = 0;
482 for (i = 0; i<sizeof(TT.distbits); i++) {
483 TT.distbase[i] = 1<<n;
484 if (i) TT.distbase[i] += TT.distbase[i-1];
485 if (i>3 && !(i&1)) n++;
486 TT.distbits[i] = n;
487 }
488
489 // Init fixed huffman tables
490 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
491 len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
492 memset(toybuf, 5, 30);
493 len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
494 }
495
496 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)497 static int is_gzip(struct bitbuf *bb)
498 {
499 int flags;
500
501 // Confirm signature
502 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
503 return 0;
504 bitbuf_skip(bb, 6*8);
505
506 // Skip extra, name, comment, header CRC fields
507 if (flags & 4) bitbuf_skip(bb, 16);
508 if (flags & 8) while (bitbuf_get(bb, 8));
509 if (flags & 16) while (bitbuf_get(bb, 8));
510 if (flags & 2) bitbuf_skip(bb, 16);
511
512 return 1;
513 }
514
gzip_crc(char * data,int len)515 void gzip_crc(char *data, int len)
516 {
517 int i;
518 unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
519
520 crc = TT.crc;
521 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
522 TT.crc = crc;
523 TT.len += len;
524 }
525
do_gzip(int fd,char * name)526 static void do_gzip(int fd, char *name)
527 {
528 struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
529
530 // Header from RFC 1952 section 2.2:
531 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
532 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
533 // Operating System (FF=unknown)
534
535 TT.infd = fd;
536 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
537
538 // Use last 1k of toybuf for little endian crc table
539 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
540 TT.crcfunc = gzip_crc;
541
542 deflate(bb);
543
544 // tail: crc32, len32
545
546 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
547 bitbuf_put(bb, ~TT.crc, 32);
548 bitbuf_put(bb, TT.len, 32);
549
550 bitbuf_flush(bb);
551 free(bb);
552 }
553
do_zcat(int fd,char * name)554 static void do_zcat(int fd, char *name)
555 {
556 struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
557
558 if (!is_gzip(bb)) error_exit("not gzip");
559 TT.outfd = 1;
560
561 // Use last 1k of toybuf for little endian crc table
562 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
563 TT.crcfunc = gzip_crc;
564
565 inflate(bb);
566
567 // tail: crc32, len32
568
569 bitbuf_skip(bb, (8-bb->bitpos)&7);
570 if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
571 error_exit("bad crc");
572 free(bb);
573 }
574
575 // Parse many different kinds of command line argument:
576
compress_main(void)577 void compress_main(void)
578 {
579 // todo: this
580 printf("hello world");
581 }
582
583 //#define CLEANUP_compress
584 //#define FOR_zcat
585 //#include "generated/flags.h"
586
zcat_main(void)587 void zcat_main(void)
588 {
589 init_deflate(0);
590
591 loopfiles(toys.optargs, do_zcat);
592 }
593
gunzip_main(void)594 void gunzip_main(void)
595 {
596 init_deflate(0);
597
598 loopfiles(toys.optargs, do_zcat);
599 }
600
gzip_main(void)601 void gzip_main(void)
602 {
603 init_deflate(1);
604
605 loopfiles(toys.optargs, do_gzip);
606 }
607