1 /* gzappend -- command to append to a gzip file
2 
3   Copyright (C) 2003, 2012 Mark Adler, all rights reserved
4   version 1.2, 11 Oct 2012
5 
6   This software is provided 'as-is', without any express or implied
7   warranty.  In no event will the author be held liable for any damages
8   arising from the use of this software.
9 
10   Permission is granted to anyone to use this software for any purpose,
11   including commercial applications, and to alter it and redistribute it
12   freely, subject to the following restrictions:
13 
14   1. The origin of this software must not be misrepresented; you must not
15      claim that you wrote the original software. If you use this software
16      in a product, an acknowledgment in the product documentation would be
17      appreciated but is not required.
18   2. Altered source versions must be plainly marked as such, and must not be
19      misrepresented as being the original software.
20   3. This notice may not be removed or altered from any source distribution.
21 
22   Mark Adler    madler@alumni.caltech.edu
23  */
24 
25 /*
26  * Change history:
27  *
28  * 1.0  19 Oct 2003     - First version
29  * 1.1   4 Nov 2003     - Expand and clarify some comments and notes
30  *                      - Add version and copyright to help
31  *                      - Send help to stdout instead of stderr
32  *                      - Add some preemptive typecasts
33  *                      - Add L to constants in lseek() calls
34  *                      - Remove some debugging information in error messages
35  *                      - Use new data_type definition for zlib 1.2.1
 *                      - Simplify and unify file operations
37  *                      - Finish off gzip file in gztack()
38  *                      - Use deflatePrime() instead of adding empty blocks
39  *                      - Keep gzip file clean on appended file read errors
40  *                      - Use in-place rotate instead of auxiliary buffer
41  *                        (Why you ask?  Because it was fun to write!)
42  * 1.2  11 Oct 2012     - Fix for proper z_const usage
43  *                      - Check for input buffer malloc failure
44  */
45 
46 /*
47    gzappend takes a gzip file and appends to it, compressing files from the
48    command line or data from stdin.  The gzip file is written to directly, to
49    avoid copying that file, in case it's large.  Note that this results in the
50    unfriendly behavior that if gzappend fails, the gzip file is corrupted.
51 
52    This program was written to illustrate the use of the new Z_BLOCK option of
53    zlib 1.2.x's inflate() function.  This option returns from inflate() at each
54    block boundary to facilitate locating and modifying the last block bit at
55    the start of the final deflate block.  Also whether using Z_BLOCK or not,
56    another required feature of zlib 1.2.x is that inflate() now provides the
   number of unused bits in the last input byte used.  gzappend will not work
58    with versions of zlib earlier than 1.2.1.
59 
60    gzappend first decompresses the gzip file internally, discarding all but
61    the last 32K of uncompressed data, and noting the location of the last block
62    bit and the number of unused bits in the last byte of the compressed data.
63    The gzip trailer containing the CRC-32 and length of the uncompressed data
64    is verified.  This trailer will be later overwritten.
65 
66    Then the last block bit is cleared by seeking back in the file and rewriting
67    the byte that contains it.  Seeking forward, the last byte of the compressed
68    data is saved along with the number of unused bits to initialize deflate.
69 
70    A deflate process is initialized, using the last 32K of the uncompressed
71    data from the gzip file to initialize the dictionary.  If the total
72    uncompressed data was less than 32K, then all of it is used to initialize
73    the dictionary.  The deflate output bit buffer is also initialized with the
74    last bits from the original deflate stream.  From here on, the data to
75    append is simply compressed using deflate, and written to the gzip file.
76    When that is complete, the new CRC-32 and uncompressed length are written
77    as the trailer of the gzip file.
78  */
79 
80 #include <stdio.h>
81 #include <stdlib.h>
82 #include <string.h>
83 #include <fcntl.h>
84 #include <unistd.h>
85 #include "zlib.h"
86 
/* "local" marks functions that are private to this file */
#define local static
/* log base 2 of the I/O buffer size (stored in file.size by gzscan()) */
#define LGCHUNK 14
/* size in bytes of the read, input, and output buffers: 1 << 14 = 16384 */
#define CHUNK (1U << LGCHUNK)
/* size in bytes of the sliding window that holds the most recent
   uncompressed data while the gzip file is scanned (32K) */
#define DSIZE 32768U
91 
/* print an error message and terminate with extreme prejudice -- the message
   written to stderr is "gzappend error: " followed by msg1 and then msg2 with
   nothing in between, and the process exits with status 1, so this function
   never returns; neither string is modified, so string literals may be
   passed for either argument */
local void bye(const char *msg1, const char *msg2)
{
    fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
    exit(1);
}
98 
/* return the greatest common divisor of a and b -- this is Euclid's
   algorithm by subtraction, except that the amount subtracted from the
   larger value is the smaller value doubled as many times as will still fit,
   which keeps it fast when one argument is much greater than the other; the
   values are never swapped, whichever one is currently larger is reduced in
   place; if either argument is zero the other argument is returned */
local unsigned gcd(unsigned a, unsigned b)
{
    while (a != 0 && b != 0) {
        /* pick the value to reduce and the value to reduce it by */
        unsigned *larger = a > b ? &a : &b;
        unsigned step = a > b ? b : a;

        /* double the step while twice the step still fits in the larger */
        while (*larger - step >= step)
            step <<= 1;
        *larger -= step;
    }

    /* one of the two is now zero, so the sum is the survivor */
    return a + b;
}
121 
/* rotate list[0..len-1] left by rot positions, in place -- on return the
   entry at index i is the entry that was at index (i + rot) % len; rot may
   be any value (it is reduced modulo len), and lists shorter than two entries
   are left untouched; no auxiliary buffer is used */
local void rotate(unsigned char *list, unsigned len, unsigned rot)
{
    unsigned char tmp;
    unsigned cycles;
    unsigned start, to, from;

    /* normalize rot and handle degenerate cases */
    if (len < 2) return;
    if (rot >= len) rot %= len;
    if (rot == 0) return;

    /* do simple left shift by one -- source and destination overlap, so
       memmove() is required here (memcpy() would be undefined behavior) */
    if (rot == 1) {
        tmp = list[0];
        memmove(list, list + 1, len - 1);
        list[len - 1] = tmp;
        return;
    }

    /* do simple right shift by one */
    if (rot == len - 1) {
        tmp = list[len - 1];
        memmove(list + 1, list, len - 1);
        list[0] = tmp;
        return;
    }

    /* otherwise do rotate as a set of cycles in place -- indices are used
       instead of pointers so that no pointer past the end of the list is
       ever formed, and the wrap is computed without overflowing */
    cycles = gcd(len, rot);             /* number of cycles */
    do {
        start = from = cycles;          /* start index is arbitrary */
        tmp = list[from];               /* save entry to be overwritten */
        for (;;) {
            to = from;                  /* next step in cycle */
            /* go right rot positions, wrapping around the end */
            from = from >= len - rot ? from - (len - rot) : from + rot;
            if (from == start) break;   /* all but one shifted */
            list[to] = list[from];      /* shift left */
        }
        list[to] = tmp;                 /* complete the circle */
    } while (--cycles);
}
168 
/* structure for gzip file read operations -- holds a file descriptor and a
   buffer that readin() refills from it; read1() and skip() consume bytes
   from the buffer by advancing next and decreasing left */
typedef struct {
    int fd;                     /* file descriptor */
    int size;                   /* 1 << size is bytes in buf */
    unsigned left;              /* bytes available at next */
    unsigned char *buf;         /* buffer */
    z_const unsigned char *next;    /* next byte in buffer */
    char *name;                 /* file name for error messages */
} file;
178 
179 /* reload buffer */
readin(file * in)180 local int readin(file *in)
181 {
182     int len;
183 
184     len = read(in->fd, in->buf, 1 << in->size);
185     if (len == -1) bye("error reading ", in->name);
186     in->left = (unsigned)len;
187     in->next = in->buf;
188     return len;
189 }
190 
191 /* read from file in, exit if end-of-file */
readmore(file * in)192 local int readmore(file *in)
193 {
194     if (readin(in) == 0) bye("unexpected end of ", in->name);
195     return 0;
196 }
197 
/* return the next byte from in, reloading the buffer first if it is empty
   (which exits on end of file) -- the argument is parenthesized so that any
   pointer expression such as &gz can be passed, but it is evaluated more than
   once and so must not have side effects */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
                   (in)->left--, *((in)->next)++)
200 
201 /* skip over n bytes of in */
skip(file * in,unsigned n)202 local void skip(file *in, unsigned n)
203 {
204     unsigned bypass;
205 
206     if (n > in->left) {
207         n -= in->left;
208         bypass = n & ~((1U << in->size) - 1);
209         if (bypass) {
210             if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
211                 bye("seeking ", in->name);
212             n -= bypass;
213         }
214         readmore(in);
215         if (n > in->left)
216             bye("unexpected end of ", in->name);
217     }
218     in->left -= n;
219     in->next += n;
220 }
221 
222 /* read a four-byte unsigned integer, little-endian, from in */
read4(file * in)223 unsigned long read4(file *in)
224 {
225     unsigned long val;
226 
227     val = read1(in);
228     val += (unsigned)read1(in) << 8;
229     val += (unsigned long)read1(in) << 16;
230     val += (unsigned long)read1(in) << 24;
231     return val;
232 }
233 
234 /* skip over gzip header */
gzheader(file * in)235 local void gzheader(file *in)
236 {
237     int flags;
238     unsigned n;
239 
240     if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
241     if (read1(in) != 8) bye("unknown compression method in", in->name);
242     flags = read1(in);
243     if (flags & 0xe0) bye("unknown header flags set in", in->name);
244     skip(in, 6);
245     if (flags & 4) {
246         n = read1(in);
247         n += (unsigned)(read1(in)) << 8;
248         skip(in, n);
249     }
250     if (flags & 8) while (read1(in) != 0) ;
251     if (flags & 16) while (read1(in) != 0) ;
252     if (flags & 2) skip(in, 2);
253 }
254 
255 /* decompress gzip file "name", return strm with a deflate stream ready to
256    continue compression of the data in the gzip file, and return a file
257    descriptor pointing to where to write the compressed data -- the deflate
258    stream is initialized to compress using level "level" */
gzscan(char * name,z_stream * strm,int level)259 local int gzscan(char *name, z_stream *strm, int level)
260 {
261     int ret, lastbit, left, full;
262     unsigned have;
263     unsigned long crc, tot;
264     unsigned char *window;
265     off_t lastoff, end;
266     file gz;
267 
268     /* open gzip file */
269     gz.name = name;
270     gz.fd = open(name, O_RDWR, 0);
271     if (gz.fd == -1) bye("cannot open ", name);
272     gz.buf = malloc(CHUNK);
273     if (gz.buf == NULL) bye("out of memory", "");
274     gz.size = LGCHUNK;
275     gz.left = 0;
276 
277     /* skip gzip header */
278     gzheader(&gz);
279 
280     /* prepare to decompress */
281     window = malloc(DSIZE);
282     if (window == NULL) bye("out of memory", "");
283     strm->zalloc = Z_NULL;
284     strm->zfree = Z_NULL;
285     strm->opaque = Z_NULL;
286     ret = inflateInit2(strm, -15);
287     if (ret != Z_OK) bye("out of memory", " or library mismatch");
288 
289     /* decompress the deflate stream, saving append information */
290     lastbit = 0;
291     lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
292     left = 0;
293     strm->avail_in = gz.left;
294     strm->next_in = gz.next;
295     crc = crc32(0L, Z_NULL, 0);
296     have = full = 0;
297     do {
298         /* if needed, get more input */
299         if (strm->avail_in == 0) {
300             readmore(&gz);
301             strm->avail_in = gz.left;
302             strm->next_in = gz.next;
303         }
304 
305         /* set up output to next available section of sliding window */
306         strm->avail_out = DSIZE - have;
307         strm->next_out = window + have;
308 
309         /* inflate and check for errors */
310         ret = inflate(strm, Z_BLOCK);
311         if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
312         if (ret == Z_MEM_ERROR) bye("out of memory", "");
313         if (ret == Z_DATA_ERROR)
314             bye("invalid compressed data--format violated in", name);
315 
316         /* update crc and sliding window pointer */
317         crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
318         if (strm->avail_out)
319             have = DSIZE - strm->avail_out;
320         else {
321             have = 0;
322             full = 1;
323         }
324 
325         /* process end of block */
326         if (strm->data_type & 128) {
327             if (strm->data_type & 64)
328                 left = strm->data_type & 0x1f;
329             else {
330                 lastbit = strm->data_type & 0x1f;
331                 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
332             }
333         }
334     } while (ret != Z_STREAM_END);
335     inflateEnd(strm);
336     gz.left = strm->avail_in;
337     gz.next = strm->next_in;
338 
339     /* save the location of the end of the compressed data */
340     end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
341 
342     /* check gzip trailer and save total for deflate */
343     if (crc != read4(&gz))
344         bye("invalid compressed data--crc mismatch in ", name);
345     tot = strm->total_out;
346     if ((tot & 0xffffffffUL) != read4(&gz))
347         bye("invalid compressed data--length mismatch in", name);
348 
349     /* if not at end of file, warn */
350     if (gz.left || readin(&gz))
351         fprintf(stderr,
352             "gzappend warning: junk at end of gzip file overwritten\n");
353 
354     /* clear last block bit */
355     lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
356     if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
357     *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
358     lseek(gz.fd, -1L, SEEK_CUR);
359     if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
360 
361     /* if window wrapped, build dictionary from window by rotating */
362     if (full) {
363         rotate(window, DSIZE, have);
364         have = DSIZE;
365     }
366 
367     /* set up deflate stream with window, crc, total_in, and leftover bits */
368     ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
369     if (ret != Z_OK) bye("out of memory", "");
370     deflateSetDictionary(strm, window, have);
371     strm->adler = crc;
372     strm->total_in = tot;
373     if (left) {
374         lseek(gz.fd, --end, SEEK_SET);
375         if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
376         deflatePrime(strm, 8 - left, *gz.buf);
377     }
378     lseek(gz.fd, end, SEEK_SET);
379 
380     /* clean up and return */
381     free(window);
382     free(gz.buf);
383     return gz.fd;
384 }
385 
386 /* append file "name" to gzip file gd using deflate stream strm -- if last
387    is true, then finish off the deflate stream at the end */
gztack(char * name,int gd,z_stream * strm,int last)388 local void gztack(char *name, int gd, z_stream *strm, int last)
389 {
390     int fd, len, ret;
391     unsigned left;
392     unsigned char *in, *out;
393 
394     /* open file to compress and append */
395     fd = 0;
396     if (name != NULL) {
397         fd = open(name, O_RDONLY, 0);
398         if (fd == -1)
399             fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
400                     name);
401     }
402 
403     /* allocate buffers */
404     in = malloc(CHUNK);
405     out = malloc(CHUNK);
406     if (in == NULL || out == NULL) bye("out of memory", "");
407 
408     /* compress input file and append to gzip file */
409     do {
410         /* get more input */
411         len = read(fd, in, CHUNK);
412         if (len == -1) {
413             fprintf(stderr,
414                     "gzappend warning: error reading %s, skipping rest ...\n",
415                     name);
416             len = 0;
417         }
418         strm->avail_in = (unsigned)len;
419         strm->next_in = in;
420         if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
421 
422         /* compress and write all available output */
423         do {
424             strm->avail_out = CHUNK;
425             strm->next_out = out;
426             ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
427             left = CHUNK - strm->avail_out;
428             while (left) {
429                 len = write(gd, out + CHUNK - strm->avail_out - left, left);
430                 if (len == -1) bye("writing gzip file", "");
431                 left -= (unsigned)len;
432             }
433         } while (strm->avail_out == 0 && ret != Z_STREAM_END);
434     } while (len != 0);
435 
436     /* write trailer after last entry */
437     if (last) {
438         deflateEnd(strm);
439         out[0] = (unsigned char)(strm->adler);
440         out[1] = (unsigned char)(strm->adler >> 8);
441         out[2] = (unsigned char)(strm->adler >> 16);
442         out[3] = (unsigned char)(strm->adler >> 24);
443         out[4] = (unsigned char)(strm->total_in);
444         out[5] = (unsigned char)(strm->total_in >> 8);
445         out[6] = (unsigned char)(strm->total_in >> 16);
446         out[7] = (unsigned char)(strm->total_in >> 24);
447         len = 8;
448         do {
449             ret = write(gd, out + 8 - len, len);
450             if (ret == -1) bye("writing gzip file", "");
451             len -= ret;
452         } while (len);
453         close(gd);
454     }
455 
456     /* clean up and return */
457     free(out);
458     free(in);
459     if (fd > 0) close(fd);
460 }
461 
462 /* process the compression level option if present, scan the gzip file, and
463    append the specified files, or append the data from stdin if no other file
464    names are provided on the command line -- the gzip file must be writable
465    and seekable */
main(int argc,char ** argv)466 int main(int argc, char **argv)
467 {
468     int gd, level;
469     z_stream strm;
470 
471     /* ignore command name */
472     argc--; argv++;
473 
474     /* provide usage if no arguments */
475     if (*argv == NULL) {
476         printf(
477             "gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n"
478                );
479         printf(
480             "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
481         return 0;
482     }
483 
484     /* set compression level */
485     level = Z_DEFAULT_COMPRESSION;
486     if (argv[0][0] == '-') {
487         if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
488             bye("invalid compression level", "");
489         level = argv[0][1] - '0';
490         if (*++argv == NULL) bye("no gzip file name after options", "");
491     }
492 
493     /* prepare to append to gzip file */
494     gd = gzscan(*argv++, &strm, level);
495 
496     /* append files on command line, or from stdin if none */
497     if (*argv == NULL)
498         gztack(NULL, gd, &strm, 1);
499     else
500         do {
501             gztack(*argv, gd, &strm, argv[1] == NULL);
502         } while (*++argv != NULL);
503     return 0;
504 }
505