1/* gzappend -- command to append to a gzip file
2
3  Copyright (C) 2003, 2012 Mark Adler, all rights reserved
4  version 1.2, 11 Oct 2012
5
6  This software is provided 'as-is', without any express or implied
7  warranty.  In no event will the author be held liable for any damages
8  arising from the use of this software.
9
10  Permission is granted to anyone to use this software for any purpose,
11  including commercial applications, and to alter it and redistribute it
12  freely, subject to the following restrictions:
13
14  1. The origin of this software must not be misrepresented; you must not
15     claim that you wrote the original software. If you use this software
16     in a product, an acknowledgment in the product documentation would be
17     appreciated but is not required.
18  2. Altered source versions must be plainly marked as such, and must not be
19     misrepresented as being the original software.
20  3. This notice may not be removed or altered from any source distribution.
21
22  Mark Adler    madler@alumni.caltech.edu
23 */
24
25/*
26 * Change history:
27 *
28 * 1.0  19 Oct 2003     - First version
29 * 1.1   4 Nov 2003     - Expand and clarify some comments and notes
30 *                      - Add version and copyright to help
31 *                      - Send help to stdout instead of stderr
32 *                      - Add some preemptive typecasts
33 *                      - Add L to constants in lseek() calls
34 *                      - Remove some debugging information in error messages
35 *                      - Use new data_type definition for zlib 1.2.1
 *                      - Simplify and unify file operations
37 *                      - Finish off gzip file in gztack()
38 *                      - Use deflatePrime() instead of adding empty blocks
39 *                      - Keep gzip file clean on appended file read errors
40 *                      - Use in-place rotate instead of auxiliary buffer
41 *                        (Why you ask?  Because it was fun to write!)
42 * 1.2  11 Oct 2012     - Fix for proper z_const usage
43 *                      - Check for input buffer malloc failure
44 */
45
46/*
47   gzappend takes a gzip file and appends to it, compressing files from the
48   command line or data from stdin.  The gzip file is written to directly, to
49   avoid copying that file, in case it's large.  Note that this results in the
50   unfriendly behavior that if gzappend fails, the gzip file is corrupted.
51
52   This program was written to illustrate the use of the new Z_BLOCK option of
53   zlib 1.2.x's inflate() function.  This option returns from inflate() at each
54   block boundary to facilitate locating and modifying the last block bit at
55   the start of the final deflate block.  Also whether using Z_BLOCK or not,
56   another required feature of zlib 1.2.x is that inflate() now provides the
   number of unused bits in the last input byte used.  gzappend will not work
58   with versions of zlib earlier than 1.2.1.
59
60   gzappend first decompresses the gzip file internally, discarding all but
61   the last 32K of uncompressed data, and noting the location of the last block
62   bit and the number of unused bits in the last byte of the compressed data.
63   The gzip trailer containing the CRC-32 and length of the uncompressed data
64   is verified.  This trailer will be later overwritten.
65
66   Then the last block bit is cleared by seeking back in the file and rewriting
67   the byte that contains it.  Seeking forward, the last byte of the compressed
68   data is saved along with the number of unused bits to initialize deflate.
69
70   A deflate process is initialized, using the last 32K of the uncompressed
71   data from the gzip file to initialize the dictionary.  If the total
72   uncompressed data was less than 32K, then all of it is used to initialize
73   the dictionary.  The deflate output bit buffer is also initialized with the
74   last bits from the original deflate stream.  From here on, the data to
75   append is simply compressed using deflate, and written to the gzip file.
76   When that is complete, the new CRC-32 and uncompressed length are written
77   as the trailer of the gzip file.
78 */
79
80#include <stdio.h>
81#include <stdlib.h>
82#include <string.h>
83#include <fcntl.h>
84#include <unistd.h>
85#include "zlib.h"
86
/* "local" marks a function as private to this file (it expands to static) */
#define local static
/* base-2 logarithm of the I/O buffer size */
#define LGCHUNK 14
/* size in bytes of the buffers used for file I/O (1 << 14 = 16384) */
#define CHUNK (1U << LGCHUNK)
/* size in bytes of the window buffer that gzscan() inflates into (32K) */
#define DSIZE 32768U
91
92/* print an error message and terminate with extreme prejudice */
93local void bye(char *msg1, char *msg2)
94{
95    fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
96    exit(1);
97}
98
/* compute the greatest common divisor of a and b by repeated subtraction
   (Euclid), where the amount subtracted from the larger value is the smaller
   value doubled as many times as still fits -- this keeps the loop count low
   when one argument is much greater than the other.  If either argument is
   zero the other one is returned (so gcd(0, 0) is 0) */
local unsigned gcd(unsigned a, unsigned b)
{
    while (a != 0 && b != 0) {
        int a_is_larger = a > b;
        unsigned big = a_is_larger ? a : b;
        unsigned step = a_is_larger ? b : a;

        /* double the step while twice the step still fits in big */
        while (big - step >= step)
            step <<= 1;

        /* reduce whichever operand was the larger one */
        if (a_is_larger)
            a -= step;
        else
            b -= step;
    }

    /* one of the two is zero here, the other is the result */
    return a + b;
}
121
122/* rotate list[0..len-1] left by rot positions, in place */
123local void rotate(unsigned char *list, unsigned len, unsigned rot)
124{
125    unsigned char tmp;
126    unsigned cycles;
127    unsigned char *start, *last, *to, *from;
128
129    /* normalize rot and handle degenerate cases */
130    if (len < 2) return;
131    if (rot >= len) rot %= len;
132    if (rot == 0) return;
133
134    /* pointer to last entry in list */
135    last = list + (len - 1);
136
137    /* do simple left shift by one */
138    if (rot == 1) {
139        tmp = *list;
140        memcpy(list, list + 1, len - 1);
141        *last = tmp;
142        return;
143    }
144
145    /* do simple right shift by one */
146    if (rot == len - 1) {
147        tmp = *last;
148        memmove(list + 1, list, len - 1);
149        *list = tmp;
150        return;
151    }
152
153    /* otherwise do rotate as a set of cycles in place */
154    cycles = gcd(len, rot);             /* number of cycles */
155    do {
156        start = from = list + cycles;   /* start index is arbitrary */
157        tmp = *from;                    /* save entry to be overwritten */
158        for (;;) {
159            to = from;                  /* next step in cycle */
160            from += rot;                /* go right rot positions */
161            if (from > last) from -= len;   /* (pointer better not wrap) */
162            if (from == start) break;   /* all but one shifted */
163            *to = *from;                /* shift left */
164        }
165        *to = tmp;                      /* complete the circle */
166    } while (--cycles);
167}
168
/* structure for gzip file read operations: a file descriptor together with
   the buffer that readin() refills from it and the current read position
   inside that buffer */
typedef struct {
    int fd;                     /* file descriptor that readin() reads from */
    int size;                   /* 1 << size is bytes in buf (a log2 value) */
    unsigned left;              /* unread bytes available starting at next */
    unsigned char *buf;         /* start of the buffer filled by readin() */
    z_const unsigned char *next;    /* next unread byte in buffer */
    char *name;                 /* file name for error messages */
} file;
178
179/* reload buffer */
180local int readin(file *in)
181{
182    int len;
183
184    len = read(in->fd, in->buf, 1 << in->size);
185    if (len == -1) bye("error reading ", in->name);
186    in->left = (unsigned)len;
187    in->next = in->buf;
188    return len;
189}
190
191/* read from file in, exit if end-of-file */
192local int readmore(file *in)
193{
194    if (readin(in) == 0) bye("unexpected end of ", in->name);
195    return 0;
196}
197
/* fetch the next byte from in, first refilling the buffer with readmore()
   when it is empty.  The argument is evaluated several times, so it must not
   have side effects; it is parenthesized in the expansion so that any pointer
   expression (for example &gz) can be passed */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
                   (in)->left--, *((in)->next)++)
200
201/* skip over n bytes of in */
202local void skip(file *in, unsigned n)
203{
204    unsigned bypass;
205
206    if (n > in->left) {
207        n -= in->left;
208        bypass = n & ~((1U << in->size) - 1);
209        if (bypass) {
210            if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
211                bye("seeking ", in->name);
212            n -= bypass;
213        }
214        readmore(in);
215        if (n > in->left)
216            bye("unexpected end of ", in->name);
217    }
218    in->left -= n;
219    in->next += n;
220}
221
222/* read a four-byte unsigned integer, little-endian, from in */
223unsigned long read4(file *in)
224{
225    unsigned long val;
226
227    val = read1(in);
228    val += (unsigned)read1(in) << 8;
229    val += (unsigned long)read1(in) << 16;
230    val += (unsigned long)read1(in) << 24;
231    return val;
232}
233
234/* skip over gzip header */
235local void gzheader(file *in)
236{
237    int flags;
238    unsigned n;
239
240    if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
241    if (read1(in) != 8) bye("unknown compression method in", in->name);
242    flags = read1(in);
243    if (flags & 0xe0) bye("unknown header flags set in", in->name);
244    skip(in, 6);
245    if (flags & 4) {
246        n = read1(in);
247        n += (unsigned)(read1(in)) << 8;
248        skip(in, n);
249    }
250    if (flags & 8) while (read1(in) != 0) ;
251    if (flags & 16) while (read1(in) != 0) ;
252    if (flags & 2) skip(in, 2);
253}
254
255/* decompress gzip file "name", return strm with a deflate stream ready to
256   continue compression of the data in the gzip file, and return a file
257   descriptor pointing to where to write the compressed data -- the deflate
258   stream is initialized to compress using level "level" */
259local int gzscan(char *name, z_stream *strm, int level)
260{
261    int ret, lastbit, left, full;
262    unsigned have;
263    unsigned long crc, tot;
264    unsigned char *window;
265    off_t lastoff, end;
266    file gz;
267
268    /* open gzip file */
269    gz.name = name;
270    gz.fd = open(name, O_RDWR, 0);
271    if (gz.fd == -1) bye("cannot open ", name);
272    gz.buf = malloc(CHUNK);
273    if (gz.buf == NULL) bye("out of memory", "");
274    gz.size = LGCHUNK;
275    gz.left = 0;
276
277    /* skip gzip header */
278    gzheader(&gz);
279
280    /* prepare to decompress */
281    window = malloc(DSIZE);
282    if (window == NULL) bye("out of memory", "");
283    strm->zalloc = Z_NULL;
284    strm->zfree = Z_NULL;
285    strm->opaque = Z_NULL;
286    ret = inflateInit2(strm, -15);
287    if (ret != Z_OK) bye("out of memory", " or library mismatch");
288
289    /* decompress the deflate stream, saving append information */
290    lastbit = 0;
291    lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
292    left = 0;
293    strm->avail_in = gz.left;
294    strm->next_in = gz.next;
295    crc = crc32(0L, Z_NULL, 0);
296    have = full = 0;
297    do {
298        /* if needed, get more input */
299        if (strm->avail_in == 0) {
300            readmore(&gz);
301            strm->avail_in = gz.left;
302            strm->next_in = gz.next;
303        }
304
305        /* set up output to next available section of sliding window */
306        strm->avail_out = DSIZE - have;
307        strm->next_out = window + have;
308
309        /* inflate and check for errors */
310        ret = inflate(strm, Z_BLOCK);
311        if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
312        if (ret == Z_MEM_ERROR) bye("out of memory", "");
313        if (ret == Z_DATA_ERROR)
314            bye("invalid compressed data--format violated in", name);
315
316        /* update crc and sliding window pointer */
317        crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
318        if (strm->avail_out)
319            have = DSIZE - strm->avail_out;
320        else {
321            have = 0;
322            full = 1;
323        }
324
325        /* process end of block */
326        if (strm->data_type & 128) {
327            if (strm->data_type & 64)
328                left = strm->data_type & 0x1f;
329            else {
330                lastbit = strm->data_type & 0x1f;
331                lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
332            }
333        }
334    } while (ret != Z_STREAM_END);
335    inflateEnd(strm);
336    gz.left = strm->avail_in;
337    gz.next = strm->next_in;
338
339    /* save the location of the end of the compressed data */
340    end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
341
342    /* check gzip trailer and save total for deflate */
343    if (crc != read4(&gz))
344        bye("invalid compressed data--crc mismatch in ", name);
345    tot = strm->total_out;
346    if ((tot & 0xffffffffUL) != read4(&gz))
347        bye("invalid compressed data--length mismatch in", name);
348
349    /* if not at end of file, warn */
350    if (gz.left || readin(&gz))
351        fprintf(stderr,
352            "gzappend warning: junk at end of gzip file overwritten\n");
353
354    /* clear last block bit */
355    lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
356    if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
357    *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
358    lseek(gz.fd, -1L, SEEK_CUR);
359    if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
360
361    /* if window wrapped, build dictionary from window by rotating */
362    if (full) {
363        rotate(window, DSIZE, have);
364        have = DSIZE;
365    }
366
367    /* set up deflate stream with window, crc, total_in, and leftover bits */
368    ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
369    if (ret != Z_OK) bye("out of memory", "");
370    deflateSetDictionary(strm, window, have);
371    strm->adler = crc;
372    strm->total_in = tot;
373    if (left) {
374        lseek(gz.fd, --end, SEEK_SET);
375        if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
376        deflatePrime(strm, 8 - left, *gz.buf);
377    }
378    lseek(gz.fd, end, SEEK_SET);
379
380    /* clean up and return */
381    free(window);
382    free(gz.buf);
383    return gz.fd;
384}
385
386/* append file "name" to gzip file gd using deflate stream strm -- if last
387   is true, then finish off the deflate stream at the end */
388local void gztack(char *name, int gd, z_stream *strm, int last)
389{
390    int fd, len, ret;
391    unsigned left;
392    unsigned char *in, *out;
393
394    /* open file to compress and append */
395    fd = 0;
396    if (name != NULL) {
397        fd = open(name, O_RDONLY, 0);
398        if (fd == -1)
399            fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
400                    name);
401    }
402
403    /* allocate buffers */
404    in = malloc(CHUNK);
405    out = malloc(CHUNK);
406    if (in == NULL || out == NULL) bye("out of memory", "");
407
408    /* compress input file and append to gzip file */
409    do {
410        /* get more input */
411        len = read(fd, in, CHUNK);
412        if (len == -1) {
413            fprintf(stderr,
414                    "gzappend warning: error reading %s, skipping rest ...\n",
415                    name);
416            len = 0;
417        }
418        strm->avail_in = (unsigned)len;
419        strm->next_in = in;
420        if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
421
422        /* compress and write all available output */
423        do {
424            strm->avail_out = CHUNK;
425            strm->next_out = out;
426            ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
427            left = CHUNK - strm->avail_out;
428            while (left) {
429                len = write(gd, out + CHUNK - strm->avail_out - left, left);
430                if (len == -1) bye("writing gzip file", "");
431                left -= (unsigned)len;
432            }
433        } while (strm->avail_out == 0 && ret != Z_STREAM_END);
434    } while (len != 0);
435
436    /* write trailer after last entry */
437    if (last) {
438        deflateEnd(strm);
439        out[0] = (unsigned char)(strm->adler);
440        out[1] = (unsigned char)(strm->adler >> 8);
441        out[2] = (unsigned char)(strm->adler >> 16);
442        out[3] = (unsigned char)(strm->adler >> 24);
443        out[4] = (unsigned char)(strm->total_in);
444        out[5] = (unsigned char)(strm->total_in >> 8);
445        out[6] = (unsigned char)(strm->total_in >> 16);
446        out[7] = (unsigned char)(strm->total_in >> 24);
447        len = 8;
448        do {
449            ret = write(gd, out + 8 - len, len);
450            if (ret == -1) bye("writing gzip file", "");
451            len -= ret;
452        } while (len);
453        close(gd);
454    }
455
456    /* clean up and return */
457    free(out);
458    free(in);
459    if (fd > 0) close(fd);
460}
461
462/* process the compression level option if present, scan the gzip file, and
463   append the specified files, or append the data from stdin if no other file
464   names are provided on the command line -- the gzip file must be writable
465   and seekable */
466int main(int argc, char **argv)
467{
468    int gd, level;
469    z_stream strm;
470
471    /* ignore command name */
472    argc--; argv++;
473
474    /* provide usage if no arguments */
475    if (*argv == NULL) {
476        printf(
477            "gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n"
478               );
479        printf(
480            "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
481        return 0;
482    }
483
484    /* set compression level */
485    level = Z_DEFAULT_COMPRESSION;
486    if (argv[0][0] == '-') {
487        if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
488            bye("invalid compression level", "");
489        level = argv[0][1] - '0';
490        if (*++argv == NULL) bye("no gzip file name after options", "");
491    }
492
493    /* prepare to append to gzip file */
494    gd = gzscan(*argv++, &strm, level);
495
496    /* append files on command line, or from stdin if none */
497    if (*argv == NULL)
498        gztack(NULL, gd, &strm, 1);
499    else
500        do {
501            gztack(*argv, gd, &strm, argv[1] == NULL);
502        } while (*++argv != NULL);
503    return 0;
504}
505