1/* gzjoin -- command to join gzip files into one gzip file
2
3  Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
4  version 1.2, 14 Aug 2012
5
6  This software is provided 'as-is', without any express or implied
7  warranty.  In no event will the author be held liable for any damages
8  arising from the use of this software.
9
10  Permission is granted to anyone to use this software for any purpose,
11  including commercial applications, and to alter it and redistribute it
12  freely, subject to the following restrictions:
13
14  1. The origin of this software must not be misrepresented; you must not
15     claim that you wrote the original software. If you use this software
16     in a product, an acknowledgment in the product documentation would be
17     appreciated but is not required.
18  2. Altered source versions must be plainly marked as such, and must not be
19     misrepresented as being the original software.
20  3. This notice may not be removed or altered from any source distribution.
21
22  Mark Adler    madler@alumni.caltech.edu
23 */
24
25/*
26 * Change history:
27 *
28 * 1.0  11 Dec 2004     - First version
29 * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
30 * 1.2  14 Aug 2012     - Clean up for z_const usage
31 */
32
33/*
34   gzjoin takes one or more gzip files on the command line and writes out a
35   single gzip file that will uncompress to the concatenation of the
36   uncompressed data from the individual gzip files.  gzjoin does this without
37   having to recompress any of the data and without having to calculate a new
38   crc32 for the concatenated uncompressed data.  gzjoin does however have to
39   decompress all of the input data in order to find the bits in the compressed
40   data that need to be modified to concatenate the streams.
41
42   gzjoin does not do an integrity check on the input gzip files other than
43   checking the gzip header and decompressing the compressed data.  They are
44   otherwise assumed to be complete and correct.
45
46   Each joint between gzip files removes at least 18 bytes of previous trailer
47   and subsequent header, and inserts an average of about three bytes to the
48   compressed data in order to connect the streams.  The output gzip file
49   has a minimal ten-byte gzip header with no file name or modification time.
50
51   This program was written to illustrate the use of the Z_BLOCK option of
52   inflate() and the crc32_combine() function.  gzjoin will not compile with
53   versions of zlib earlier than 1.2.3.
54 */
55
56#include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
57#include <stdlib.h>     /* exit(), malloc(), free() */
58#include <fcntl.h>      /* open() */
59#include <unistd.h>     /* close(), read(), lseek() */
60#include "zlib.h"
61    /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
62
/* "local" labels the helper functions defined in this file; it expands to
   static */
63#define local static
64
/* Report a fatal error and terminate the program.

   Writes "gzjoin error: <why1><why2>, output incomplete" to stderr and calls
   exit(1), so control never returns to the caller.  The int return type exists
   only so that a call can be used as an operand inside an expression.

   why1, why2 -- the two halves of the message; they are only read, so they
                 are taken as pointers to const and string literals may be
                 passed directly */
local int bail(const char *why1, const char *why2)
{
    fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
    exit(1);
    return 0;               /* not reached -- satisfies the return type */
}
72
73/* -- simple buffered file input with access to the buffer -- */
74
/* Size in bytes of the buffer that bopen() allocates for each input file.
   bskip() takes a remainder with "& (CHUNK - 1)", which is only correct for a
   power of two, and the byte counts are kept in unsigned variables. */
75#define CHUNK 32768         /* must be a power of two and fit in unsigned */
76
77/* bin buffered input file type: a file descriptor together with a buffer of
   CHUNK bytes; the unread data are the "left" bytes starting at "next" */
78typedef struct {
79    char *name;             /* name of file for error messages (bopen() stores
                               the caller's pointer, it does not copy) */
80    int fd;                 /* file descriptor */
81    unsigned left;          /* bytes remaining at next */
82    unsigned char *next;    /* next byte to read */
83    unsigned char *buf;     /* allocated buffer of length CHUNK */
84} bin;
85
86/* close a buffered file and free allocated memory */
87local void bclose(bin *in)
88{
89    if (in != NULL) {
90        if (in->fd != -1)
91            close(in->fd);
92        if (in->buf != NULL)
93            free(in->buf);
94        free(in);
95    }
96}
97
98/* open a buffered file for input, return a pointer to type bin, or NULL on
99   failure */
100local bin *bopen(char *name)
101{
102    bin *in;
103
104    in = malloc(sizeof(bin));
105    if (in == NULL)
106        return NULL;
107    in->buf = malloc(CHUNK);
108    in->fd = open(name, O_RDONLY, 0);
109    if (in->buf == NULL || in->fd == -1) {
110        bclose(in);
111        return NULL;
112    }
113    in->left = 0;
114    in->next = in->buf;
115    in->name = name;
116    return in;
117}
118
119/* load buffer from file, return -1 on read error, 0 or 1 on success, with
120   1 indicating that end-of-file was reached */
121local int bload(bin *in)
122{
123    long len;
124
125    if (in == NULL)
126        return -1;
127    if (in->left != 0)
128        return 0;
129    in->next = in->buf;
130    do {
131        len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
132        if (len < 0)
133            return -1;
134        in->left += (unsigned)len;
135    } while (len != 0 && in->left < CHUNK);
136    return len == 0 ? 1 : 0;
137}
138
/* Get the next byte from the buffered file in, calling bload() first when the
   buffer is empty.  If the buffer is still empty after that, bail() is called
   to report an unexpected end of file.  The argument is evaluated more than
   once, so it must not have side effects; it is parenthesized at every use so
   that any pointer-valued expression may be passed. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))
143
144/* get a four-byte little-endian unsigned integer from file */
145local unsigned long bget4(bin *in)
146{
147    unsigned long val;
148
149    val = bget(in);
150    val += (unsigned long)(bget(in)) << 8;
151    val += (unsigned long)(bget(in)) << 16;
152    val += (unsigned long)(bget(in)) << 24;
153    return val;
154}
155
156/* skip bytes in file */
157local void bskip(bin *in, unsigned skip)
158{
159    /* check pointer */
160    if (in == NULL)
161        return;
162
163    /* easy case -- skip bytes in buffer */
164    if (skip <= in->left) {
165        in->left -= skip;
166        in->next += skip;
167        return;
168    }
169
170    /* skip what's in buffer, discard buffer contents */
171    skip -= in->left;
172    in->left = 0;
173
174    /* seek past multiples of CHUNK bytes */
175    if (skip > CHUNK) {
176        unsigned left;
177
178        left = skip & (CHUNK - 1);
179        if (left == 0) {
180            /* exact number of chunks: seek all the way minus one byte to check
181               for end-of-file with a read */
182            lseek(in->fd, skip - 1, SEEK_CUR);
183            if (read(in->fd, in->buf, 1) != 1)
184                bail("unexpected end of file on ", in->name);
185            return;
186        }
187
188        /* skip the integral chunks, update skip with remainder */
189        lseek(in->fd, skip - left, SEEK_CUR);
190        skip = left;
191    }
192
193    /* read more input and skip remainder */
194    bload(in);
195    if (skip > in->left)
196        bail("unexpected end of file on ", in->name);
197    in->left -= skip;
198    in->next += skip;
199}
200
201/* -- end of buffered input functions -- */
202
203/* skip the gzip header from file in */
204local void gzhead(bin *in)
205{
206    int flags;
207
208    /* verify gzip magic header and compression method */
209    if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
210        bail(in->name, " is not a valid gzip file");
211
212    /* get and verify flags */
213    flags = bget(in);
214    if ((flags & 0xe0) != 0)
215        bail("unknown reserved bits set in ", in->name);
216
217    /* skip modification time, extra flags, and os */
218    bskip(in, 6);
219
220    /* skip extra field if present */
221    if (flags & 4) {
222        unsigned len;
223
224        len = bget(in);
225        len += (unsigned)(bget(in)) << 8;
226        bskip(in, len);
227    }
228
229    /* skip file name if present */
230    if (flags & 8)
231        while (bget(in) != 0)
232            ;
233
234    /* skip comment if present */
235    if (flags & 16)
236        while (bget(in) != 0)
237            ;
238
239    /* skip header crc if present */
240    if (flags & 2)
241        bskip(in, 2);
242}
243
/* Write the low 32 bits of val to out as four bytes, least significant byte
   first. */
local void put4(unsigned long val, FILE *out)
{
    int n;

    for (n = 0; n < 4; n++) {
        putc(val & 0xff, out);
        val >>= 8;
    }
}
252
253/* Load up zlib stream from buffered input, bail if end of file */
254local void zpull(z_streamp strm, bin *in)
255{
256    if (in->left == 0)
257        bload(in);
258    if (in->left == 0)
259        bail("unexpected end of file on ", in->name);
260    strm->avail_in = in->left;
261    strm->next_in = in->next;
262}
263
264/* Write header for gzip file to out and initialize trailer. */
265local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
266{
267    fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
268    *crc = crc32(0L, Z_NULL, 0);
269    *tot = 0;
270}
271
272/* Copy the compressed data from name, zeroing the last block bit of the last
273   block if clr is true, and adding empty blocks as needed to get to a byte
274   boundary.  If clr is false, then the last block becomes the last block of
275   the output, and the gzip trailer is written.  crc and tot maintains the
276   crc and length (modulo 2^32) of the output for the trailer.  The resulting
277   gzip file is written to out.  gzinit() must be called before the first call
278   of gzcopy() to write the gzip header and to initialize crc and tot. */
279local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
280                  FILE *out)
281{
282    int ret;                /* return value from zlib functions */
283    int pos;                /* where the "last block" bit is in byte */
284    int last;               /* true if processing the last block */
285    bin *in;                /* buffered input file */
286    unsigned char *start;   /* start of compressed data in buffer */
287    unsigned char *junk;    /* buffer for uncompressed data -- discarded */
288    z_off_t len;            /* length of uncompressed data (support > 4 GB) */
289    z_stream strm;          /* zlib inflate stream */
290
291    /* open gzip file and skip header */
292    in = bopen(name);
293    if (in == NULL)
294        bail("could not open ", name);
295    gzhead(in);
296
297    /* allocate buffer for uncompressed data and initialize raw inflate
298       stream */
299    junk = malloc(CHUNK);
300    strm.zalloc = Z_NULL;
301    strm.zfree = Z_NULL;
302    strm.opaque = Z_NULL;
303    strm.avail_in = 0;
304    strm.next_in = Z_NULL;
305    ret = inflateInit2(&strm, -15);
306    if (junk == NULL || ret != Z_OK)
307        bail("out of memory", "");
308
309    /* inflate and copy compressed data, clear last-block bit if requested */
310    len = 0;
311    zpull(&strm, in);
312    start = in->next;
313    last = start[0] & 1;
314    if (last && clr)
315        start[0] &= ~1;
316    strm.avail_out = 0;
317    for (;;) {
318        /* if input used and output done, write used input and get more */
319        if (strm.avail_in == 0 && strm.avail_out != 0) {
320            fwrite(start, 1, strm.next_in - start, out);
321            start = in->buf;
322            in->left = 0;
323            zpull(&strm, in);
324        }
325
326        /* decompress -- return early when end-of-block reached */
327        strm.avail_out = CHUNK;
328        strm.next_out = junk;
329        ret = inflate(&strm, Z_BLOCK);
330        switch (ret) {
331        case Z_MEM_ERROR:
332            bail("out of memory", "");
333        case Z_DATA_ERROR:
334            bail("invalid compressed data in ", in->name);
335        }
336
337        /* update length of uncompressed data */
338        len += CHUNK - strm.avail_out;
339
340        /* check for block boundary (only get this when block copied out) */
341        if (strm.data_type & 128) {
342            /* if that was the last block, then done */
343            if (last)
344                break;
345
346            /* number of unused bits in last byte */
347            pos = strm.data_type & 7;
348
349            /* find the next last-block bit */
350            if (pos != 0) {
351                /* next last-block bit is in last used byte */
352                pos = 0x100 >> pos;
353                last = strm.next_in[-1] & pos;
354                if (last && clr)
355                    in->buf[strm.next_in - in->buf - 1] &= ~pos;
356            }
357            else {
358                /* next last-block bit is in next unused byte */
359                if (strm.avail_in == 0) {
360                    /* don't have that byte yet -- get it */
361                    fwrite(start, 1, strm.next_in - start, out);
362                    start = in->buf;
363                    in->left = 0;
364                    zpull(&strm, in);
365                }
366                last = strm.next_in[0] & 1;
367                if (last && clr)
368                    in->buf[strm.next_in - in->buf] &= ~1;
369            }
370        }
371    }
372
373    /* update buffer with unused input */
374    in->left = strm.avail_in;
375    in->next = in->buf + (strm.next_in - in->buf);
376
377    /* copy used input, write empty blocks to get to byte boundary */
378    pos = strm.data_type & 7;
379    fwrite(start, 1, in->next - start - 1, out);
380    last = in->next[-1];
381    if (pos == 0 || !clr)
382        /* already at byte boundary, or last file: write last byte */
383        putc(last, out);
384    else {
385        /* append empty blocks to last byte */
386        last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
387        if (pos & 1) {
388            /* odd -- append an empty stored block */
389            putc(last, out);
390            if (pos == 1)
391                putc(0, out);               /* two more bits in block header */
392            fwrite("\0\0\xff\xff", 1, 4, out);
393        }
394        else {
395            /* even -- append 1, 2, or 3 empty fixed blocks */
396            switch (pos) {
397            case 6:
398                putc(last | 8, out);
399                last = 0;
400            case 4:
401                putc(last | 0x20, out);
402                last = 0;
403            case 2:
404                putc(last | 0x80, out);
405                putc(0, out);
406            }
407        }
408    }
409
410    /* update crc and tot */
411    *crc = crc32_combine(*crc, bget4(in), len);
412    *tot += (unsigned long)len;
413
414    /* clean up */
415    inflateEnd(&strm);
416    free(junk);
417    bclose(in);
418
419    /* write trailer if this is the last gzip file */
420    if (!clr) {
421        put4(*crc, out);
422        put4(*tot, out);
423    }
424}
425
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.

   With no file arguments a usage message is written to stderr and 0 is
   returned.  Otherwise 0 is returned once all of the output has been written;
   any failure, including a failure to flush stdout, ends the program with
   status 1 through bail(). */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments (argc is negative here if the program was
       started without even a command name) */
    if (argc <= 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout -- the remaining
       count passed as clr is zero only for the last file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* push out buffered data now so that a write failure (full disk, closed
       pipe) is reported instead of being lost at exit */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("could not write to ", "stdout");

    /* done */
    return 0;
}
450