1/* gzappend -- command to append to a gzip file 2 3 Copyright (C) 2003, 2012 Mark Adler, all rights reserved 4 version 1.2, 11 Oct 2012 5 6 This software is provided 'as-is', without any express or implied 7 warranty. In no event will the author be held liable for any damages 8 arising from the use of this software. 9 10 Permission is granted to anyone to use this software for any purpose, 11 including commercial applications, and to alter it and redistribute it 12 freely, subject to the following restrictions: 13 14 1. The origin of this software must not be misrepresented; you must not 15 claim that you wrote the original software. If you use this software 16 in a product, an acknowledgment in the product documentation would be 17 appreciated but is not required. 18 2. Altered source versions must be plainly marked as such, and must not be 19 misrepresented as being the original software. 20 3. This notice may not be removed or altered from any source distribution. 21 22 Mark Adler madler@alumni.caltech.edu 23 */ 24 25/* 26 * Change history: 27 * 28 * 1.0 19 Oct 2003 - First version 29 * 1.1 4 Nov 2003 - Expand and clarify some comments and notes 30 * - Add version and copyright to help 31 * - Send help to stdout instead of stderr 32 * - Add some preemptive typecasts 33 * - Add L to constants in lseek() calls 34 * - Remove some debugging information in error messages 35 * - Use new data_type definition for zlib 1.2.1 36 * - Simplfy and unify file operations 37 * - Finish off gzip file in gztack() 38 * - Use deflatePrime() instead of adding empty blocks 39 * - Keep gzip file clean on appended file read errors 40 * - Use in-place rotate instead of auxiliary buffer 41 * (Why you ask? Because it was fun to write!) 42 * 1.2 11 Oct 2012 - Fix for proper z_const usage 43 * - Check for input buffer malloc failure 44 */ 45 46/* 47 gzappend takes a gzip file and appends to it, compressing files from the 48 command line or data from stdin. The gzip file is written to directly, to 49 avoid copying that file, in case it's large. Note that this results in the 50 unfriendly behavior that if gzappend fails, the gzip file is corrupted. 51 52 This program was written to illustrate the use of the new Z_BLOCK option of 53 zlib 1.2.x's inflate() function. This option returns from inflate() at each 54 block boundary to facilitate locating and modifying the last block bit at 55 the start of the final deflate block. Also whether using Z_BLOCK or not, 56 another required feature of zlib 1.2.x is that inflate() now provides the 57 number of unusued bits in the last input byte used. gzappend will not work 58 with versions of zlib earlier than 1.2.1. 59 60 gzappend first decompresses the gzip file internally, discarding all but 61 the last 32K of uncompressed data, and noting the location of the last block 62 bit and the number of unused bits in the last byte of the compressed data. 63 The gzip trailer containing the CRC-32 and length of the uncompressed data 64 is verified. This trailer will be later overwritten. 65 66 Then the last block bit is cleared by seeking back in the file and rewriting 67 the byte that contains it. Seeking forward, the last byte of the compressed 68 data is saved along with the number of unused bits to initialize deflate. 69 70 A deflate process is initialized, using the last 32K of the uncompressed 71 data from the gzip file to initialize the dictionary. If the total 72 uncompressed data was less than 32K, then all of it is used to initialize 73 the dictionary. The deflate output bit buffer is also initialized with the 74 last bits from the original deflate stream. From here on, the data to 75 append is simply compressed using deflate, and written to the gzip file. 76 When that is complete, the new CRC-32 and uncompressed length are written 77 as the trailer of the gzip file. 78 */ 79 80#include <stdio.h> 81#include <stdlib.h> 82#include <string.h> 83#include <fcntl.h> 84#include <unistd.h> 85#include "zlib.h" 86 87#define local static 88#define LGCHUNK 14 89#define CHUNK (1U << LGCHUNK) 90#define DSIZE 32768U 91 92/* print an error message and terminate with extreme prejudice */ 93local void bye(char *msg1, char *msg2) 94{ 95 fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2); 96 exit(1); 97} 98 99/* return the greatest common divisor of a and b using Euclid's algorithm, 100 modified to be fast when one argument much greater than the other, and 101 coded to avoid unnecessary swapping */ 102local unsigned gcd(unsigned a, unsigned b) 103{ 104 unsigned c; 105 106 while (a && b) 107 if (a > b) { 108 c = b; 109 while (a - c >= c) 110 c <<= 1; 111 a -= c; 112 } 113 else { 114 c = a; 115 while (b - c >= c) 116 c <<= 1; 117 b -= c; 118 } 119 return a + b; 120} 121 122/* rotate list[0..len-1] left by rot positions, in place */ 123local void rotate(unsigned char *list, unsigned len, unsigned rot) 124{ 125 unsigned char tmp; 126 unsigned cycles; 127 unsigned char *start, *last, *to, *from; 128 129 /* normalize rot and handle degenerate cases */ 130 if (len < 2) return; 131 if (rot >= len) rot %= len; 132 if (rot == 0) return; 133 134 /* pointer to last entry in list */ 135 last = list + (len - 1); 136 137 /* do simple left shift by one */ 138 if (rot == 1) { 139 tmp = *list; 140 memcpy(list, list + 1, len - 1); 141 *last = tmp; 142 return; 143 } 144 145 /* do simple right shift by one */ 146 if (rot == len - 1) { 147 tmp = *last; 148 memmove(list + 1, list, len - 1); 149 *list = tmp; 150 return; 151 } 152 153 /* otherwise do rotate as a set of cycles in place */ 154 cycles = gcd(len, rot); /* number of cycles */ 155 do { 156 start = from = list + cycles; /* start index is arbitrary */ 157 tmp = *from; /* save entry to be overwritten */ 158 for (;;) { 159 to = from; /* next step in cycle */ 160 from += rot; /* go right rot positions */ 161 if (from > last) from -= len; /* (pointer better not wrap) */ 162 if (from == start) break; /* all but one shifted */ 163 *to = *from; /* shift left */ 164 } 165 *to = tmp; /* complete the circle */ 166 } while (--cycles); 167} 168 169/* structure for gzip file read operations */ 170typedef struct { 171 int fd; /* file descriptor */ 172 int size; /* 1 << size is bytes in buf */ 173 unsigned left; /* bytes available at next */ 174 unsigned char *buf; /* buffer */ 175 z_const unsigned char *next; /* next byte in buffer */ 176 char *name; /* file name for error messages */ 177} file; 178 179/* reload buffer */ 180local int readin(file *in) 181{ 182 int len; 183 184 len = read(in->fd, in->buf, 1 << in->size); 185 if (len == -1) bye("error reading ", in->name); 186 in->left = (unsigned)len; 187 in->next = in->buf; 188 return len; 189} 190 191/* read from file in, exit if end-of-file */ 192local int readmore(file *in) 193{ 194 if (readin(in) == 0) bye("unexpected end of ", in->name); 195 return 0; 196} 197 198#define read1(in) (in->left == 0 ? readmore(in) : 0, \ 199 in->left--, *(in->next)++) 200 201/* skip over n bytes of in */ 202local void skip(file *in, unsigned n) 203{ 204 unsigned bypass; 205 206 if (n > in->left) { 207 n -= in->left; 208 bypass = n & ~((1U << in->size) - 1); 209 if (bypass) { 210 if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1) 211 bye("seeking ", in->name); 212 n -= bypass; 213 } 214 readmore(in); 215 if (n > in->left) 216 bye("unexpected end of ", in->name); 217 } 218 in->left -= n; 219 in->next += n; 220} 221 222/* read a four-byte unsigned integer, little-endian, from in */ 223unsigned long read4(file *in) 224{ 225 unsigned long val; 226 227 val = read1(in); 228 val += (unsigned)read1(in) << 8; 229 val += (unsigned long)read1(in) << 16; 230 val += (unsigned long)read1(in) << 24; 231 return val; 232} 233 234/* skip over gzip header */ 235local void gzheader(file *in) 236{ 237 int flags; 238 unsigned n; 239 240 if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file"); 241 if (read1(in) != 8) bye("unknown compression method in", in->name); 242 flags = read1(in); 243 if (flags & 0xe0) bye("unknown header flags set in", in->name); 244 skip(in, 6); 245 if (flags & 4) { 246 n = read1(in); 247 n += (unsigned)(read1(in)) << 8; 248 skip(in, n); 249 } 250 if (flags & 8) while (read1(in) != 0) ; 251 if (flags & 16) while (read1(in) != 0) ; 252 if (flags & 2) skip(in, 2); 253} 254 255/* decompress gzip file "name", return strm with a deflate stream ready to 256 continue compression of the data in the gzip file, and return a file 257 descriptor pointing to where to write the compressed data -- the deflate 258 stream is initialized to compress using level "level" */ 259local int gzscan(char *name, z_stream *strm, int level) 260{ 261 int ret, lastbit, left, full; 262 unsigned have; 263 unsigned long crc, tot; 264 unsigned char *window; 265 off_t lastoff, end; 266 file gz; 267 268 /* open gzip file */ 269 gz.name = name; 270 gz.fd = open(name, O_RDWR, 0); 271 if (gz.fd == -1) bye("cannot open ", name); 272 gz.buf = malloc(CHUNK); 273 if (gz.buf == NULL) bye("out of memory", ""); 274 gz.size = LGCHUNK; 275 gz.left = 0; 276 277 /* skip gzip header */ 278 gzheader(&gz); 279 280 /* prepare to decompress */ 281 window = malloc(DSIZE); 282 if (window == NULL) bye("out of memory", ""); 283 strm->zalloc = Z_NULL; 284 strm->zfree = Z_NULL; 285 strm->opaque = Z_NULL; 286 ret = inflateInit2(strm, -15); 287 if (ret != Z_OK) bye("out of memory", " or library mismatch"); 288 289 /* decompress the deflate stream, saving append information */ 290 lastbit = 0; 291 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left; 292 left = 0; 293 strm->avail_in = gz.left; 294 strm->next_in = gz.next; 295 crc = crc32(0L, Z_NULL, 0); 296 have = full = 0; 297 do { 298 /* if needed, get more input */ 299 if (strm->avail_in == 0) { 300 readmore(&gz); 301 strm->avail_in = gz.left; 302 strm->next_in = gz.next; 303 } 304 305 /* set up output to next available section of sliding window */ 306 strm->avail_out = DSIZE - have; 307 strm->next_out = window + have; 308 309 /* inflate and check for errors */ 310 ret = inflate(strm, Z_BLOCK); 311 if (ret == Z_STREAM_ERROR) bye("internal stream error!", ""); 312 if (ret == Z_MEM_ERROR) bye("out of memory", ""); 313 if (ret == Z_DATA_ERROR) 314 bye("invalid compressed data--format violated in", name); 315 316 /* update crc and sliding window pointer */ 317 crc = crc32(crc, window + have, DSIZE - have - strm->avail_out); 318 if (strm->avail_out) 319 have = DSIZE - strm->avail_out; 320 else { 321 have = 0; 322 full = 1; 323 } 324 325 /* process end of block */ 326 if (strm->data_type & 128) { 327 if (strm->data_type & 64) 328 left = strm->data_type & 0x1f; 329 else { 330 lastbit = strm->data_type & 0x1f; 331 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in; 332 } 333 } 334 } while (ret != Z_STREAM_END); 335 inflateEnd(strm); 336 gz.left = strm->avail_in; 337 gz.next = strm->next_in; 338 339 /* save the location of the end of the compressed data */ 340 end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left; 341 342 /* check gzip trailer and save total for deflate */ 343 if (crc != read4(&gz)) 344 bye("invalid compressed data--crc mismatch in ", name); 345 tot = strm->total_out; 346 if ((tot & 0xffffffffUL) != read4(&gz)) 347 bye("invalid compressed data--length mismatch in", name); 348 349 /* if not at end of file, warn */ 350 if (gz.left || readin(&gz)) 351 fprintf(stderr, 352 "gzappend warning: junk at end of gzip file overwritten\n"); 353 354 /* clear last block bit */ 355 lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET); 356 if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name); 357 *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7))); 358 lseek(gz.fd, -1L, SEEK_CUR); 359 if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name); 360 361 /* if window wrapped, build dictionary from window by rotating */ 362 if (full) { 363 rotate(window, DSIZE, have); 364 have = DSIZE; 365 } 366 367 /* set up deflate stream with window, crc, total_in, and leftover bits */ 368 ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); 369 if (ret != Z_OK) bye("out of memory", ""); 370 deflateSetDictionary(strm, window, have); 371 strm->adler = crc; 372 strm->total_in = tot; 373 if (left) { 374 lseek(gz.fd, --end, SEEK_SET); 375 if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name); 376 deflatePrime(strm, 8 - left, *gz.buf); 377 } 378 lseek(gz.fd, end, SEEK_SET); 379 380 /* clean up and return */ 381 free(window); 382 free(gz.buf); 383 return gz.fd; 384} 385 386/* append file "name" to gzip file gd using deflate stream strm -- if last 387 is true, then finish off the deflate stream at the end */ 388local void gztack(char *name, int gd, z_stream *strm, int last) 389{ 390 int fd, len, ret; 391 unsigned left; 392 unsigned char *in, *out; 393 394 /* open file to compress and append */ 395 fd = 0; 396 if (name != NULL) { 397 fd = open(name, O_RDONLY, 0); 398 if (fd == -1) 399 fprintf(stderr, "gzappend warning: %s not found, skipping ...\n", 400 name); 401 } 402 403 /* allocate buffers */ 404 in = malloc(CHUNK); 405 out = malloc(CHUNK); 406 if (in == NULL || out == NULL) bye("out of memory", ""); 407 408 /* compress input file and append to gzip file */ 409 do { 410 /* get more input */ 411 len = read(fd, in, CHUNK); 412 if (len == -1) { 413 fprintf(stderr, 414 "gzappend warning: error reading %s, skipping rest ...\n", 415 name); 416 len = 0; 417 } 418 strm->avail_in = (unsigned)len; 419 strm->next_in = in; 420 if (len) strm->adler = crc32(strm->adler, in, (unsigned)len); 421 422 /* compress and write all available output */ 423 do { 424 strm->avail_out = CHUNK; 425 strm->next_out = out; 426 ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH); 427 left = CHUNK - strm->avail_out; 428 while (left) { 429 len = write(gd, out + CHUNK - strm->avail_out - left, left); 430 if (len == -1) bye("writing gzip file", ""); 431 left -= (unsigned)len; 432 } 433 } while (strm->avail_out == 0 && ret != Z_STREAM_END); 434 } while (len != 0); 435 436 /* write trailer after last entry */ 437 if (last) { 438 deflateEnd(strm); 439 out[0] = (unsigned char)(strm->adler); 440 out[1] = (unsigned char)(strm->adler >> 8); 441 out[2] = (unsigned char)(strm->adler >> 16); 442 out[3] = (unsigned char)(strm->adler >> 24); 443 out[4] = (unsigned char)(strm->total_in); 444 out[5] = (unsigned char)(strm->total_in >> 8); 445 out[6] = (unsigned char)(strm->total_in >> 16); 446 out[7] = (unsigned char)(strm->total_in >> 24); 447 len = 8; 448 do { 449 ret = write(gd, out + 8 - len, len); 450 if (ret == -1) bye("writing gzip file", ""); 451 len -= ret; 452 } while (len); 453 close(gd); 454 } 455 456 /* clean up and return */ 457 free(out); 458 free(in); 459 if (fd > 0) close(fd); 460} 461 462/* process the compression level option if present, scan the gzip file, and 463 append the specified files, or append the data from stdin if no other file 464 names are provided on the command line -- the gzip file must be writable 465 and seekable */ 466int main(int argc, char **argv) 467{ 468 int gd, level; 469 z_stream strm; 470 471 /* ignore command name */ 472 argc--; argv++; 473 474 /* provide usage if no arguments */ 475 if (*argv == NULL) { 476 printf( 477 "gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n" 478 ); 479 printf( 480 "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n"); 481 return 0; 482 } 483 484 /* set compression level */ 485 level = Z_DEFAULT_COMPRESSION; 486 if (argv[0][0] == '-') { 487 if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0) 488 bye("invalid compression level", ""); 489 level = argv[0][1] - '0'; 490 if (*++argv == NULL) bye("no gzip file name after options", ""); 491 } 492 493 /* prepare to append to gzip file */ 494 gd = gzscan(*argv++, &strm, level); 495 496 /* append files on command line, or from stdin if none */ 497 if (*argv == NULL) 498 gztack(NULL, gd, &strm, 1); 499 else 500 do { 501 gztack(*argv, gd, &strm, argv[1] == NULL); 502 } while (*++argv != NULL); 503 return 0; 504} 505