1/* compress.c - deflate/inflate code for zip, gzip, zlib, and raw 2 * 3 * Copyright 2014 Rob Landley <rob@landley.net> 4 * 5 * The inflate/deflate code lives here, so the various things that use it 6 * either live here or call these commands to pipe data through them. 7 * 8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer. 9 * (gzip already replaces "uncompress".) 10 * 11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip) 12 * LSB 4.1 has gzip, gunzip, and zcat 13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout 14 15// Accept many different kinds of command line argument. 16// Leave Lrg at end so flag values line up. 17 18USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN)) 19USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN)) 20USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN)) 21USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN)) 22 23//zip unzip gzip gunzip zcat 24 25config COMPRESS 26 bool "compress" 27 default n 28 help 29 usage: compress [-zgLR19] [FILE] 30 31 Compress or decompress file (or stdin) using "deflate" algorithm. 32 33 -1 min compression 34 -9 max compression (default) 35 -g gzip (default) 36 -L zlib 37 -R raw 38 -z zip 39 40config GZIP 41 bool "gzip" 42 default y 43 depends on COMPRESS 44 help 45 usage: gzip [-19cfqStvzgLR] [FILE...] 46 47 Compess (deflate) file(s). With no files, compress stdin to stdout. 48 49 On successful decompression, compressed files are replaced with the 50 uncompressed version. The input file is removed and replaced with 51 a new file without the .gz extension (with same ownership/permissions). 52 53 -1 Minimal compression (fastest) 54 -9 Max compression (default) 55 -c cat to stdout (act as zcat) 56 -f force (if output file exists, input is tty, unrecognized extension) 57 -q quiet (no warnings) 58 -S specify exension (default .*) 59 -t test compressed file(s) 60 -v verbose (like -l, but compress files) 61 62 Compression type: 63 -g gzip (default) -L zlib -R raw -z zip 64 65config GZIP_D 66 bool 67 default y 68 depends on GZIP && DECOMPRESS 69 help 70 usage: gzip [-d] 71 72 -d decompress (act as gunzip) 73 74config DECOMPRESS 75 bool "decompress" 76 default n 77 help 78 usage: compress [-zglrcd9] [FILE] 79 80 Compress or decompress file (or stdin) using "deflate" algorithm. 81 82 -c compress with -g gzip (default) -l zlib -r raw -z zip 83 -d decompress (autodetects type) 84 85 86config ZCAT 87 bool "zcat" 88 default y 89 depends on DECOMPRESS 90 help 91 usage: zcat [FILE...] 92 93 Decompress deflated file(s) to stdout 94 95config GUNZIP 96 bool "gunzip" 97 default y 98 depends on DECOMPRESS 99 help 100 usage: gunzip [-cflqStv] [FILE...] 101 102 Decompess (deflate) file(s). With no files, compress stdin to stdout. 103 104 On successful decompression, compressed files are replaced with the 105 uncompressed version. The input file is removed and replaced with 106 a new file without the .gz extension (with same ownership/permissions). 107 108 -c cat to stdout (act as zcat) 109 -f force (output file exists, input is tty, unrecognized extension) 110 -l list compressed/uncompressed/ratio/name for each input file. 111 -q quiet (no warnings) 112 -S specify exension (default .*) 113 -t test compressed file(s) 114 -v verbose (like -l, but decompress files) 115*/ 116 117#define FOR_compress 118#include "toys.h" 119 120GLOBALS( 121 // Huffman codes: base offset and extra bits tables (length and distance) 122 char lenbits[29], distbits[30]; 123 unsigned short lenbase[29], distbase[30]; 124 void *fixdisthuff, *fixlithuff; 125 126 // CRC 127 void (*crcfunc)(char *data, int len); 128 unsigned crc; 129 130 // Compressed data buffer 131 char *data; 132 unsigned pos, len; 133 int infd, outfd; 134 135 // Tables only used for deflation 136 unsigned short *hashhead, *hashchain; 137) 138 139// little endian bit buffer 140struct bitbuf { 141 int fd, bitpos, len, max; 142 char buf[]; 143}; 144 145// malloc a struct bitbuf 146struct bitbuf *bitbuf_init(int fd, int size) 147{ 148 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size); 149 150 bb->max = size; 151 bb->fd = fd; 152 153 return bb; 154} 155 156// Advance bitpos without the overhead of recording bits 157void bitbuf_skip(struct bitbuf *bb, int bits) 158{ 159 int pos = bb->bitpos + bits, len = bb->len << 3; 160 161 while (pos >= len) { 162 pos -= len; 163 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3; 164 if (bb->len < 1) perror_exit("inflate EOF"); 165 } 166 bb->bitpos = pos; 167} 168 169// Optimized single bit inlined version 170static inline int bitbuf_bit(struct bitbuf *bb) 171{ 172 int bufpos = bb->bitpos>>3; 173 174 if (bufpos == bb->len) { 175 bitbuf_skip(bb, 0); 176 bufpos = 0; 177 } 178 179 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1; 180} 181 182// Fetch the next X bits from the bitbuf, little endian 183unsigned bitbuf_get(struct bitbuf *bb, int bits) 184{ 185 int result = 0, offset = 0; 186 187 while (bits) { 188 int click = bb->bitpos >> 3, blow, blen; 189 190 // Load more data if buffer empty 191 if (click == bb->len) bitbuf_skip(bb, click = 0); 192 193 // grab bits from next byte 194 blow = bb->bitpos & 7; 195 blen = 8-blow; 196 if (blen > bits) blen = bits; 197 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset; 198 offset += blen; 199 bits -= blen; 200 bb->bitpos += blen; 201 } 202 203 return result; 204} 205 206void bitbuf_flush(struct bitbuf *bb) 207{ 208 if (!bb->bitpos) return; 209 210 xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8); 211 memset(bb->buf, 0, bb->max); 212 bb->bitpos = 0; 213} 214 215void bitbuf_put(struct bitbuf *bb, int data, int len) 216{ 217 while (len) { 218 int click = bb->bitpos >> 3, blow, blen; 219 220 // Flush buffer if necessary 221 if (click == bb->max) { 222 bitbuf_flush(bb); 223 click = 0; 224 } 225 blow = bb->bitpos & 7; 226 blen = 8-blow; 227 if (blen > len) blen = len; 228 bb->buf[click] |= data << blow; 229 bb->bitpos += blen; 230 data >>= blen; 231 len -= blen; 232 } 233} 234 235static void data_crc(char sym) 236{ 237 TT.data[TT.pos++ & 32767] = sym; 238 239 if (!(TT.pos & 32767)) { 240 xwrite(TT.outfd, TT.data, 32768); 241 if (TT.crcfunc) TT.crcfunc(TT.data, 32768); 242 } 243} 244 245// Huffman coding uses bits to traverse a binary tree to a leaf node, 246// By placing frequently occurring symbols at shorter paths, frequently 247// used symbols may be represented in fewer bits than uncommon symbols. 248 249struct huff { 250 unsigned short length[16]; 251 unsigned short symbol[288]; 252}; 253 254// Create simple huffman tree from array of bit lengths. 255 256// The symbols in the huffman trees are sorted (first by bit length 257// of the code to reach them, then by symbol number). This means that given 258// the bit length of each symbol, we can construct a unique tree. 259static void len2huff(struct huff *huff, char bitlen[], int len) 260{ 261 int offset[16]; 262 int i; 263 264 // Count number of codes at each bit length 265 memset(huff, 0, sizeof(struct huff)); 266 for (i = 0; i<len; i++) huff->length[bitlen[i]]++; 267 268 // Sort symbols by bit length. (They'll remain sorted by symbol within that.) 269 *huff->length = *offset = 0; 270 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1]; 271 272 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i; 273} 274 275// Fetch and decode next huffman coded symbol from bitbuf. 276// This takes advantage of the sorting to navigate the tree as an array: 277// each time we fetch a bit we have all the codes at that bit level in 278// order with no gaps. 279static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff) 280{ 281 unsigned short *length = huff->length; 282 int start = 0, offset = 0; 283 284 // Traverse through the bit lengths until our code is in this range 285 for (;;) { 286 offset = (offset << 1) | bitbuf_bit(bb); 287 start += *++length; 288 if ((offset -= *length) < 0) break; 289 if ((length - huff->length) & 16) error_exit("bad symbol"); 290 } 291 292 return huff->symbol[start + offset]; 293} 294 295// Decompress deflated data from bitbuf to TT.outfd. 296static void inflate(struct bitbuf *bb) 297{ 298 TT.crc = ~0; 299 // repeat until spanked 300 for (;;) { 301 int final, type; 302 303 final = bitbuf_get(bb, 1); 304 type = bitbuf_get(bb, 2); 305 306 if (type == 3) error_exit("bad type"); 307 308 // Uncompressed block? 309 if (!type) { 310 int len, nlen; 311 312 // Align to byte, read length 313 bitbuf_skip(bb, (8-bb->bitpos)&7); 314 len = bitbuf_get(bb, 16); 315 nlen = bitbuf_get(bb, 16); 316 if (len != (0xffff & ~nlen)) error_exit("bad len"); 317 318 // Dump literal output data 319 while (len) { 320 int pos = bb->bitpos >> 3, bblen = bb->len - pos; 321 char *p = bb->buf+pos; 322 323 // dump bytes until done or end of current bitbuf contents 324 if (bblen > len) bblen = len; 325 pos = bblen; 326 while (pos--) data_crc(*(p++)); 327 bitbuf_skip(bb, bblen << 3); 328 len -= bblen; 329 } 330 331 // Compressed block 332 } else { 333 struct huff *disthuff, *lithuff; 334 335 // Dynamic huffman codes? 336 if (type == 2) { 337 struct huff *h2 = ((struct huff *)toybuf)+1; 338 int i, litlen, distlen, hufflen; 339 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b" 340 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits; 341 342 // The huffman trees are stored as a series of bit lengths 343 litlen = bitbuf_get(bb, 5)+257; // max 288 344 distlen = bitbuf_get(bb, 5)+1; // max 32 345 hufflen = bitbuf_get(bb, 4)+4; // max 19 346 347 // The literal and distance codes are themselves compressed, in 348 // a complicated way: an array of bit lengths (hufflen many 349 // entries, each 3 bits) is used to fill out an array of 19 entries 350 // in a magic order, leaving the rest 0. Then make a tree out of it: 351 memset(bits = toybuf+1, 0, 19); 352 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3); 353 len2huff(h2, bits, 19); 354 355 // Use that tree to read in the literal and distance bit lengths 356 for (i = 0; i < litlen + distlen;) { 357 int sym = huff_and_puff(bb, h2); 358 359 // 0-15 are literals, 16 = repeat previous code 3-6 times, 360 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit) 361 if (sym < 16) bits[i++] = sym; 362 else { 363 int len = sym & 2; 364 365 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2); 366 memset(bits+i, bits[i-1] * !(sym&3), len); 367 i += len; 368 } 369 } 370 if (i > litlen+distlen) error_exit("bad tree"); 371 372 len2huff(lithuff = h2, bits, litlen); 373 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen); 374 375 // Static huffman codes 376 } else { 377 lithuff = TT.fixlithuff; 378 disthuff = TT.fixdisthuff; 379 } 380 381 // Use huffman tables to decode block of compressed symbols 382 for (;;) { 383 int sym = huff_and_puff(bb, lithuff); 384 385 // Literal? 386 if (sym < 256) data_crc(sym); 387 388 // Copy range? 389 else if (sym > 256) { 390 int len, dist; 391 392 sym -= 257; 393 len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]); 394 sym = huff_and_puff(bb, disthuff); 395 dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]); 396 sym = TT.pos & 32767; 397 398 while (len--) data_crc(TT.data[(TT.pos-dist) & 32767]); 399 400 // End of block 401 } else break; 402 } 403 } 404 405 // Was that the last block? 406 if (final) break; 407 } 408 409 if (TT.pos & 32767) { 410 xwrite(TT.outfd, TT.data, TT.pos & 32767); 411 if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767); 412 } 413} 414 415// Deflate from TT.infd to bitbuf 416// For deflate, TT.len = input read, TT.pos = input consumed 417static void deflate(struct bitbuf *bb) 418{ 419 char *data = TT.data; 420 int len, final = 0; 421 422 TT.crc = ~0; 423 424 while (!final) { 425 // Read next half-window of data if we haven't hit EOF yet. 426 len = readall(TT.infd, data+(TT.len&32768), 32768); 427 if (len < 0) perror_exit("read"); // todo: add filename 428 if (len != 32768) final++; 429 if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len); 430 // TT.len += len; crcfunc advances len 431 432 // store block as literal 433 bitbuf_put(bb, final, 1); 434 bitbuf_put(bb, 0, 1); 435 436 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 437 bitbuf_put(bb, len, 16); 438 bitbuf_put(bb, 0xffff & ~len, 16); 439 440 // repeat until spanked 441 while (TT.pos != TT.len) { 442 unsigned pos = TT.pos & 65535; 443 444 bitbuf_put(bb, data[pos], 8); 445 446 // need to refill buffer? 447 if (!(32767 & ++TT.pos) && !final) break; 448 } 449 } 450 bitbuf_flush(bb); 451} 452 453// Allocate memory for deflate/inflate. 454static void init_deflate(int compress) 455{ 456 int i, n = 1; 457 458 // compress needs 64k data and 32k each for hashhead and hashchain. 459 // decompress just needs 32k data. 460 TT.data = xmalloc(32768*(compress ? 4 : 1)); 461 if (compress) { 462 TT.hashhead = (unsigned short *)(TT.data + 65536); 463 TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768); 464 } 465 466 // Calculate lenbits, lenbase, distbits, distbase 467 *TT.lenbase = 3; 468 for (i = 0; i<sizeof(TT.lenbits)-1; i++) { 469 if (i>4) { 470 if (!(i&3)) { 471 TT.lenbits[i]++; 472 n <<= 1; 473 } 474 if (i == 27) n--; 475 else TT.lenbits[i+1] = TT.lenbits[i]; 476 } 477 TT.lenbase[i+1] = n + TT.lenbase[i]; 478 } 479 n = 0; 480 for (i = 0; i<sizeof(TT.distbits); i++) { 481 TT.distbase[i] = 1<<n; 482 if (i) TT.distbase[i] += TT.distbase[i-1]; 483 if (i>3 && !(i&1)) n++; 484 TT.distbits[i] = n; 485 } 486 487 // Init fixed huffman tables 488 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279); 489 len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288); 490 memset(toybuf, 5, 30); 491 len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30); 492} 493 494// Return true/false whether we consumed a gzip header. 495static int is_gzip(struct bitbuf *bb) 496{ 497 int flags; 498 499 // Confirm signature 500 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31) 501 return 0; 502 bitbuf_skip(bb, 6*8); 503 504 // Skip extra, name, comment, header CRC fields 505 if (flags & 4) bitbuf_skip(bb, 16); 506 if (flags & 8) while (bitbuf_get(bb, 8)); 507 if (flags & 16) while (bitbuf_get(bb, 8)); 508 if (flags & 2) bitbuf_skip(bb, 16); 509 510 return 1; 511} 512 513void gzip_crc(char *data, int len) 514{ 515 int i; 516 unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024); 517 518 crc = TT.crc; 519 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8); 520 TT.crc = crc; 521 TT.len += len; 522} 523 524static void do_gzip(int fd, char *name) 525{ 526 struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf)); 527 528 // Header from RFC 1952 section 2.2: 529 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none), 530 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression), 531 // Operating System (FF=unknown) 532 533 TT.infd = fd; 534 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10); 535 536 // Use last 1k of toybuf for little endian crc table 537 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); 538 TT.crcfunc = gzip_crc; 539 540 deflate(bb); 541 542 // tail: crc32, len32 543 544 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 545 bitbuf_put(bb, ~TT.crc, 32); 546 bitbuf_put(bb, TT.len, 32); 547 548 bitbuf_flush(bb); 549 free(bb); 550} 551 552static void do_zcat(int fd, char *name) 553{ 554 struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf)); 555 556 if (!is_gzip(bb)) error_exit("not gzip"); 557 TT.outfd = 1; 558 559 // Use last 1k of toybuf for little endian crc table 560 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); 561 TT.crcfunc = gzip_crc; 562 563 inflate(bb); 564 565 // tail: crc32, len32 566 567 bitbuf_skip(bb, (8-bb->bitpos)&7); 568 if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32)) 569 error_exit("bad crc"); 570 free(bb); 571} 572 573// Parse many different kinds of command line argument: 574 575void compress_main(void) 576{ 577 // todo: this 578 printf("hello world"); 579} 580 581//#define CLEANUP_compress 582//#define FOR_zcat 583//#include "generated/flags.h" 584 585void zcat_main(void) 586{ 587 init_deflate(0); 588 589 loopfiles(toys.optargs, do_zcat); 590} 591 592void gunzip_main(void) 593{ 594 init_deflate(0); 595 596 loopfiles(toys.optargs, do_zcat); 597} 598 599void gzip_main(void) 600{ 601 init_deflate(1); 602 603 loopfiles(toys.optargs, do_gzip); 604} 605