1/* compress.c - deflate/inflate code for zip, gzip, zlib, and raw 2 * 3 * Copyright 2014 Rob Landley <rob@landley.net> 4 * 5 * The inflate/deflate code lives here, so the various things that use it 6 * either live here or call these commands to pipe data through them. 7 * 8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer. 9 * (gzip already replaces "uncompress".) 10 * 11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip) 12 * LSB 4.1 has gzip, gunzip, and zcat 13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout 14 15// Accept many different kinds of command line argument. 16// Leave Lrg at end so flag values line up. 17 18USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN)) 19 20//zip unzip gzip gunzip zcat 21 22config COMPRESS 23 bool "compress" 24 default n 25 help 26 usage: compress [-zgLR19] [FILE] 27 28 Compress or decompress file (or stdin) using "deflate" algorithm. 29 30 -1 min compression 31 -9 max compression (default) 32 -g gzip (default) 33 -L zlib 34 -R raw 35 -z zip 36 37config DECOMPRESS 38 bool "decompress" 39 default n 40 help 41 usage: compress [-zglrcd9] [FILE] 42 43 Compress or decompress file (or stdin) using "deflate" algorithm. 44 45 -c compress with -g gzip (default) -l zlib -r raw -z zip 46 -d decompress (autodetects type) 47*/ 48 49#define FOR_compress 50#include "toys.h" 51 52GLOBALS( 53 // Huffman codes: base offset and extra bits tables (length and distance) 54 char lenbits[29], distbits[30]; 55 unsigned short lenbase[29], distbase[30]; 56 void *fixdisthuff, *fixlithuff; 57 58 // CRC 59 void (*crcfunc)(char *data, int len); 60 unsigned crc; 61 62 // Compressed data buffer 63 char *data; 64 unsigned pos, len; 65 int infd, outfd; 66 67 // Tables only used for deflation 68 unsigned short *hashhead, *hashchain; 69) 70 71// little endian bit buffer 72struct bitbuf { 73 int fd, bitpos, len, max; 74 char buf[]; 75}; 76 77// malloc a struct bitbuf 78struct bitbuf *bitbuf_init(int fd, int size) 79{ 80 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size); 81 82 bb->max = size; 83 bb->fd = fd; 84 85 return bb; 86} 87 88// Advance bitpos without the overhead of recording bits 89void bitbuf_skip(struct bitbuf *bb, int bits) 90{ 91 int pos = bb->bitpos + bits, len = bb->len << 3; 92 93 while (pos >= len) { 94 pos -= len; 95 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3; 96 if (bb->len < 1) perror_exit("inflate EOF"); 97 } 98 bb->bitpos = pos; 99} 100 101// Optimized single bit inlined version 102static inline int bitbuf_bit(struct bitbuf *bb) 103{ 104 int bufpos = bb->bitpos>>3; 105 106 if (bufpos == bb->len) { 107 bitbuf_skip(bb, 0); 108 bufpos = 0; 109 } 110 111 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1; 112} 113 114// Fetch the next X bits from the bitbuf, little endian 115unsigned bitbuf_get(struct bitbuf *bb, int bits) 116{ 117 int result = 0, offset = 0; 118 119 while (bits) { 120 int click = bb->bitpos >> 3, blow, blen; 121 122 // Load more data if buffer empty 123 if (click == bb->len) bitbuf_skip(bb, click = 0); 124 125 // grab bits from next byte 126 blow = bb->bitpos & 7; 127 blen = 8-blow; 128 if (blen > bits) blen = bits; 129 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset; 130 offset += blen; 131 bits -= blen; 132 bb->bitpos += blen; 133 } 134 135 return result; 136} 137 138void bitbuf_flush(struct bitbuf *bb) 139{ 140 if (!bb->bitpos) return; 141 142 xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8); 143 memset(bb->buf, 0, bb->max); 144 bb->bitpos = 0; 145} 146 147void bitbuf_put(struct bitbuf *bb, int data, int len) 148{ 149 while (len) { 150 int click = bb->bitpos >> 3, blow, blen; 151 152 // Flush buffer if necessary 153 if (click == bb->max) { 154 bitbuf_flush(bb); 155 click = 0; 156 } 157 blow = bb->bitpos & 7; 158 blen = 8-blow; 159 if (blen > len) blen = len; 160 bb->buf[click] |= data << blow; 161 bb->bitpos += blen; 162 data >>= blen; 163 len -= blen; 164 } 165} 166 167static void output_byte(char sym) 168{ 169 int pos = TT.pos++ & 32767; 170 171 TT.data[pos] = sym; 172 173 if (pos == 32767) { 174 xwrite(TT.outfd, TT.data, 32768); 175 if (TT.crcfunc) TT.crcfunc(TT.data, 32768); 176 } 177} 178 179// Huffman coding uses bits to traverse a binary tree to a leaf node, 180// By placing frequently occurring symbols at shorter paths, frequently 181// used symbols may be represented in fewer bits than uncommon symbols. 182 183struct huff { 184 unsigned short length[16]; 185 unsigned short symbol[288]; 186}; 187 188// Create simple huffman tree from array of bit lengths. 189 190// The symbols in the huffman trees are sorted (first by bit length 191// of the code to reach them, then by symbol number). This means that given 192// the bit length of each symbol, we can construct a unique tree. 193static void len2huff(struct huff *huff, char bitlen[], int len) 194{ 195 int offset[16]; 196 int i; 197 198 // Count number of codes at each bit length 199 memset(huff, 0, sizeof(struct huff)); 200 for (i = 0; i<len; i++) huff->length[bitlen[i]]++; 201 202 // Sort symbols by bit length. (They'll remain sorted by symbol within that.) 203 *huff->length = *offset = 0; 204 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1]; 205 206 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i; 207} 208 209// Fetch and decode next huffman coded symbol from bitbuf. 210// This takes advantage of the sorting to navigate the tree as an array: 211// each time we fetch a bit we have all the codes at that bit level in 212// order with no gaps. 213static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff) 214{ 215 unsigned short *length = huff->length; 216 int start = 0, offset = 0; 217 218 // Traverse through the bit lengths until our code is in this range 219 for (;;) { 220 offset = (offset << 1) | bitbuf_bit(bb); 221 start += *++length; 222 if ((offset -= *length) < 0) break; 223 if ((length - huff->length) & 16) error_exit("bad symbol"); 224 } 225 226 return huff->symbol[start + offset]; 227} 228 229// Decompress deflated data from bitbuf to TT.outfd. 230static void inflate(struct bitbuf *bb) 231{ 232 TT.crc = ~0; 233 // repeat until spanked 234 for (;;) { 235 int final, type; 236 237 final = bitbuf_get(bb, 1); 238 type = bitbuf_get(bb, 2); 239 240 if (type == 3) error_exit("bad type"); 241 242 // Uncompressed block? 243 if (!type) { 244 int len, nlen; 245 246 // Align to byte, read length 247 bitbuf_skip(bb, (8-bb->bitpos)&7); 248 len = bitbuf_get(bb, 16); 249 nlen = bitbuf_get(bb, 16); 250 if (len != (0xffff & ~nlen)) error_exit("bad len"); 251 252 // Dump literal output data 253 while (len) { 254 int pos = bb->bitpos >> 3, bblen = bb->len - pos; 255 char *p = bb->buf+pos; 256 257 // dump bytes until done or end of current bitbuf contents 258 if (bblen > len) bblen = len; 259 pos = bblen; 260 while (pos--) output_byte(*(p++)); 261 bitbuf_skip(bb, bblen << 3); 262 len -= bblen; 263 } 264 265 // Compressed block 266 } else { 267 struct huff *disthuff, *lithuff; 268 269 // Dynamic huffman codes? 270 if (type == 2) { 271 struct huff *h2 = ((struct huff *)toybuf)+1; 272 int i, litlen, distlen, hufflen; 273 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b" 274 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits; 275 276 // The huffman trees are stored as a series of bit lengths 277 litlen = bitbuf_get(bb, 5)+257; // max 288 278 distlen = bitbuf_get(bb, 5)+1; // max 32 279 hufflen = bitbuf_get(bb, 4)+4; // max 19 280 281 // The literal and distance codes are themselves compressed, in 282 // a complicated way: an array of bit lengths (hufflen many 283 // entries, each 3 bits) is used to fill out an array of 19 entries 284 // in a magic order, leaving the rest 0. Then make a tree out of it: 285 memset(bits = toybuf+1, 0, 19); 286 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3); 287 len2huff(h2, bits, 19); 288 289 // Use that tree to read in the literal and distance bit lengths 290 for (i = 0; i < litlen + distlen;) { 291 int sym = huff_and_puff(bb, h2); 292 293 // 0-15 are literals, 16 = repeat previous code 3-6 times, 294 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit) 295 if (sym < 16) bits[i++] = sym; 296 else { 297 int len = sym & 2; 298 299 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2); 300 memset(bits+i, bits[i-1] * !(sym&3), len); 301 i += len; 302 } 303 } 304 if (i > litlen+distlen) error_exit("bad tree"); 305 306 len2huff(lithuff = h2, bits, litlen); 307 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen); 308 309 // Static huffman codes 310 } else { 311 lithuff = TT.fixlithuff; 312 disthuff = TT.fixdisthuff; 313 } 314 315 // Use huffman tables to decode block of compressed symbols 316 for (;;) { 317 int sym = huff_and_puff(bb, lithuff); 318 319 // Literal? 320 if (sym < 256) output_byte(sym); 321 322 // Copy range? 323 else if (sym > 256) { 324 int len, dist; 325 326 sym -= 257; 327 len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]); 328 sym = huff_and_puff(bb, disthuff); 329 dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]); 330 sym = TT.pos & 32767; 331 332 while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]); 333 334 // End of block 335 } else break; 336 } 337 } 338 339 // Was that the last block? 340 if (final) break; 341 } 342 343 if (TT.pos & 32767) { 344 xwrite(TT.outfd, TT.data, TT.pos & 32767); 345 if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767); 346 } 347} 348 349// Deflate from TT.infd to bitbuf 350// For deflate, TT.len = input read, TT.pos = input consumed 351static void deflate(struct bitbuf *bb) 352{ 353 char *data = TT.data; 354 int len, final = 0; 355 356 TT.crc = ~0; 357 358 while (!final) { 359 // Read next half-window of data if we haven't hit EOF yet. 360 len = readall(TT.infd, data+(TT.len&32768), 32768); 361 if (len < 0) perror_exit("read"); // todo: add filename 362 if (len != 32768) final++; 363 if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len); 364 // TT.len += len; crcfunc advances len 365 366 // store block as literal 367 bitbuf_put(bb, final, 1); 368 bitbuf_put(bb, 0, 1); 369 370 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 371 bitbuf_put(bb, len, 16); 372 bitbuf_put(bb, 0xffff & ~len, 16); 373 374 // repeat until spanked 375 while (TT.pos != TT.len) { 376 unsigned pos = TT.pos & 65535; 377 378 bitbuf_put(bb, data[pos], 8); 379 380 // need to refill buffer? 381 if (!(32767 & ++TT.pos) && !final) break; 382 } 383 } 384 bitbuf_flush(bb); 385} 386 387// Allocate memory for deflate/inflate. 388static void init_deflate(int compress) 389{ 390 int i, n = 1; 391 392 // compress needs 64k data and 32k each for hashhead and hashchain. 393 // decompress just needs 32k data. 394 TT.data = xmalloc(32768*(compress ? 4 : 1)); 395 if (compress) { 396 TT.hashhead = (unsigned short *)(TT.data + 65536); 397 TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768); 398 } 399 400 // Calculate lenbits, lenbase, distbits, distbase 401 *TT.lenbase = 3; 402 for (i = 0; i<sizeof(TT.lenbits)-1; i++) { 403 if (i>4) { 404 if (!(i&3)) { 405 TT.lenbits[i]++; 406 n <<= 1; 407 } 408 if (i == 27) n--; 409 else TT.lenbits[i+1] = TT.lenbits[i]; 410 } 411 TT.lenbase[i+1] = n + TT.lenbase[i]; 412 } 413 n = 0; 414 for (i = 0; i<sizeof(TT.distbits); i++) { 415 TT.distbase[i] = 1<<n; 416 if (i) TT.distbase[i] += TT.distbase[i-1]; 417 if (i>3 && !(i&1)) n++; 418 TT.distbits[i] = n; 419 } 420 421 // Init fixed huffman tables 422 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279); 423 len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288); 424 memset(toybuf, 5, 30); 425 len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30); 426} 427 428// Return true/false whether we consumed a gzip header. 429static int is_gzip(struct bitbuf *bb) 430{ 431 int flags; 432 433 // Confirm signature 434 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31) 435 return 0; 436 bitbuf_skip(bb, 6*8); 437 438 // Skip extra, name, comment, header CRC fields 439 if (flags & 4) bitbuf_skip(bb, 16); 440 if (flags & 8) while (bitbuf_get(bb, 8)); 441 if (flags & 16) while (bitbuf_get(bb, 8)); 442 if (flags & 2) bitbuf_skip(bb, 16); 443 444 return 1; 445} 446 447void gzip_crc(char *data, int len) 448{ 449 int i; 450 unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024); 451 452 crc = TT.crc; 453 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8); 454 TT.crc = crc; 455 TT.len += len; 456} 457 458static void do_gzip(int fd, char *name) 459{ 460 struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf)); 461 462 // Header from RFC 1952 section 2.2: 463 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none), 464 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression), 465 // Operating System (FF=unknown) 466 467 TT.infd = fd; 468 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10); 469 470 // Use last 1k of toybuf for little endian crc table 471 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); 472 TT.crcfunc = gzip_crc; 473 474 deflate(bb); 475 476 // tail: crc32, len32 477 478 bitbuf_put(bb, 0, (8-bb->bitpos)&7); 479 bitbuf_put(bb, ~TT.crc, 32); 480 bitbuf_put(bb, TT.len, 32); 481 482 bitbuf_flush(bb); 483 free(bb); 484} 485 486static void do_zcat(int fd, char *name) 487{ 488 struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf)); 489 490 if (!is_gzip(bb)) error_exit("not gzip"); 491 TT.outfd = 1; 492 493 // Use last 1k of toybuf for little endian crc table 494 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); 495 TT.crcfunc = gzip_crc; 496 497 inflate(bb); 498 499 // tail: crc32, len32 500 501 bitbuf_skip(bb, (8-bb->bitpos)&7); 502 if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32)) 503 error_exit("bad crc"); 504 free(bb); 505} 506 507// Parse many different kinds of command line argument: 508 509void compress_main(void) 510{ 511 // todo: this 512 printf("hello world"); 513} 514