1/* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * The inflate/deflate code lives here, so the various things that use it
6 * either live here or call these commands to pipe data through them.
7 *
8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9 * (gzip already replaces "uncompress".)
10 *
11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12 * LSB 4.1 has gzip, gunzip, and zcat
13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14
15// Accept many different kinds of command line argument.
16// Leave Lrg at end so flag values line up.
17
18USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
20USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
21USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
22
23//zip unzip gzip gunzip zcat
24
25config COMPRESS
26  bool "compress"
27  default n
28  help
29    usage: compress [-zgLR19] [FILE]
30
31    Compress or decompress file (or stdin) using "deflate" algorithm.
32
33    -1	min compression
34    -9	max compression (default)
35    -g	gzip (default)
36    -L	zlib
37    -R	raw
38    -z	zip
39
40config GZIP
41  bool "gzip"
42  default y
43  depends on COMPRESS
44  help
45    usage: gzip [-19cfqStvzgLR] [FILE...]
46
47    Compess (deflate) file(s). With no files, compress stdin to stdout.
48
49    On successful decompression, compressed files are replaced with the
50    uncompressed version. The input file is removed and replaced with
51    a new file without the .gz extension (with same ownership/permissions).
52
53    -1	Minimal compression (fastest)
54    -9	Max compression (default)
55    -c	cat to stdout (act as zcat)
56    -f	force (if output file exists, input is tty, unrecognized extension)
57    -q	quiet (no warnings)
58    -S	specify exension (default .*)
59    -t	test compressed file(s)
60    -v	verbose (like -l, but compress files)
61
62    Compression type:
63    -g gzip (default)    -L zlib    -R raw    -z zip
64
65config GZIP_D
66  bool
67  default y
68  depends on GZIP && DECOMPRESS
69  help
70    usage: gzip [-d]
71
72    -d	decompress (act as gunzip)
73
74config DECOMPRESS
75  bool "decompress"
76  default n
77  help
78    usage: compress [-zglrcd9] [FILE]
79
80    Compress or decompress file (or stdin) using "deflate" algorithm.
81
82    -c	compress with -g gzip (default)  -l zlib  -r raw  -z zip
83    -d	decompress (autodetects type)
84
85
86config ZCAT
87  bool "zcat"
88  default y
89  depends on DECOMPRESS
90  help
91    usage: zcat [FILE...]
92
93    Decompress deflated file(s) to stdout
94
95config GUNZIP
96  bool "gunzip"
97  default y
98  depends on DECOMPRESS
99  help
100    usage: gunzip [-cflqStv] [FILE...]
101
102    Decompess (deflate) file(s). With no files, compress stdin to stdout.
103
104    On successful decompression, compressed files are replaced with the
105    uncompressed version. The input file is removed and replaced with
106    a new file without the .gz extension (with same ownership/permissions).
107
108    -c	cat to stdout (act as zcat)
109    -f	force (output file exists, input is tty, unrecognized extension)
110    -l	list compressed/uncompressed/ratio/name for each input file.
111    -q	quiet (no warnings)
112    -S	specify exension (default .*)
113    -t	test compressed file(s)
114    -v	verbose (like -l, but decompress files)
115*/
116
117#define FOR_compress
118#include "toys.h"
119
120GLOBALS(
121  // Huffman codes: base offset and extra bits tables (length and distance)
122  char lenbits[29], distbits[30];
123  unsigned short lenbase[29], distbase[30];
124  void *fixdisthuff, *fixlithuff;
125
126  // CRC
127  void (*crcfunc)(char *data, int len);
128  unsigned crc;
129
130  // Compressed data buffer
131  char *data;
132  unsigned pos, len;
133  int infd, outfd;
134
135  // Tables only used for deflation
136  unsigned short *hashhead, *hashchain;
137)
138
139// little endian bit buffer
140struct bitbuf {
141  int fd, bitpos, len, max;
142  char buf[];
143};
144
145// malloc a struct bitbuf
146struct bitbuf *bitbuf_init(int fd, int size)
147{
148  struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
149
150  bb->max = size;
151  bb->fd = fd;
152
153  return bb;
154}
155
156// Advance bitpos without the overhead of recording bits
157void bitbuf_skip(struct bitbuf *bb, int bits)
158{
159  int pos = bb->bitpos + bits, len = bb->len << 3;
160
161  while (pos >= len) {
162    pos -= len;
163    len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
164    if (bb->len < 1) perror_exit("inflate EOF");
165  }
166  bb->bitpos = pos;
167}
168
169// Optimized single bit inlined version
170static inline int bitbuf_bit(struct bitbuf *bb)
171{
172  int bufpos = bb->bitpos>>3;
173
174  if (bufpos == bb->len) {
175    bitbuf_skip(bb, 0);
176    bufpos = 0;
177  }
178
179  return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
180}
181
182// Fetch the next X bits from the bitbuf, little endian
183unsigned bitbuf_get(struct bitbuf *bb, int bits)
184{
185  int result = 0, offset = 0;
186
187  while (bits) {
188    int click = bb->bitpos >> 3, blow, blen;
189
190    // Load more data if buffer empty
191    if (click == bb->len) bitbuf_skip(bb, click = 0);
192
193    // grab bits from next byte
194    blow = bb->bitpos & 7;
195    blen = 8-blow;
196    if (blen > bits) blen = bits;
197    result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
198    offset += blen;
199    bits -= blen;
200    bb->bitpos += blen;
201  }
202
203  return result;
204}
205
206void bitbuf_flush(struct bitbuf *bb)
207{
208  if (!bb->bitpos) return;
209
210  xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
211  memset(bb->buf, 0, bb->max);
212  bb->bitpos = 0;
213}
214
215void bitbuf_put(struct bitbuf *bb, int data, int len)
216{
217  while (len) {
218    int click = bb->bitpos >> 3, blow, blen;
219
220    // Flush buffer if necessary
221    if (click == bb->max) {
222      bitbuf_flush(bb);
223      click = 0;
224    }
225    blow = bb->bitpos & 7;
226    blen = 8-blow;
227    if (blen > len) blen = len;
228    bb->buf[click] |= data << blow;
229    bb->bitpos += blen;
230    data >>= blen;
231    len -= blen;
232  }
233}
234
235static void data_crc(char sym)
236{
237  TT.data[TT.pos++ & 32767] = sym;
238
239  if (!(TT.pos & 32767)) {
240    xwrite(TT.outfd, TT.data, 32768);
241    if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
242  }
243}
244
245// Huffman coding uses bits to traverse a binary tree to a leaf node,
246// By placing frequently occurring symbols at shorter paths, frequently
247// used symbols may be represented in fewer bits than uncommon symbols.
248
249struct huff {
250  unsigned short length[16];
251  unsigned short symbol[288];
252};
253
254// Create simple huffman tree from array of bit lengths.
255
256// The symbols in the huffman trees are sorted (first by bit length
257// of the code to reach them, then by symbol number). This means that given
258// the bit length of each symbol, we can construct a unique tree.
259static void len2huff(struct huff *huff, char bitlen[], int len)
260{
261  int offset[16];
262  int i;
263
264  // Count number of codes at each bit length
265  memset(huff, 0, sizeof(struct huff));
266  for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
267
268  // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
269  *huff->length = *offset = 0;
270  for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
271
272  for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
273}
274
275// Fetch and decode next huffman coded symbol from bitbuf.
276// This takes advantage of the sorting to navigate the tree as an array:
277// each time we fetch a bit we have all the codes at that bit level in
278// order with no gaps.
279static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
280{
281  unsigned short *length = huff->length;
282  int start = 0, offset = 0;
283
284  // Traverse through the bit lengths until our code is in this range
285  for (;;) {
286    offset = (offset << 1) | bitbuf_bit(bb);
287    start += *++length;
288    if ((offset -= *length) < 0) break;
289    if ((length - huff->length) & 16) error_exit("bad symbol");
290  }
291
292  return huff->symbol[start + offset];
293}
294
295// Decompress deflated data from bitbuf to TT.outfd.
296static void inflate(struct bitbuf *bb)
297{
298  TT.crc = ~0;
299  // repeat until spanked
300  for (;;) {
301    int final, type;
302
303    final = bitbuf_get(bb, 1);
304    type = bitbuf_get(bb, 2);
305
306    if (type == 3) error_exit("bad type");
307
308    // Uncompressed block?
309    if (!type) {
310      int len, nlen;
311
312      // Align to byte, read length
313      bitbuf_skip(bb, (8-bb->bitpos)&7);
314      len = bitbuf_get(bb, 16);
315      nlen = bitbuf_get(bb, 16);
316      if (len != (0xffff & ~nlen)) error_exit("bad len");
317
318      // Dump literal output data
319      while (len) {
320        int pos = bb->bitpos >> 3, bblen = bb->len - pos;
321        char *p = bb->buf+pos;
322
323        // dump bytes until done or end of current bitbuf contents
324        if (bblen > len) bblen = len;
325        pos = bblen;
326        while (pos--) data_crc(*(p++));
327        bitbuf_skip(bb, bblen << 3);
328        len -= bblen;
329      }
330
331    // Compressed block
332    } else {
333      struct huff *disthuff, *lithuff;
334
335      // Dynamic huffman codes?
336      if (type == 2) {
337        struct huff *h2 = ((struct huff *)toybuf)+1;
338        int i, litlen, distlen, hufflen;
339        char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
340                              "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
341
342        // The huffman trees are stored as a series of bit lengths
343        litlen = bitbuf_get(bb, 5)+257;  // max 288
344        distlen = bitbuf_get(bb, 5)+1;   // max 32
345        hufflen = bitbuf_get(bb, 4)+4;   // max 19
346
347        // The literal and distance codes are themselves compressed, in
348        // a complicated way: an array of bit lengths (hufflen many
349        // entries, each 3 bits) is used to fill out an array of 19 entries
350        // in a magic order, leaving the rest 0. Then make a tree out of it:
351        memset(bits = toybuf+1, 0, 19);
352        for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
353        len2huff(h2, bits, 19);
354
355        // Use that tree to read in the literal and distance bit lengths
356        for (i = 0; i < litlen + distlen;) {
357          int sym = huff_and_puff(bb, h2);
358
359          // 0-15 are literals, 16 = repeat previous code 3-6 times,
360          // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
361          if (sym < 16) bits[i++] = sym;
362          else {
363            int len = sym & 2;
364
365            len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
366            memset(bits+i, bits[i-1] * !(sym&3), len);
367            i += len;
368          }
369        }
370        if (i > litlen+distlen) error_exit("bad tree");
371
372        len2huff(lithuff = h2, bits, litlen);
373        len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
374
375      // Static huffman codes
376      } else {
377        lithuff = TT.fixlithuff;
378        disthuff = TT.fixdisthuff;
379      }
380
381      // Use huffman tables to decode block of compressed symbols
382      for (;;) {
383        int sym = huff_and_puff(bb, lithuff);
384
385        // Literal?
386        if (sym < 256) data_crc(sym);
387
388        // Copy range?
389        else if (sym > 256) {
390          int len, dist;
391
392          sym -= 257;
393          len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
394          sym = huff_and_puff(bb, disthuff);
395          dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
396          sym = TT.pos & 32767;
397
398          while (len--) data_crc(TT.data[(TT.pos-dist) & 32767]);
399
400        // End of block
401        } else break;
402      }
403    }
404
405    // Was that the last block?
406    if (final) break;
407  }
408
409  if (TT.pos & 32767) {
410    xwrite(TT.outfd, TT.data, TT.pos & 32767);
411    if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
412  }
413}
414
415// Deflate from TT.infd to bitbuf
416// For deflate, TT.len = input read, TT.pos = input consumed
417static void deflate(struct bitbuf *bb)
418{
419  char *data = TT.data;
420  int len, final = 0;
421
422  TT.crc = ~0;
423
424  while (!final) {
425    // Read next half-window of data if we haven't hit EOF yet.
426    len = readall(TT.infd, data+(TT.len&32768), 32768);
427    if (len < 0) perror_exit("read"); // todo: add filename
428    if (len != 32768) final++;
429    if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
430    // TT.len += len;  crcfunc advances len
431
432    // store block as literal
433    bitbuf_put(bb, final, 1);
434    bitbuf_put(bb, 0, 1);
435
436    bitbuf_put(bb, 0, (8-bb->bitpos)&7);
437    bitbuf_put(bb, len, 16);
438    bitbuf_put(bb, 0xffff & ~len, 16);
439
440    // repeat until spanked
441    while (TT.pos != TT.len) {
442      unsigned pos = TT.pos & 65535;
443
444      bitbuf_put(bb, data[pos], 8);
445
446      // need to refill buffer?
447      if (!(32767 & ++TT.pos) && !final) break;
448    }
449  }
450  bitbuf_flush(bb);
451}
452
453// Allocate memory for deflate/inflate.
454static void init_deflate(int compress)
455{
456  int i, n = 1;
457
458  // compress needs 64k data and 32k each for hashhead and hashchain.
459  // decompress just needs 32k data.
460  TT.data = xmalloc(32768*(compress ? 4 : 1));
461  if (compress) {
462    TT.hashhead = (unsigned short *)(TT.data + 65536);
463    TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
464  }
465
466  // Calculate lenbits, lenbase, distbits, distbase
467  *TT.lenbase = 3;
468  for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
469    if (i>4) {
470      if (!(i&3)) {
471        TT.lenbits[i]++;
472        n <<= 1;
473      }
474      if (i == 27) n--;
475      else TT.lenbits[i+1] = TT.lenbits[i];
476    }
477    TT.lenbase[i+1] = n + TT.lenbase[i];
478  }
479  n = 0;
480  for (i = 0; i<sizeof(TT.distbits); i++) {
481    TT.distbase[i] = 1<<n;
482    if (i) TT.distbase[i] += TT.distbase[i-1];
483    if (i>3 && !(i&1)) n++;
484    TT.distbits[i] = n;
485  }
486
487  // Init fixed huffman tables
488  for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
489  len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
490  memset(toybuf, 5, 30);
491  len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
492}
493
494// Return true/false whether we consumed a gzip header.
495static int is_gzip(struct bitbuf *bb)
496{
497  int flags;
498
499  // Confirm signature
500  if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
501    return 0;
502  bitbuf_skip(bb, 6*8);
503
504  // Skip extra, name, comment, header CRC fields
505  if (flags & 4) bitbuf_skip(bb, 16);
506  if (flags & 8) while (bitbuf_get(bb, 8));
507  if (flags & 16) while (bitbuf_get(bb, 8));
508  if (flags & 2) bitbuf_skip(bb, 16);
509
510  return 1;
511}
512
513void gzip_crc(char *data, int len)
514{
515  int i;
516  unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
517
518  crc = TT.crc;
519  for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
520  TT.crc = crc;
521  TT.len += len;
522}
523
524static void do_gzip(int fd, char *name)
525{
526  struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
527
528  // Header from RFC 1952 section 2.2:
529  // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
530  // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
531  // Operating System (FF=unknown)
532
533  TT.infd = fd;
534  xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
535
536  // Use last 1k of toybuf for little endian crc table
537  crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
538  TT.crcfunc = gzip_crc;
539
540  deflate(bb);
541
542  // tail: crc32, len32
543
544  bitbuf_put(bb, 0, (8-bb->bitpos)&7);
545  bitbuf_put(bb, ~TT.crc, 32);
546  bitbuf_put(bb, TT.len, 32);
547
548  bitbuf_flush(bb);
549  free(bb);
550}
551
552static void do_zcat(int fd, char *name)
553{
554  struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
555
556  if (!is_gzip(bb)) error_exit("not gzip");
557  TT.outfd = 1;
558
559  // Use last 1k of toybuf for little endian crc table
560  crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
561  TT.crcfunc = gzip_crc;
562
563  inflate(bb);
564
565  // tail: crc32, len32
566
567  bitbuf_skip(bb, (8-bb->bitpos)&7);
568  if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
569    error_exit("bad crc");
570  free(bb);
571}
572
573// Parse many different kinds of command line argument:
574
575void compress_main(void)
576{
577  // todo: this
578  printf("hello world");
579}
580
581//#define CLEANUP_compress
582//#define FOR_zcat
583//#include "generated/flags.h"
584
585void zcat_main(void)
586{
587  init_deflate(0);
588
589  loopfiles(toys.optargs, do_zcat);
590}
591
592void gunzip_main(void)
593{
594  init_deflate(0);
595
596  loopfiles(toys.optargs, do_zcat);
597}
598
599void gzip_main(void)
600{
601  init_deflate(1);
602
603  loopfiles(toys.optargs, do_gzip);
604}
605