1/*************************************************************************
2* Name:        huffman.c
3* Author:      Marcus Geelnard
4* Description: Huffman coder/decoder implementation.
5* Reentrant:   Yes
6* $Id: huffman.c,v 1.6 2004/12/14 18:59:40 marcus256 Exp $
7*
8* This is a very straight forward implementation of a Huffman coder and
9* decoder.
10*
11* Primary flaws with this primitive implementation are:
12*  - Slow bit stream implementation
13*  - Fairly slow decoding (slower than encoding)
14*  - Maximum tree depth of 32 (the coder aborts if any code exceeds a
15*    size of 32 bits). If I'm not mistaking, this should not be possible
16*    unless the input buffer is larger than 2^32 bytes, which is not
17*    supported by the coder anyway (max 2^32-1 bytes can be specified with
18*    an unsigned 32-bit integer).
19*
20* On the other hand, there are a few advantages of this implementation:
21*  - The Huffman tree is stored in a very compact form, requiring only
22*    12 bits per symbol (for 8 bit symbols), meaning a maximum of 384
23*    bytes overhead.
24*  - The Huffman coder does quite well in situations where the data is
25*    noisy, in which case most dictionary based coders run into problems.
26*
27* Possible improvements (probably not worth it):
28*  - Partition the input data stream into blocks, where each block has
29*    its own Huffman tree. With variable block sizes, it should be
30*    possible to find locally optimal Huffman trees, which in turn could
31*    reduce the total size.
32*  - Allow for a few different predefined Huffman trees, which could
33*    reduce the size of a block even further.
34*-------------------------------------------------------------------------
35* Copyright (c) 2003-2011 Marcus Geelnard
36*
37* This software is provided 'as-is', without any express or implied
38* warranty. In no event will the authors be held liable for any damages
39* arising from the use of this software.
40*
41* Permission is granted to anyone to use this software for any purpose,
42* including commercial applications, and to alter it and redistribute it
43* freely, subject to the following restrictions:
44*
45* 1. The origin of this software must not be misrepresented; you must not
46*    claim that you wrote the original software. If you use this software
47*    in a product, an acknowledgment in the product documentation would
48*    be appreciated but is not required.
49*
50* 2. Altered source versions must be plainly marked as such, and must not
51*    be misrepresented as being the original software.
52*
53* 3. This notice may not be removed or altered from any source
54*    distribution.
55*
56* Marcus Geelnard
57* marcus.geelnard at home.se
58*************************************************************************/
59
60/* Modified May 06 by Julian Seward for use in Valgrind.
61   - changed integral types to V's versions (UInt, UChar etc)
62   - added initialisation in _Huffman_WriteBits, as described in
63     comment in that function.
64*/
65
66/*************************************************************************
67* Types used for Huffman coding
68*************************************************************************/
69
70typedef struct {
71    UInt Symbol;
72    UInt Count;
73    UInt Code;
74    UInt Bits;
75} huff_sym_t;
76
77typedef struct {
78    UChar *BytePtr;
79    UInt  BitPos;
80} huff_bitstream_t;
81
82
83
84/*************************************************************************
85*                           INTERNAL FUNCTIONS                           *
86*************************************************************************/
87
88
89/*************************************************************************
90* _Huffman_InitBitstream() - Initialize a bitstream.
91*************************************************************************/
92
93static void _Huffman_InitBitstream( huff_bitstream_t *stream,
94    UChar *buf )
95{
96    stream->BytePtr  = buf;
97    stream->BitPos   = 0;
98}
99
100
101/*************************************************************************
102* _Huffman_ReadBits() - Read bits from a bitstream.
103*************************************************************************/
104
105static UInt _Huffman_ReadBits( huff_bitstream_t *stream,
106    UInt bits )
107{
108    UInt  x, bit, count;
109    UChar *buf;
110
111    /* Get current stream state */
112    buf = stream->BytePtr;
113    bit = stream->BitPos;
114
115    /* Extract bits */
116    x = 0;
117    for( count = 0; count < bits; ++ count )
118    {
119        x = (x<<1) + (*buf & (1<<(7-bit)) ? 1 : 0);
120        bit = (bit+1) & 7;
121        if( !bit )
122        {
123            ++ buf;
124        }
125    }
126
127    /* Store new stream state */
128    stream->BytePtr = buf;
129    stream->BitPos  = bit;
130
131    return x;
132}
133
134
135/*************************************************************************
136* _Huffman_WriteBits() - Write bits to a bitstream.
137*************************************************************************/
138
139static void _Huffman_WriteBits( huff_bitstream_t *stream, UInt x,
140    UInt bits )
141{
142    UInt  bit, count;
143    UChar *buf;
144    UInt  mask;
145
146    /* Get current stream state */
147    buf = stream->BytePtr;
148    bit = stream->BitPos;
149
150    /* Append bits */
151    mask = 1 << (bits-1);
152    for( count = 0; count < bits; ++ count )
153    {
154        /* If we're starting a new byte, zero it out, so that the
155           resulting byte sequence looks completely defined from
156           Valgrind's point of view.  If this doesn't happen then the
157           last byte in the stream may look partially undefined. */
158        if (bit == 0)
159           *buf = 0;
160        *buf = (*buf & (0xff^(1<<(7-bit)))) +
161               ((x & mask ? 1 : 0) << (7-bit));
162        x <<= 1;
163        bit = (bit+1) & 7;
164        if( !bit )
165        {
166            ++ buf;
167        }
168    }
169
170    /* Store new stream state */
171    stream->BytePtr = buf;
172    stream->BitPos  = bit;
173}
174
175
176/*************************************************************************
177* _Huffman_Hist() - Calculate (sorted) histogram for a block of data.
178*************************************************************************/
179
180static void _Huffman_Hist( UChar *in, huff_sym_t *sym,
181    UInt size )
182{
183    Int k, swaps;
184    huff_sym_t tmp;
185
186    /* Clear/init histogram */
187    for( k = 0; k < 256; ++ k )
188    {
189        sym[k].Symbol = k;
190        sym[k].Count  = 0;
191        sym[k].Code   = 0;
192        sym[k].Bits   = 0;
193    }
194
195    /* Build histogram */
196    for( k = size; k; -- k )
197    {
198        sym[ *in ++ ].Count ++;
199    }
200
201    /* Sort histogram - most frequent symbol first (bubble sort) */
202    do
203    {
204        swaps = 0;
205        for( k = 0; k < 255; ++ k )
206        {
207            if( sym[k].Count < sym[k+1].Count )
208            {
209                tmp      = sym[k];
210                sym[k]   = sym[k+1];
211                sym[k+1] = tmp;
212                swaps    = 1;
213            }
214        }
215    }
216    while( swaps );
217}
218
219
220/*************************************************************************
221* _Huffman_MakeTree() - Generate a Huffman tree.
222*************************************************************************/
223
224static void _Huffman_MakeTree( huff_sym_t *sym, huff_bitstream_t *stream,
225    UInt code, UInt bits, UInt first,
226    UInt last )
227{
228    UInt k, size, size_a, size_b, last_a, first_b;
229
230    /* Is this a leaf node? */
231    if( first == last )
232    {
233        /* Append symbol to tree description */
234        _Huffman_WriteBits( stream, 1, 1 );
235        _Huffman_WriteBits( stream, sym[first].Symbol, 8 );
236
237        /* Store code info in symbol array */
238        sym[first].Code = code;
239        sym[first].Bits = bits;
240        return;
241    }
242    else
243    {
244        /* This was not a leaf node */
245        _Huffman_WriteBits( stream, 0, 1 );
246    }
247
248    /* Total size of interval */
249    size = 0;
250    for( k = first; k <= last; ++ k )
251    {
252        size += sym[k].Count;
253    }
254
255    /* Find size of branch a */
256    size_a = 0;
257    for( k = first; size_a < ((size+1)>>1) && k < last; ++ k )
258    {
259        size_a += sym[k].Count;
260    }
261
262    /* Non-empty branch? */
263    if( size_a > 0 )
264    {
265        /* Continue branching */
266        _Huffman_WriteBits( stream, 1, 1 );
267
268        /* Branch a cut in histogram */
269        last_a  = k-1;
270
271        /* Create branch a */
272        _Huffman_MakeTree( sym, stream, (code<<1)+0, bits+1,
273                               first, last_a );
274    }
275    else
276    {
277        /* This was an empty branch */
278        _Huffman_WriteBits( stream, 0, 1 );
279    }
280
281    /* Size of branch b */
282    size_b = size - size_a;
283
284    /* Non-empty branch? */
285    if( size_b > 0 )
286    {
287        /* Continue branching */
288        _Huffman_WriteBits( stream, 1, 1 );
289
290        /* Branch b cut in histogram */
291        first_b = k;
292
293        /* Create branch b */
294        _Huffman_MakeTree( sym, stream, (code<<1)+1, bits+1,
295                               first_b, last );
296    }
297    else
298    {
299        /* This was an empty branch */
300        _Huffman_WriteBits( stream, 0, 1 );
301    }
302}
303
304
305/*************************************************************************
306* _Huffman_RecoverTree() - Recover a Huffman tree from a bitstream.
307*************************************************************************/
308
309static void _Huffman_RecoverTree( huff_sym_t *sym,
310    huff_bitstream_t *stream, UInt code, UInt bits,
311    UInt *symnum )
312{
313    UInt symbol;
314
315    /* Is this a leaf node? */
316    if( _Huffman_ReadBits( stream, 1 ) )
317    {
318        /* Get symbol from tree description */
319        symbol = _Huffman_ReadBits( stream, 8 );
320
321        /* Store code info in symbol array */
322        sym[*symnum].Symbol = symbol;
323        sym[*symnum].Code   = code;
324        sym[*symnum].Bits   = bits;
325
326        /* Increase symbol counter */
327        *symnum = *symnum + 1;
328
329        return;
330    }
331
332    /* Non-empty branch? */
333    if( _Huffman_ReadBits( stream, 1 ) )
334    {
335        /* Create branch a */
336        _Huffman_RecoverTree( sym, stream, (code<<1)+0, bits+1,
337                              symnum );
338    }
339
340    /* Non-empty branch? */
341    if( _Huffman_ReadBits( stream, 1 ) )
342    {
343        /* Create branch b */
344        _Huffman_RecoverTree( sym, stream, (code<<1)+1, bits+1,
345                              symnum );
346    }
347}
348
349
350
351
352/*************************************************************************
353*                            PUBLIC FUNCTIONS                            *
354*************************************************************************/
355
356
357/*************************************************************************
358* Huffman_Compress() - Compress a block of data using a Huffman coder.
359*  in     - Input (uncompressed) buffer.
360*  out    - Output (compressed) buffer. This buffer must be 384 bytes
361*           larger than the input buffer.
362*  insize - Number of input bytes.
363* The function returns the size of the compressed data.
364*************************************************************************/
365static
366Int Huffman_Compress( UChar *in, UChar *out,
367    UInt insize )
368{
369    huff_sym_t       sym[ 256 ], tmp;
370    huff_bitstream_t stream;
371    UInt     k, total_bytes, swaps, symbol, last_symbol;
372
373    /* Do we have anything to compress? */
374    if( insize < 1 ) return 0;
375
376    /* Initialize bitstream */
377    _Huffman_InitBitstream( &stream, out );
378
379    /* Calculate and sort histogram for input data */
380    _Huffman_Hist( in, sym, insize );
381
382    /* Find number of used symbols */
383    for( last_symbol = 255; sym[last_symbol].Count == 0; -- last_symbol );
384
385    /* Special case: In order to build a correct tree, we need at least
386       two symbols (otherwise we get zero-bit representations). */
387    if( last_symbol == 0 ) ++ last_symbol;
388
389    /* Build Huffman tree */
390    _Huffman_MakeTree( sym, &stream, 0, 0, 0, last_symbol );
391
392    /* Was any code > 32 bits? (we do not handle that at present) */
393    for( k = 0; k < 255; ++ k )
394    {
395        if( sym[k].Bits > 32 )
396        {
397            return 0;
398        }
399    }
400
401    /* Sort histogram - first symbol first (bubble sort) */
402    do
403    {
404        swaps = 0;
405        for( k = 0; k < 255; ++ k )
406        {
407            if( sym[k].Symbol > sym[k+1].Symbol )
408            {
409                tmp      = sym[k];
410                sym[k]   = sym[k+1];
411                sym[k+1] = tmp;
412                swaps    = 1;
413            }
414        }
415    }
416    while( swaps );
417
418    /* Encode input stream */
419    for( k = 0; k < insize; ++ k )
420    {
421        symbol = in[ k ];
422        _Huffman_WriteBits( &stream, sym[symbol].Code,
423                            sym[symbol].Bits );
424    }
425
426    /* Calculate size of output data */
427    total_bytes = (Int)(stream.BytePtr - out);
428    if( stream.BitPos > 0 )
429    {
430        ++ total_bytes;
431    }
432
433    return total_bytes;
434}
435
436
437
438/*************************************************************************
439* Huffman_Uncompress() - Uncompress a block of data using a Huffman
440* decoder.
441*  in      - Input (compressed) buffer.
442*  out     - Output (uncompressed) buffer. This buffer must be large
443*            enough to hold the uncompressed data.
444*  insize  - Number of input bytes.
445*  outsize - Number of output bytes.
446*************************************************************************/
447static
448void Huffman_Uncompress( UChar *in, UChar *out,
449    UInt insize, UInt outsize )
450{
451    huff_sym_t       sym[ 256 ], tmp;
452    huff_bitstream_t stream;
453    UInt     k, m, symbol_count, swaps;
454    UChar    *buf;
455    UInt     bits, delta_bits, new_bits, code;
456
457    /* Do we have anything to decompress? */
458    if( insize < 1 ) return;
459
460    /* Initialize bitstream */
461    _Huffman_InitBitstream( &stream, in );
462
463    /* Clear tree/histogram */
464    for( k = 0; k < 256; ++ k )
465    {
466        sym[k].Bits = 0x7fffffff;
467    }
468
469    /* Recover Huffman tree */
470    symbol_count = 0;
471    _Huffman_RecoverTree( sym, &stream, 0, 0, &symbol_count );
472
473    /* Sort histogram - shortest code first (bubble sort) */
474    do
475    {
476        swaps = 0;
477        for( k = 0; k < symbol_count-1; ++ k )
478        {
479            if( sym[k].Bits > sym[k+1].Bits )
480            {
481                tmp      = sym[k];
482                sym[k]   = sym[k+1];
483                sym[k+1] = tmp;
484                swaps    = 1;
485            }
486        }
487    }
488    while( swaps );
489
490    /* Decode input stream */
491    buf = out;
492    for( k = 0; k < outsize; ++ k )
493    {
494        /* Search tree for matching code */
495        bits = 0;
496        code = 0;
497        for( m = 0; m < symbol_count; ++ m )
498        {
499            delta_bits = sym[m].Bits - bits;
500            if( delta_bits )
501            {
502                new_bits = _Huffman_ReadBits( &stream, delta_bits );
503                code = code | (new_bits << (32-bits-delta_bits));
504                bits = sym[m].Bits;
505            }
506            if( code == (sym[m].Code << (32-sym[m].Bits)) )
507            {
508                *buf ++ = (UChar) sym[m].Symbol;
509                break;
510            }
511        }
512    }
513}
514