1/************************************************************************* 2* Name: huffman.c 3* Author: Marcus Geelnard 4* Description: Huffman coder/decoder implementation. 5* Reentrant: Yes 6* $Id: huffman.c,v 1.6 2004/12/14 18:59:40 marcus256 Exp $ 7* 8* This is a very straight forward implementation of a Huffman coder and 9* decoder. 10* 11* Primary flaws with this primitive implementation are: 12* - Slow bit stream implementation 13* - Fairly slow decoding (slower than encoding) 14* - Maximum tree depth of 32 (the coder aborts if any code exceeds a 15* size of 32 bits). If I'm not mistaking, this should not be possible 16* unless the input buffer is larger than 2^32 bytes, which is not 17* supported by the coder anyway (max 2^32-1 bytes can be specified with 18* an unsigned 32-bit integer). 19* 20* On the other hand, there are a few advantages of this implementation: 21* - The Huffman tree is stored in a very compact form, requiring only 22* 12 bits per symbol (for 8 bit symbols), meaning a maximum of 384 23* bytes overhead. 24* - The Huffman coder does quite well in situations where the data is 25* noisy, in which case most dictionary based coders run into problems. 26* 27* Possible improvements (probably not worth it): 28* - Partition the input data stream into blocks, where each block has 29* its own Huffman tree. With variable block sizes, it should be 30* possible to find locally optimal Huffman trees, which in turn could 31* reduce the total size. 32* - Allow for a few different predefined Huffman trees, which could 33* reduce the size of a block even further. 34*------------------------------------------------------------------------- 35* Copyright (c) 2003-2011 Marcus Geelnard 36* 37* This software is provided 'as-is', without any express or implied 38* warranty. In no event will the authors be held liable for any damages 39* arising from the use of this software. 40* 41* Permission is granted to anyone to use this software for any purpose, 42* including commercial applications, and to alter it and redistribute it 43* freely, subject to the following restrictions: 44* 45* 1. The origin of this software must not be misrepresented; you must not 46* claim that you wrote the original software. If you use this software 47* in a product, an acknowledgment in the product documentation would 48* be appreciated but is not required. 49* 50* 2. Altered source versions must be plainly marked as such, and must not 51* be misrepresented as being the original software. 52* 53* 3. This notice may not be removed or altered from any source 54* distribution. 55* 56* Marcus Geelnard 57* marcus.geelnard at home.se 58*************************************************************************/ 59 60/* Modified May 06 by Julian Seward for use in Valgrind. 61 - changed integral types to V's versions (UInt, UChar etc) 62 - added initialisation in _Huffman_WriteBits, as described in 63 comment in that function. 64*/ 65 66/************************************************************************* 67* Types used for Huffman coding 68*************************************************************************/ 69 70typedef struct { 71 UInt Symbol; 72 UInt Count; 73 UInt Code; 74 UInt Bits; 75} huff_sym_t; 76 77typedef struct { 78 UChar *BytePtr; 79 UInt BitPos; 80} huff_bitstream_t; 81 82 83 84/************************************************************************* 85* INTERNAL FUNCTIONS * 86*************************************************************************/ 87 88 89/************************************************************************* 90* _Huffman_InitBitstream() - Initialize a bitstream. 91*************************************************************************/ 92 93static void _Huffman_InitBitstream( huff_bitstream_t *stream, 94 UChar *buf ) 95{ 96 stream->BytePtr = buf; 97 stream->BitPos = 0; 98} 99 100 101/************************************************************************* 102* _Huffman_ReadBits() - Read bits from a bitstream. 103*************************************************************************/ 104 105static UInt _Huffman_ReadBits( huff_bitstream_t *stream, 106 UInt bits ) 107{ 108 UInt x, bit, count; 109 UChar *buf; 110 111 /* Get current stream state */ 112 buf = stream->BytePtr; 113 bit = stream->BitPos; 114 115 /* Extract bits */ 116 x = 0; 117 for( count = 0; count < bits; ++ count ) 118 { 119 x = (x<<1) + (*buf & (1<<(7-bit)) ? 1 : 0); 120 bit = (bit+1) & 7; 121 if( !bit ) 122 { 123 ++ buf; 124 } 125 } 126 127 /* Store new stream state */ 128 stream->BytePtr = buf; 129 stream->BitPos = bit; 130 131 return x; 132} 133 134 135/************************************************************************* 136* _Huffman_WriteBits() - Write bits to a bitstream. 137*************************************************************************/ 138 139static void _Huffman_WriteBits( huff_bitstream_t *stream, UInt x, 140 UInt bits ) 141{ 142 UInt bit, count; 143 UChar *buf; 144 UInt mask; 145 146 /* Get current stream state */ 147 buf = stream->BytePtr; 148 bit = stream->BitPos; 149 150 /* Append bits */ 151 mask = 1 << (bits-1); 152 for( count = 0; count < bits; ++ count ) 153 { 154 /* If we're starting a new byte, zero it out, so that the 155 resulting byte sequence looks completely defined from 156 Valgrind's point of view. If this doesn't happen then the 157 last byte in the stream may look partially undefined. */ 158 if (bit == 0) 159 *buf = 0; 160 *buf = (*buf & (0xff^(1<<(7-bit)))) + 161 ((x & mask ? 1 : 0) << (7-bit)); 162 x <<= 1; 163 bit = (bit+1) & 7; 164 if( !bit ) 165 { 166 ++ buf; 167 } 168 } 169 170 /* Store new stream state */ 171 stream->BytePtr = buf; 172 stream->BitPos = bit; 173} 174 175 176/************************************************************************* 177* _Huffman_Hist() - Calculate (sorted) histogram for a block of data. 178*************************************************************************/ 179 180static void _Huffman_Hist( UChar *in, huff_sym_t *sym, 181 UInt size ) 182{ 183 Int k, swaps; 184 huff_sym_t tmp; 185 186 /* Clear/init histogram */ 187 for( k = 0; k < 256; ++ k ) 188 { 189 sym[k].Symbol = k; 190 sym[k].Count = 0; 191 sym[k].Code = 0; 192 sym[k].Bits = 0; 193 } 194 195 /* Build histogram */ 196 for( k = size; k; -- k ) 197 { 198 sym[ *in ++ ].Count ++; 199 } 200 201 /* Sort histogram - most frequent symbol first (bubble sort) */ 202 do 203 { 204 swaps = 0; 205 for( k = 0; k < 255; ++ k ) 206 { 207 if( sym[k].Count < sym[k+1].Count ) 208 { 209 tmp = sym[k]; 210 sym[k] = sym[k+1]; 211 sym[k+1] = tmp; 212 swaps = 1; 213 } 214 } 215 } 216 while( swaps ); 217} 218 219 220/************************************************************************* 221* _Huffman_MakeTree() - Generate a Huffman tree. 222*************************************************************************/ 223 224static void _Huffman_MakeTree( huff_sym_t *sym, huff_bitstream_t *stream, 225 UInt code, UInt bits, UInt first, 226 UInt last ) 227{ 228 UInt k, size, size_a, size_b, last_a, first_b; 229 230 /* Is this a leaf node? */ 231 if( first == last ) 232 { 233 /* Append symbol to tree description */ 234 _Huffman_WriteBits( stream, 1, 1 ); 235 _Huffman_WriteBits( stream, sym[first].Symbol, 8 ); 236 237 /* Store code info in symbol array */ 238 sym[first].Code = code; 239 sym[first].Bits = bits; 240 return; 241 } 242 else 243 { 244 /* This was not a leaf node */ 245 _Huffman_WriteBits( stream, 0, 1 ); 246 } 247 248 /* Total size of interval */ 249 size = 0; 250 for( k = first; k <= last; ++ k ) 251 { 252 size += sym[k].Count; 253 } 254 255 /* Find size of branch a */ 256 size_a = 0; 257 for( k = first; size_a < ((size+1)>>1) && k < last; ++ k ) 258 { 259 size_a += sym[k].Count; 260 } 261 262 /* Non-empty branch? */ 263 if( size_a > 0 ) 264 { 265 /* Continue branching */ 266 _Huffman_WriteBits( stream, 1, 1 ); 267 268 /* Branch a cut in histogram */ 269 last_a = k-1; 270 271 /* Create branch a */ 272 _Huffman_MakeTree( sym, stream, (code<<1)+0, bits+1, 273 first, last_a ); 274 } 275 else 276 { 277 /* This was an empty branch */ 278 _Huffman_WriteBits( stream, 0, 1 ); 279 } 280 281 /* Size of branch b */ 282 size_b = size - size_a; 283 284 /* Non-empty branch? */ 285 if( size_b > 0 ) 286 { 287 /* Continue branching */ 288 _Huffman_WriteBits( stream, 1, 1 ); 289 290 /* Branch b cut in histogram */ 291 first_b = k; 292 293 /* Create branch b */ 294 _Huffman_MakeTree( sym, stream, (code<<1)+1, bits+1, 295 first_b, last ); 296 } 297 else 298 { 299 /* This was an empty branch */ 300 _Huffman_WriteBits( stream, 0, 1 ); 301 } 302} 303 304 305/************************************************************************* 306* _Huffman_RecoverTree() - Recover a Huffman tree from a bitstream. 307*************************************************************************/ 308 309static void _Huffman_RecoverTree( huff_sym_t *sym, 310 huff_bitstream_t *stream, UInt code, UInt bits, 311 UInt *symnum ) 312{ 313 UInt symbol; 314 315 /* Is this a leaf node? */ 316 if( _Huffman_ReadBits( stream, 1 ) ) 317 { 318 /* Get symbol from tree description */ 319 symbol = _Huffman_ReadBits( stream, 8 ); 320 321 /* Store code info in symbol array */ 322 sym[*symnum].Symbol = symbol; 323 sym[*symnum].Code = code; 324 sym[*symnum].Bits = bits; 325 326 /* Increase symbol counter */ 327 *symnum = *symnum + 1; 328 329 return; 330 } 331 332 /* Non-empty branch? */ 333 if( _Huffman_ReadBits( stream, 1 ) ) 334 { 335 /* Create branch a */ 336 _Huffman_RecoverTree( sym, stream, (code<<1)+0, bits+1, 337 symnum ); 338 } 339 340 /* Non-empty branch? */ 341 if( _Huffman_ReadBits( stream, 1 ) ) 342 { 343 /* Create branch b */ 344 _Huffman_RecoverTree( sym, stream, (code<<1)+1, bits+1, 345 symnum ); 346 } 347} 348 349 350 351 352/************************************************************************* 353* PUBLIC FUNCTIONS * 354*************************************************************************/ 355 356 357/************************************************************************* 358* Huffman_Compress() - Compress a block of data using a Huffman coder. 359* in - Input (uncompressed) buffer. 360* out - Output (compressed) buffer. This buffer must be 384 bytes 361* larger than the input buffer. 362* insize - Number of input bytes. 363* The function returns the size of the compressed data. 364*************************************************************************/ 365static 366Int Huffman_Compress( UChar *in, UChar *out, 367 UInt insize ) 368{ 369 huff_sym_t sym[ 256 ], tmp; 370 huff_bitstream_t stream; 371 UInt k, total_bytes, swaps, symbol, last_symbol; 372 373 /* Do we have anything to compress? */ 374 if( insize < 1 ) return 0; 375 376 /* Initialize bitstream */ 377 _Huffman_InitBitstream( &stream, out ); 378 379 /* Calculate and sort histogram for input data */ 380 _Huffman_Hist( in, sym, insize ); 381 382 /* Find number of used symbols */ 383 for( last_symbol = 255; sym[last_symbol].Count == 0; -- last_symbol ); 384 385 /* Special case: In order to build a correct tree, we need at least 386 two symbols (otherwise we get zero-bit representations). */ 387 if( last_symbol == 0 ) ++ last_symbol; 388 389 /* Build Huffman tree */ 390 _Huffman_MakeTree( sym, &stream, 0, 0, 0, last_symbol ); 391 392 /* Was any code > 32 bits? (we do not handle that at present) */ 393 for( k = 0; k < 255; ++ k ) 394 { 395 if( sym[k].Bits > 32 ) 396 { 397 return 0; 398 } 399 } 400 401 /* Sort histogram - first symbol first (bubble sort) */ 402 do 403 { 404 swaps = 0; 405 for( k = 0; k < 255; ++ k ) 406 { 407 if( sym[k].Symbol > sym[k+1].Symbol ) 408 { 409 tmp = sym[k]; 410 sym[k] = sym[k+1]; 411 sym[k+1] = tmp; 412 swaps = 1; 413 } 414 } 415 } 416 while( swaps ); 417 418 /* Encode input stream */ 419 for( k = 0; k < insize; ++ k ) 420 { 421 symbol = in[ k ]; 422 _Huffman_WriteBits( &stream, sym[symbol].Code, 423 sym[symbol].Bits ); 424 } 425 426 /* Calculate size of output data */ 427 total_bytes = (Int)(stream.BytePtr - out); 428 if( stream.BitPos > 0 ) 429 { 430 ++ total_bytes; 431 } 432 433 return total_bytes; 434} 435 436 437 438/************************************************************************* 439* Huffman_Uncompress() - Uncompress a block of data using a Huffman 440* decoder. 441* in - Input (compressed) buffer. 442* out - Output (uncompressed) buffer. This buffer must be large 443* enough to hold the uncompressed data. 444* insize - Number of input bytes. 445* outsize - Number of output bytes. 446*************************************************************************/ 447static 448void Huffman_Uncompress( UChar *in, UChar *out, 449 UInt insize, UInt outsize ) 450{ 451 huff_sym_t sym[ 256 ], tmp; 452 huff_bitstream_t stream; 453 UInt k, m, symbol_count, swaps; 454 UChar *buf; 455 UInt bits, delta_bits, new_bits, code; 456 457 /* Do we have anything to decompress? */ 458 if( insize < 1 ) return; 459 460 /* Initialize bitstream */ 461 _Huffman_InitBitstream( &stream, in ); 462 463 /* Clear tree/histogram */ 464 for( k = 0; k < 256; ++ k ) 465 { 466 sym[k].Bits = 0x7fffffff; 467 } 468 469 /* Recover Huffman tree */ 470 symbol_count = 0; 471 _Huffman_RecoverTree( sym, &stream, 0, 0, &symbol_count ); 472 473 /* Sort histogram - shortest code first (bubble sort) */ 474 do 475 { 476 swaps = 0; 477 for( k = 0; k < symbol_count-1; ++ k ) 478 { 479 if( sym[k].Bits > sym[k+1].Bits ) 480 { 481 tmp = sym[k]; 482 sym[k] = sym[k+1]; 483 sym[k+1] = tmp; 484 swaps = 1; 485 } 486 } 487 } 488 while( swaps ); 489 490 /* Decode input stream */ 491 buf = out; 492 for( k = 0; k < outsize; ++ k ) 493 { 494 /* Search tree for matching code */ 495 bits = 0; 496 code = 0; 497 for( m = 0; m < symbol_count; ++ m ) 498 { 499 delta_bits = sym[m].Bits - bits; 500 if( delta_bits ) 501 { 502 new_bits = _Huffman_ReadBits( &stream, delta_bits ); 503 code = code | (new_bits << (32-bits-delta_bits)); 504 bits = sym[m].Bits; 505 } 506 if( code == (sym[m].Code << (32-sym[m].Bits)) ) 507 { 508 *buf ++ = (UChar) sym[m].Symbol; 509 break; 510 } 511 } 512 } 513} 514