1#!python3
2"""Program to dump contents of Brotli compressed files showing the compression format.
3Jurjen N.E. Bos, 2016.
4I found the following issues with the Brotli format:
5- The distance alphabet has size 16+(48<<POSTFIX),
6  but the last symbols are useless.
7  It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
8- The block type code is useless if NBLTYPES==2, you would only need 1 symbol
9  anyway, so why don't you just switch to "the other" type?
10"""
11import struct
12from operator import itemgetter, methodcaller
13from itertools import accumulate, repeat
14from collections import defaultdict, deque
15from functools import partial
16
17class InvalidStream(Exception): pass
18#lookup table
19L, I, D = "literal", "insert&copy", "distance"
20pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
21
22def outputCharFormatter(c):
23    """Show character in readable format
24    """
25    #TODO 2: allow hex only output
26    if 32<c<127: return chr(c)
27    elif c==10: return '\\n'
28    elif c==13: return '\\r'
29    elif c==32: return '" "'
30    else: return '\\x{:02x}'.format(c)
31
32def outputFormatter(s):
33    """Show string or char.
34    """
35    result = ''
36    def formatSubString(s):
37        for c in s:
38            if c==32: yield ' '
39            else: yield outputCharFormatter(c)
40    if len(result)<200: return ''.join(formatSubString(s))
41    else:
42        return ''.join(formatSubString(s[:100]))+'...'+ \
43               ''.join(formatSubString(s[-100:]))
44
45
46class BitStream:
47    """Represent a bytes object. Can read bits and prefix codes the way
48    Brotli does.
49    """
50    def __init__(self, byteString):
51        self.data = byteString
52        #position in bits: byte pos is pos>>3, bit pos is pos&7
53        self.pos = 0
54
55    def __repr__(self):
56        """Representation
57        >>> olleke
58        BitStream(pos=0:0)
59        """
60        return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7)
61
62    def read(self, n):
63        """Read n bits from the stream and return as an integer.
64        Produces zero bits beyond the stream.
65        >>> olleke.data[0]==27
66        True
67        >>> olleke.read(5)
68        27
69
70        >>> olleke
71        BitStream(pos=0:5)
72        """
73        value = self.peek(n)
74        self.pos += n
75        if self.pos>len(self.data)*8:
76            raise ValueError('Read past end of stream')
77        return value
78
79    def peek(self, n):
80        """Peek an n bit integer from the stream without updating the pointer.
81        It is not an error to read beyond the end of the stream.
82        >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
83        True
84        >>> olleke.peek(15)
85        11803
86        >>> hex(olleke.peek(32))
87        '0x2e1b'
88        """
89        #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3]
90        #convert to int: int.from_bytes(..., 'little')
91        #shift out the bits from the first byte: >>(self.pos&7)
92        #mask unwanted bits: & (1<<n)-1
93        return int.from_bytes(
94            self.data[self.pos>>3:self.pos+n+7>>3],
95            'little')>>(self.pos&7) & (1<<n)-1
96
97    def readBytes(self, n):
98        """Read n bytes from the stream on a byte boundary.
99        """
100        if self.pos&7: raise ValueError('readBytes: need byte boundary')
101        result = self.data[self.pos>>3:(self.pos>>3)+n]
102        self.pos += 8*n
103        return result
104
105#-----------------------Symbol-------------------------------------------
106class Symbol:
107    """A symbol in a code.
108    Refers back to the code that contains it.
109    Index is the place in the alphabet of the symbol.
110    """
111    __slots__ = 'code', 'index'
112    def __init__(self, code, value):
113        self.code = code
114        self.index = value
115
116    def __repr__(self):
117        return 'Symbol({}, {})'.format(self.code.name, self.index)
118
119    def __len__(self):
120        """Number of bits in the prefix notation of this symbol
121        """
122        return self.code.length(self.index)
123
124    def __int__(self):
125        return self.index
126
127    #these routines call equivalent routine in Code class
128    def bitPattern(self):
129        """Value of the symbol in the stream
130        """
131        return self.code.bitPattern(self.index)
132
133    def extraBits(self):
134        """Number of extra bits to read for this symbol
135        """
136        return self.code.extraBits(self.index)
137
138    def __str__(self):
139        """Short descriptor of the symbol without extra bits.
140        """
141        return self.code.mnemonic(self.index)
142
143    #requiring optional extra bits, if self.code supports them
144    def value(self, extra=None):
145        """The value used for processing. Can be a tuple.
146        with optional extra bits
147        """
148        if isinstance(self.code, WithExtra):
149            if not 0<=extra<1<<self.extraBits():
150                raise ValueError("value: extra value doesn't fit in extraBits")
151            return self.code.value(self.index, extra)
152        if extra is not None:
153            raise ValueError('value: no extra bits for this code')
154        return self.code.value(self.index)
155
156    def explanation(self, extra=None):
157        """Long explanation of the value from the numeric value
158        with optional extra bits
159        Used by Layout.verboseRead when printing the value
160        """
161        if isinstance(self.code, WithExtra):
162            return self.code.callback(self, extra)
163        return self.code.callback(self)
164
165#========================Code definitions==================================
166class RangeDecoder:
167    """A decoder for the Code class that assumes the symbols
168    are encoded consecutively in binary.
169    It all depends on the "alphabetSize" property.
170    The range runs from 0 to alphabetSize-1.
171    This is the default decoder.
172    """
173    def __init__(self, *, alphabetSize=None, bitLength=None, **args):
174        if bitLength is not None: alphabetSize = 1<<bitLength
175        if alphabetSize is not None:
176            self.alphabetSize = alphabetSize
177            self.maxLength = (alphabetSize-1).bit_length()
178
179    def __len__(self):
180        return self.alphabetSize
181
182    def __iter__(self):
183        """Produce all symbols.
184        """
185        return map(partial(Symbol, self), range(len(self)))
186
187    def __getitem__(self, index):
188        if index>=self.alphabetSize: raise ValueError('index out of range')
189        return Symbol(self, index)
190
191    def bitPattern(self, index):
192        return '{:0{}b}'.format(index, self.maxLength)
193
194    def length(self, index):
195        """Encoding length of given symbol.
196        Does not depend on index in this case.
197        """
198        return self.maxLength
199
200    def decodePeek(self, data):
201        """Find which symbol index matches the given data (from peek, as a number)
202        and return the number of bits decoded.
203        Can also be used to figure out length of a symbol.
204        """
205        return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1)
206
207class PrefixDecoder:
208    """A decoder for the Code class that uses a prefix code.
209    The code is determined by encoding:
210    encode[p] gives the index corresponding to bit pattern p.
211    Used setDecode(decodeTable) to switch the decoder from the default
212    to a prefix decoder, or pass decodeTable at init.
213    You can also use setLength(lengthTable)
214    to define the encoding from the lengths.
215    The set of symbol values does not need to be consecutive.
216    """
217    def __init__(self, *, decodeTable=None, **args):
218        if decodeTable is not None: self.setDecode(decodeTable)
219
220    def __len__(self):
221        return len(self.decodeTable)
222
223    def __iter__(self):
224        def revBits(index):
225            return self.bitPattern(index)[::-1]
226        return (
227            Symbol(self, index)
228            for index in sorted(self.decodeTable.values(), key=revBits)
229            )
230
231    def __getitem__(self, index):
232        if index not in self.lengthTable:
233            raise ValueError('No symbol {}[{}]'.format(
234                self.__class__.__name__, index))
235        return Symbol(self, index)
236
237    def bitPattern(self, index):
238        bits = next(b for (b,s) in self.decodeTable.items() if s==index)
239        return '{:0{}b}'.format(bits, self.length(index))
240
241    def length(self, index):
242        """Encoding length of given symbol.
243        """
244        return self.lengthTable[index]
245
246    def decodePeek(self, data):
247        """Find which symbol index matches the given data (from peek, as a number)
248        and return the number of bits decoded.
249        Can also be used to figure out length of a symbol.
250        """
251        #do binary search for word length
252        #invariant: lo<=length<=hi
253        lo, hi = self.minLength, self.maxLength
254        while lo<=hi:
255            mid = lo+hi>>1
256            #note lo<=mid<hi at this point
257            mask = (1<<mid)-1
258            #lets see what happens if we guess length is mid
259            try: index = self.decodeTable[data&mask]
260            except KeyError:
261                #too many bits specified, reduce estimated length
262                hi = mid-1
263                continue
264            #we found a symbol, but there could be a longer match
265            symbolLength = self.lengthTable[index]
266            if symbolLength<=mid:
267                #all bits match, symbol must be right
268                return symbolLength, Symbol(self, index)
269            #there must be more bits to match
270            lo = mid+1
271        return lo, Symbol(self, index)
272
273    #routine to set up the tables
274    def setDecode(self, decodeTable):
275        """Store decodeTable,
276        and compute lengthTable, minLength, maxLength from encodings.
277        """
278        self.decodeTable = decodeTable
279        #set of symbols with unknown length
280        todo = set(decodeTable)
281        #bit size under investigation
282        maskLength = 0
283        lengthTable = {}
284        while todo:
285            mask = (1<<maskLength)-1
286            #split the encodings that we didn't find yet using b bits
287            splitSymbols = defaultdict(list)
288            for s in todo: splitSymbols[s&mask].append(s)
289            #unique encodings have a length of maskLength bits
290            #set length, and remove from todo list
291            for s,subset in splitSymbols.items():
292                if len(subset)==1:
293                    lengthTable[self.decodeTable[s]] = maskLength
294                    todo.remove(s)
295            #now investigate with longer mask
296            maskLength +=1
297        #save result
298        self.lengthTable = lengthTable
299        self.minLength = min(lengthTable.values())
300        self.maxLength = max(lengthTable.values())
301        self.switchToPrefix()
302
303    def setLength(self, lengthTable):
304        """Given the bit pattern lengths for symbols given in lengthTable,
305        set decodeTable, minLength, maxLength
306        """
307        self.lengthTable = lengthTable
308        self.minLength = min(lengthTable.values())
309        self.maxLength = max(lengthTable.values())
310        #compute the backwards codes first; then reverse them
311        #compute (backwards) first code for every separate lengths
312        nextCodes = []
313        #build codes for each length, from right to left
314        code = 0
315        for bits in range(self.maxLength+1):
316            code <<= 1
317            nextCodes.append(code)
318            code += sum(x==bits for x in lengthTable.values())
319        self.decodeTable = {}
320        #count codes for each length, and store reversed in the table
321        for symbol in sorted(lengthTable):
322            bits = lengthTable[symbol]
323            bitpattern = '{:0{}b}'.format(nextCodes[bits], bits)
324            self.decodeTable[int(bitpattern[::-1], 2)] = symbol
325            nextCodes[bits] += 1
326        self.switchToPrefix()
327
328    def switchToPrefix(self):
329        """This routine makes sure the prefix decoder is activated.
330        """
331        self.mode = PrefixDecoder
332
333class Code(RangeDecoder, PrefixDecoder):
334    """An alphabet of symbols, that can be read from a stream.
335    If you use setDecode or setLength, you have a prefix code,
336    otherwise you have a range code.
337    Features:
338    code[index] produces symbol with given index
339    value(index): value of symbol
340    mnemonic(index): short description of symbol
341    explanation(index): show meaning of symbol, shown in Layout.verboseRead
342    iter(code): produce all symbols in some order
343    name: show as context in Layout.verboseRead
344    """
345    name = '?'
346    #callback is a function that gets the symbol and the extra bits
347    #default callback calls explanation
348    def __init__(self, name=None, *, callback=None, description='', **args):
349        """Don't forget to set either alphabetSize or decodeTable
350        """
351        #set name when provided, otherwise take class variable
352        if name is not None: self.name = name
353        if callback is not None: self.callback = callback
354        self.description = description
355        #mode switch
356        if 'bitLength' in args or 'alphabetSize' in args:
357            self.mode = RangeDecoder
358            RangeDecoder.__init__(self, **args)
359        elif 'decodeTable' in args:
360            self.mode = PrefixDecoder
361            PrefixDecoder.__init__(self, **args)
362        else:
363            super().__init__(**args)
364
365    def __repr__(self):
366        return self.__class__.__name__+' '+self.name
367
368    #the routines that get switched between RangeDecoder and PrefixDecoder
369    def __len__(self): return self.mode.__len__(self)
370    def __iter__(self): return self.mode.__iter__(self)
371    def __getitem__(self, index): return self.mode.__getitem__(self, index)
372    def bitPattern(self, index): return self.mode.bitPattern(self, index)
373    def length(self, index): return self.mode.length(self, index)
374    def decodePeek(self, data): return self.mode.decodePeek(self, data)
375    #general routines
376    def value(self, index, extra=None):
377        """Get value of symbol for computations.
378        Override where needed.
379        """
380        if extra is not None:
381            raise ValueError('value: no extra for this symbol')
382        return index
383
384    def mnemonic(self, index):
385        """Give mnemonic of symbol.
386        Override where needed.
387        """
388        return str(self.value(index))
389
390    def callback(self, symbol):
391        return self.explanation(symbol.index)
392
393    def explanation(self, index):
394        """Long explanation of the value from the numeric value
395        This is a default routine.
396        You can customize in three ways:
397        - set description to add some text
398        - override to get more control
399        - set callback to make it dependent on you local variables
400        """
401        value = self.value(index)
402        return '{0}{1}: {2}'.format(
403            self.description and self.description+': ',
404            self.bitPattern(index),
405            value,
406            )
407
408    def extraBits(self, index):
409        return 0
410
411    #Routines that use the decode interface
412    def showCode(self, width=80):
413        """Show all words of the code in a nice format.
414        """
415        #make table of all symbols with binary strings
416        symbolStrings = [
417            (self.bitPattern(s.index), self.mnemonic(s.index))
418            for s in self
419            ]
420        #determine column widths the way Lisp programmers do it
421        leftColWidth, rightColWidth = map(max, map(
422            map,
423            repeat(len),
424            zip(*symbolStrings)
425            ))
426        colwidth = leftColWidth+rightColWidth
427        columns = 81//(colwidth+2)
428        rows = -(-len(symbolStrings)//columns)
429        def justify(bs):
430            b,s = bs
431            return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
432        for i in range(rows):
433            print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())
434
435    def readTuple(self, stream):
436        """Read symbol from stream. Returns symbol, length.
437        """
438        length, symbol = self.decodePeek(stream.peek(self.maxLength))
439        stream.pos += length
440        return length, symbol
441
442    def readTupleAndExtra(self, stream):
443        return self.readTuple(stream)+(0, None)
444
445class WithExtra(Code):
446    """Extension for Code so that symbol may have extra bits associated.
447    If you supply an extraTable, you can use extraBits
448    You can define an extraTable,
449    which allows to call extraBits to get the number of extraBits.
450    Otherwise, you can supply extraBits yourself.
451    Routine readTupleAndExtra now reads the extra bits too.
452    Value probably needs to be overridden; see Enumerator.
453    Note: this does not give you an decodeTable.
454    """
455    #redefine these if you don't want to use an extraTable
456    def extraBits(self, index):
457        """Get the number of extra bits for this symbol.
458        """
459        return self.extraTable[index]
460
461    def mnemonic(self, index):
462        """This value must be independent of extra.
463        """
464        return str(index)
465
466    def readTupleAndExtra(self, stream):
467        """Read symbol and extrabits from stream.
468        Returns symbol length, symbol, extraBits, extra
469        >>> olleke.pos = 6
470        >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
471        (2, Symbol(MLEN, 4), 16, 46)
472        """
473        length, symbol = self.decodePeek(stream.peek(self.maxLength))
474        stream.pos += length
475        extraBits = self.extraBits(symbol.index)
476        return length, symbol, extraBits, stream.read(extraBits)
477
478    def explanation(self, index, extra=None):
479        """Expanded version of Code.explanation supporting extra bits.
480        If you don't supply extra, it is not mentioned.
481        """
482        extraBits = 0 if extra is None else self.extraBits(index)
483        if not hasattr(self, 'extraTable'):
484            formatString = '{0}{3}'
485            lo = hi = value = self.value(index, extra)
486        elif extraBits==0:
487            formatString = '{0}{2}: {3}'
488            lo, hi = self.span(index)
489            value = lo
490        else:
491            formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
492            lo, hi = self.span(index)
493            value = lo+extra
494        return formatString.format(
495            self.description and self.description+': ',
496            'x'*extraBits,
497            self.bitPattern(index),
498            lo, hi,
499            extra,
500            value,
501            )
502
503    def callback(self, symbol, extra):
504        return self.explanation(symbol.index, extra)
505
506class BoolCode(Code):
507    """Same as Code(bitLength=1), but shows a boolean.
508    """
509    def __init__(self, name=None, **args):
510        super().__init__(name, bitLength=1, **args)
511
512    def value(self, index, extra=None):
513        return bool(super().value(index, extra))
514
515class Enumerator(WithExtra):
516    """Code that is defined by the ExtraTable.
517    extraTable is a class variable that contains
518    the extraBits of the symbols from 0
519    value0 contains the value of symbol 0
520    encodings is not neccessary, but allowed.
521    Note: place for FixedCode to make sure extraBits works
522    """
523    def __init__(self, name=None, **args):
524        #if there is no decodeTable to determine length, compute it ourselves
525        if 'decodeTable' not in args:
526            args['alphabetSize'] = len(self.extraTable)
527        super().__init__(name, **args)
528
529    def __len__(self):
530        return len(self.extraTable)
531
532    def __getitem__(self, index):
533        """Faster than PrefixDecoder
534        """
535        if index>=len(self.extraTable):
536            raise ValueError("No symbol {}[{}]".format(
537                self.__class__.__name__, index))
538        return Symbol(self, index)
539
540    def value(self, index, extra):
541        """Override if you don't define value0 and extraTable
542        """
543        lower, upper = self.span(index)
544        value = lower+(extra or 0)
545        if value>upper:
546            raise ValueError('value: extra out of range')
547        return value
548
549    def span(self, index):
550        """Give the range of possible values in a tuple
551        Useful for mnemonic and explanation
552        """
553        lower = self.value0+sum(1<<x for x in self.extraTable[:index])
554        upper = lower+(1<<self.extraTable[index])
555        return lower, upper-1
556
557#======================Code subclasses======================================
558#Alphabets used in the metablock header----------------------------------
559#For prefix codes
560class PrefixCodeHeader(WithExtra):
561    """Header of prefix codes.
562    """
563    def __init__(self, codename):
564        super().__init__('PFX', bitLength=2)
565        #this is the name of the code that it describes
566        self.codename = codename
567
568    def extraBits(self, index):
569        return 2 if index==1 else 0
570
571    def value(self, index, extra):
572        """Returns ('Simple', #codewords) or ('Complex', HSKIP)
573        """
574        if index==1:
575            if extra>3:
576                raise ValueError('value: extra out of range')
577            return 'Simple', extra+1
578        if extra:
579            raise ValueError('value: extra out of range')
580        return 'Complex', index
581
582    def explanation(self, index, extra):
583        if index==1:
584            return '{} is simple with {} code word{}'.format(
585                self.codename, extra+1, 's' if extra else '')
586        lengths = [1, 2, 3, 4, 0, 5, 17, 6]
587        return '{} is complex with lengths {}...'.format(
588            self.codename,
589            ','.join(
590                map(str, lengths[index:index+5]))
591            )
592
593class TreeShapeAlhabet(BoolCode):
594    """The bit used to indicate if four word code is "deep" or "wide"
595    """
596    name = 'SHAPE'
597    def value(self, index):
598        return [(2,2,2,2), (1,2,3,3)][index]
599
600    def explanation(self, index):
601        return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index))
602
603class LengthOfLengthAlphabet(Code):
604    """For use in decoding complex code descriptors.
605    >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
606    >>> print(lengthOfLengthAlphabet[2])
607    coded with 2 bits
608    >>> len(lengthOfLengthAlphabet[0])
609    2
610    >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
611    [2, 4, 3, 2, 2, 4]
612    >>> lengthOfLengthAlphabet.showCode()
613      00:skipped             01:coded with 4 bits 0111:coded with 1 bits
614      10:coded with 3 bits  011:coded with 2 bits 1111:coded with 5 bits
615    """
616    decodeTable = {
617         0b00:0,     0b10:3,
618       0b0111:1,     0b01:4,
619        0b011:2,   0b1111:5,
620       }
621
622    def __init__(self, name=None, **args):
623        super().__init__(name, decodeTable=self.decodeTable, **args)
624
625    def mnemonic(self, index):
626        if index==0: return 'skipped'
627        return 'coded with {} bits'.format(index)
628
629    def explanation(self, index, extra=None):
630        return self.description+': '+self.mnemonic(index)
631
632class LengthAlphabet(WithExtra):
633    """Length of symbols
634    Used during construction of a code.
635    """
636    def __init__(self, name):
637        super().__init__(name, alphabetSize=18)
638
639    def extraBits(self, index):
640        return {16:2, 17:3}.get(index, 0)
641
642    def mnemonic(self, index):
643        if index==0: return 'unused'
644        elif index==16: return 'rep xx'
645        elif index==17: return 'zero xxx'
646        else: return 'len {}'.format(index)
647
648    def explanation(self, index, extra):
649        return self.description.format(self[index], extra)
650
651    def value(self, index, extra):
652        #the caller got the length already, so extra is enough
653        return extra
654
655#Stream header
656class WindowSizeAlphabet(Code):
657    """The alphabet used for window size in the stream header.
658    >>> WindowSizeAlphabet()[10].explanation()
659    'windowsize=(1<<10)-16=1008'
660    """
661    decodeTable = {
662        0b0100001: 10,   0b1100001: 14,   0b0011: 18,   0b1011: 22,
663        0b0110001: 11,   0b1110001: 15,   0b0101: 19,   0b1101: 23,
664        0b1000001: 12,         0b0: 16,   0b0111: 20,   0b1111: 24,
665        0b1010001: 13,   0b0000001: 17,   0b1001: 21,
666        0b0010001: None,
667        }
668
669    name = 'WSIZE'
670
671    def __init__(self, name=None):
672        super().__init__(name, decodeTable=self.decodeTable)
673
674    def value(self, index):
675        #missing value gives index None
676        if index is None: return None
677        return (1<<index)-16
678
679    def explanation(self, index):
680        return 'windowsize=(1<<{})-16={}'.format(
681            index, (1<<index)-16)
682
683#Metablock
684class MetablockLengthAlphabet(WithExtra):
685    """Used for the meta block length;
686    also indicates a block with no data
687    >>> metablockLengthAlphabet = MetablockLengthAlphabet()
688    >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
689    Symbol(MLEN, 0)
690    'empty'
691    >>> metablockLengthAlphabet[3]
692    Traceback (most recent call last):
693        ...
694    ValueError: No symbol MetablockLengthAlphabet[3]
695    >>> print(metablockLengthAlphabet[4])
696    hhhh00
697    >>> metablockLengthAlphabet[4].value(0x1000)
698    4097
699    >>> metablockLengthAlphabet[5].value(0x1000)
700    Traceback (most recent call last):
701        ...
702    InvalidStream: Zeros in high nibble of MLEN
703    >>> metablockLengthAlphabet[5].explanation(0x12345)
704    'data length: 12345h+1=74566'
705    >>> metablockLengthAlphabet.showCode()
706    00:hhhh00   10:hhhhhh10 01:hhhhh01  11:empty
707    """
708    decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}
709
710    name = 'MLEN'
711    def __init__(self, name=None):
712        super().__init__(name, decodeTable=self.decodeTable)
713
714    def extraBits(self, index):
715        return index*4
716
717    def mnemonic(self, index):
718        if index==0: return 'empty'
719        return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
720
721    def value(self, index, extra):
722        extraBits = self.extraBits(index)
723        if not 0<=extra<1<<extraBits:
724            raise ValueError('value: extra out of range')
725        if index==0: return 0
726        if index>4 and extra>>extraBits-4==0: raise InvalidStream(
727            'Zeros in high nibble of MLEN')
728        return extra+1
729
730    def explanation(self, index, extra):
731        if index==0: return '11: empty block'
732        extraBits = self.extraBits(index)
733        return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1)
734
735
736class ReservedAlphabet(BoolCode):
737    """The reserved bit that must be zero.
738    """
739    name = 'RSVD'
740    def value(self, index):
741        if index: raise ValueError('Reserved bit is not zero')
742
743    def explanation(self, index):
744        return 'Reserved (must be zero)'
745
746class FillerAlphabet(Code):
747    def __init__(self, *, streamPos):
748        super().__init__('SKIP', bitLength=(-streamPos)&7)
749
750    def explanation(self, index):
751        return '{} bit{} ignored'.format(
752            self.length(index),
753            '' if self.length(index)==1 else 's',
754            )
755
756class SkipLengthAlphabet(WithExtra):
757    """Used for the skip length in an empty metablock
758    >>> skipLengthAlphabet = SkipLengthAlphabet()
759    >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
760    Symbol(SKIP, 0)
761    'empty'
762    >>> skipLengthAlphabet[4]
763    Traceback (most recent call last):
764        ...
765    ValueError: index out of range
766    >>> print(skipLengthAlphabet[3])
767    hhhhhh11
768    >>> skipLengthAlphabet[2].value(0x1000)
769    4097
770    >>> skipLengthAlphabet[3].value(0x1000)
771    Traceback (most recent call last):
772        ...
773    InvalidStream: Zeros in high byte of SKIPBYTES
774    >>> skipLengthAlphabet[3].explanation(0x12345)
775    'skip length: 12345h+1=74566'
776    >>> skipLengthAlphabet.showCode()
777    00:empty    01:hh01     10:hhhh10   11:hhhhhh11
778    """
779    def __init__(self):
780        super().__init__('SKIP', bitLength=2)
781
782    def extraBits(self, index):
783        return index*8
784
785    def mnemonic(self, index):
786        if index==0: return 'empty'
787        return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
788
789    def value(self, index, extra):
790        extraBits = self.extraBits(index)
791        if not 0<=extra<1<<extraBits:
792            raise ValueError('value: extra out of range')
793        if index==0: return 0
794        if index>1 and extra>>extraBits-8==0:
795            raise InvalidStream('Zeros in high byte of SKIPBYTES')
796        return extra+1
797
798    def explanation(self, index, extra):
799        if index==0: return '00: no skip'
800        extraBits = self.extraBits(index)
801        return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1)
802
803
804class TypeCountAlphabet(Enumerator):
805    """Used for giving block type counts and tree counts.
806    >>> TypeCountAlphabet(description='').showCode()
807       0:0            0101:xx,0101      1011:xxxxx,1011
808    0001:0001         1101:xxxxxx,1101  0111:xxx,0111
809    1001:xxxx,1001    0011:x,0011       1111:xxxxxxx,1111
810    """
811    decodeTable = {
812             0b0: 0,   0b1001: 5,
813          0b0001: 1,   0b1011: 6,
814          0b0011: 2,   0b1101: 7,
815          0b0101: 3,   0b1111: 8,
816          0b0111: 4,
817          }
818
819    value0 = 1
820    extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
821    name = 'BT#'
822
823    def __init__(self, name=None, *, description):
824        super().__init__(
825            name,
826            decodeTable=self.decodeTable,
827            description=description)
828
829    def mnemonic(self, index):
830        if index==0: return '0'
831        if index==1: return '0001'
832        return 'x'*(self.extraBits(index))+','+self.bitPattern(index)
833
834    def explanation(self, index, extra):
835        value = self.value(index, extra)
836        description = self.description
837        if value==1: description = description[:-1]
838        return '{}: {} {}'.format(
839            self.mnemonic(index),
840            value,
841            description)
842
843class BlockTypeAlphabet(Code):
844    """The block types; this code works for all three kinds.
845    >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
846    >>> print(*(x for x in b))
847    prev +1 #0 #1 #2 #3 #4
848    """
849    def __init__(self, name, NBLTYPES, **args):
850        super().__init__(name, alphabetSize=NBLTYPES+2, **args)
851        self.NBLTYPES = NBLTYPES
852
853    def mnemonic(self, index):
854        if index==0: return 'prev'
855        elif index==1: return '+1'
856        else: return '#'+str(index-2)
857
858    def value(self, index):
859        return index-2
860
861    def explanation(self, index):
862        if index==0: return '0: previous'
863        elif index==1: return '1: increment'
864        else: return 'Set block type to: '+str(index-2)
865
866class BlockCountAlphabet(Enumerator):
867    """Block counts
868    >>> b = BlockCountAlphabet('L')
869    >>> print(b[25])
870    [24*x]: BC16625-16793840
871    """
872
873    value0 = 1
874    extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24]
875    def __init__(self, name, **args):
876        super().__init__(name, alphabetSize=26, **args)
877
878    def mnemonic(self, index):
879        extraBits = self.extraBits(index)
880        return '{}: BC{}-{}'.format(
881            'x'*extraBits if index<5 else '[{}*x]'.format(extraBits),
882            *self.span(index))
883
884    def explanation(self, index, extra):
885        return 'Block count: '+super().explanation(index, extra)
886
887class DistanceParamAlphabet(WithExtra):
888    """The distance parameters NPOSTFIX and NDIRECT.
889    Although these are treated as two in the description, this is easier.
890    """
891    def __init__(self):
892        super().__init__('DIST', bitLength=2)
893
894    def extraBits(self, index):
895        return 4
896
897    def value(self, index, extra):
898        """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
899        """
900        if extra>15:
901            raise ValueError('value: extra out of range')
902        return index, extra<<index
903
904    def explanation(self, index, extra):
905        return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
906            index, extra, index, extra<<index)
907
908    def mnemonic(self, index):
909        return 'PF'+str(index)
910
911class LiteralContextMode(Code):
912    """For the literal context modes.
913    >>> LiteralContextMode().showCode()
914    00:LSB6   01:MSB6   10:UTF8   11:Signed
915    >>> LiteralContextMode().explanation(2)
916    'Context mode for type 9: 2(UTF8)'
917    """
918
919    def __init__(self, *, number=9):
920        super().__init__('LC'+str(number), bitLength=2)
921        self.number = number
922
923    def mnemonic(self, index):
924        return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index]
925
926    def explanation(self, index):
927        return 'Context mode for type {}: {}({})'.format(
928            self.number,
929            index,
930            self.mnemonic(index))
931
932class RLEmaxAlphabet(Enumerator):
933    """Used for describing the run length encoding used for describing context maps.
934    >>> RLEmaxAlphabet().showCode()
935    0:1    1:more
936    """
937    value0 = 0
938    extraTable = [0, 4]
939    name = 'RLE#'
940
941    def mnemonic(self, index):
942        return ['1', 'more'][index]
943
944    def explanation(self, index, extra):
945        description = self.description and self.description+': '
946        if index==0: return description+'No RLE coding'
947        return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
948
949class TreeAlphabet(WithExtra):
950    """The alphabet to enumerate entries (called trees) in the context map.
951    parameters are RLEMAX and NTREES
952    >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
953    >>> len(t)
954    8
955    >>> print(t[2])
956    xx+4 zeroes
957    >>> t[3].explanation(2)
958    '8+010=10 zeroes'
959    >>> t[0].value(0)
960    (1, 0)
961    """
962    name = 'CMI'
963    def __init__(self, name=None, *, RLEMAX, NTREES, **args):
964        super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
965        self.RLEMAX = RLEMAX
966        self.NTREES = NTREES
967
968    def extraBits(self, index):
969        if 0<index<=self.RLEMAX: return index
970        return 0
971
972    def mnemonic(self, index):
973        if index==0: return 'map #0'
974        if index<=self.RLEMAX:
975            return '{}+{} zeroes'.format('x'*index, 1<<index)
976        return 'map #{}'.format(index-self.RLEMAX)
977
978    def value(self, index, extra):
979        """Give count and value."""
980        index = index
981        if index==0: return 1, 0
982        if index<=self.RLEMAX: return (1<<index)+extra, 0
983        return 1, index-self.RLEMAX
984
985    def explanation(self, index, extra):
986        description = self.description and self.description+': '
987        if index==0: return description+'map #0'
988        if index<=self.RLEMAX:
989            return '{}+{:0{}b}={} zeroes'.format(
990                (1<<index),
991                extra, self.extraBits(index),
992                (1<<index)+extra)
993        return '{}map #{}-{}={}'.format(
994            description,
995            index, self.RLEMAX, index-self.RLEMAX)
996
997#Prefix alphabets for the data stream----------------------------------
998class LiteralAlphabet(Code):
999    """Alphabet of symbols.
1000    """
1001    minLength = maxLength = 8
1002    def __init__(self, number):
1003        super().__init__('L'+str(number), alphabetSize=1<<8)
1004
1005    def mnemonic(self, index):
1006        return outputCharFormatter(index)
1007
1008    def value(self, index, extra=None):
1009        return index
1010
1011    def explanation(self, index, extra=None):
1012        return self.mnemonic(index)
1013
1014class InsertLengthAlphabet(Enumerator):
1015    """Intern code for insert counts
1016    """
1017    value0 = 0
1018    extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24]
1019
1020class CopyLengthAlphabet(Enumerator):
1021    value0 = 2
1022    extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24]
1023
1024class InsertAndCopyAlphabet(WithExtra):
1025    """The insert and copy code
1026    >>> for x in range(0,704,704//13):
1027    ...    print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
1028             0 I0C2&D=0
1029        110110 I6+xC8&D=0
1030       1101100 I5C22+xxx&D=0
1031      10100010 I4C4
1032      11011000 I3C10+x
1033     100001110 I14+xxC8
1034     101000100 I10+xxC22+xxx
1035     101111010 I98+xxxxxC14+xx
1036     110110000 I6+xC70+xxxxx
1037     111100110 I1090+[10*x]C8
1038    1000011100 I26+xxxC326+[8*x]
1039    1001010010 I322+[8*x]C14+xx
1040    1010001000 I194+[7*x]C70+xxxxx
1041    1010111110 I22594+[24*x]C1094+[10*x]
1042    """
1043    insertLengthAlphabet = InsertLengthAlphabet(None)
1044    copyLengthAlphabet = CopyLengthAlphabet(None)
1045
1046    def __init__(self, number=''):
1047        super().__init__('IC'+str(number), bitLength=10)
1048
1049    def __len__(self):
1050        return 704
1051
1052    def extraBits(self, index):
1053        insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
1054        return InsertLengthAlphabet.extraTable[insertSymbol.index] + \
1055            CopyLengthAlphabet.extraTable[copySymbol.index]
1056
1057    def splitSymbol(self, index):
1058        """Give relevant values for computations:
1059        (insertSymbol, copySymbol, dist0flag)
1060        """
1061        #determine insert and copy upper bits from table
1062        row = [0,0,1,1,2,2,1,3,2,3,3][index>>6]
1063        col = [0,1,0,1,0,1,2,0,2,1,2][index>>6]
1064        #determine inserts and copy sub codes
1065        insertLengthCode = row<<3 | index>>3&7
1066        if row: insertLengthCode -= 8
1067        copyLengthCode = col<<3 | index&7
1068        return (
1069            Symbol(self.insertLengthAlphabet, insertLengthCode),
1070            Symbol(self.copyLengthAlphabet, copyLengthCode),
1071            row==0
1072            )
1073
1074    def mnemonic(self, index):
1075        """Make a nice mnemonic
1076        """
1077        i,c,d0 = self.splitSymbol(index)
1078        iLower, _ = i.code.span(i.index)
1079        iExtra = i.extraBits()
1080        cLower, _ = c.code.span(c.index)
1081        cExtra = c.extraBits()
1082        return 'I{}{}{}C{}{}{}{}'.format(
1083            iLower,
1084            '+' if iExtra else '',
1085            'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra),
1086            cLower,
1087            '+' if cExtra else '',
1088            'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra),
1089            '&D=0' if d0 else '')
1090
1091    def value(self, index, extra):
1092        i,c,d0 = self.splitSymbol(index)
1093        iExtra = i.extraBits()
1094        ce, ie = extra>>iExtra, extra&(1<<iExtra)-1
1095        insert = i.value(ie)
1096        copy = c.value(ce)
1097        return insert, copy, d0
1098
1099    def explanation(self, index, extra):
1100        insert, copy, d0 = self.value(index, extra)
1101        if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
1102        else: return 'Literal: {}, copy: {}'.format(insert, copy)
1103
1104class DistanceAlphabet(WithExtra):
1105    """Represent the distance encoding.
1106    Dynamically generated alphabet.
1107    This is what the documentation should have said:
1108    Ignoring offsets for the moment, the "long" encoding works as follows:
1109    Write the distance in binary as follows:
1110    1xy..yz..z, then the distance symbol consists of n..nxz..z
1111    Where:
1112    n is one less than number of bits in y
1113    x is a single bit
1114    y..y are n+1 extra bits (encoded in the bit stream)
1115    z..z is NPOSTFIX bits that are part of the symbol
1116    The offsets are so as to start at the lowest useable value:
1117    if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
1118    then n..nxz..z is symbol -NDIRECT-16
1119    >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1120    >>> print(d[4], d[17], d[34])
1121    last-1 1 10xx00-5
1122    >>> [str(d[x]) for x in range(26, 32)]
1123    ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
1124    """
1125    def __init__(self, number, *, NPOSTFIX, NDIRECT):
1126        self.NPOSTFIX = NPOSTFIX
1127        self.NDIRECT = NDIRECT
1128        #set length
1129        #Actually, not all symbols are used,
1130        #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
1131        super().__init__('D'+str(number),
1132            alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX))
1133
1134    def extraBits(self, index):
1135        """Indicate how many extra bits are needed to interpret symbol
1136        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1137        >>> [d[i].extraBits() for i in range(26)]
1138        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1139        >>> [d[i].extraBits() for i in range(26,36)]
1140        [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
1141        """
1142        if index<16+self.NDIRECT: return 0
1143        return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
1144
1145    def value(self, dcode, dextra):
1146        """Decode value of symbol together with the extra bits.
1147        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1148        >>> d[34].value(2)
1149        (0, 35)
1150        """
1151        if dcode<16:
1152            return [(1,0),(2,0),(3,0),(4,0),
1153                    (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3),
1154                    (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3)
1155                ][dcode]
1156        if dcode<16+self.NDIRECT:
1157            return (0,dcode-16)
1158        #we use the original formulas, instead of my clear explanation
1159        POSTFIX_MASK = (1 << self.NPOSTFIX) - 1
1160        ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
1161        hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX
1162        lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK
1163        offset = ((2 + (hcode & 1)) << ndistbits) - 4
1164        distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
1165        return (0,distance)
1166
1167    def mnemonic(self, index, verbose=False):
1168        """Give mnemonic representation of meaning.
1169        verbose compresses strings of x's
1170        """
1171        if index<16:
1172            return ['last', '2last', '3last', '4last',
1173                'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3',
1174                '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3'
1175                ][index]
1176        if index<16+self.NDIRECT:
1177            return str(index-16)
1178        #construct strings like "1xx01-15"
1179        index -= self.NDIRECT+16
1180        hcode = index >> self.NPOSTFIX
1181        lcode = index & (1<<self.NPOSTFIX)-1
1182        if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}'
1183        else: formatString = '1{0}{1}{4:+d}'
1184        return formatString.format(
1185            hcode&1,
1186            'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1),
1187            lcode, self.NPOSTFIX,
1188            self.NDIRECT+1-(4<<self.NPOSTFIX))
1189
1190    def explanation(self, index, extra):
1191        """
1192        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1193        >>> d[55].explanation(13)
1194        '11[1101]01-5: [0]+240'
1195        """
1196        extraBits = self.extraBits(index)
1197        extraString = '[{:0{}b}]'.format(extra, extraBits)
1198        return '{0}: [{1[0]}]{1[1]:+d}'.format(
1199            self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString),
1200            self.value(index, extra))
1201
1202#Classes for doing actual work------------------------------------------
1203class ContextModeKeeper:
1204    """For computing the literal context mode.
1205    You feed it characters, and it computes indices in the context map.
1206    """
1207    def __init__(self, mode):
1208        self.chars = deque([0,0], maxlen=2)
1209        self.mode = mode
1210
1211    def setContextMode(self, mode):
1212        """Switch to given context mode (0..3)"""
1213        self.mode = mode
1214    def getIndex(self):
1215        if self.mode==0:  #LSB6
1216            return self.chars[1]&0x3f
1217        elif self.mode==1: #MSB6
1218            return self.chars[1]>>2
1219        elif self.mode==2: #UTF8: character class of previous and a bit of the second
1220            p2,p1 = self.chars
1221            return self.lut0[p1]|self.lut1[p2]
1222        elif self.mode==3: #Signed: initial bits of last two bytes
1223            p2,p1 = self.chars
1224            return self.lut2[p1]<<3|self.lut2[p2]
1225
1226    def add(self, index):
1227        """Adjust the context for output char (as int)."""
1228        self.chars.append(index)
1229
1230    #0: control     #16: quote  #32: ,:;  #48: AEIOU
1231    #4: tab/lf/cr   #20: %      #36: .    #52: BC..Z
1232    #8: space       #24: (<[{   #40: =    #56: aeiou
1233    #12:!#$&*+-/?@| #28: )>]}   #44: 0-9  #60: bc..z
1234    lut0 = [0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
1235            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1236            8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
1237           44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
1238           12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
1239           52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
1240           12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
1241           60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0
1242           ]+[0,1]*32+[2,3]*32
1243    #0: space  1:punctuation  2:digit/upper 3:lower
1244    lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1245             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1246             0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1247             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1248             1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1249             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
1250             1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1251             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0
1252           ]+[0]*96+[2]*32
1253    #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
1254    lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
1255    assert len(lut0)==len(lut1)==len(lut2)==256
1256
1257class WordList:
1258    """Word list.
1259    >>> WordList().word(7, 35555)
1260    b'Program to '
1261    """
1262    NDBITS = [0,  0,  0,  0, 10, 10, 11, 11, 10, 10,
1263             10, 10, 10,  9,  9,  8,  7,  7,  8,  7,
1264              7,  6,  6,  5,  5]
1265    def __init__(self):
1266        self.file = open('dict', 'rb')
1267        self.compileActions()
1268
1269    def word(self, size, dist):
1270        """Get word
1271        """
1272        #split dist in index and action
1273        ndbits = self.NDBITS[size]
1274        index = dist&(1<<ndbits)-1
1275        action = dist>>ndbits
1276        #compute position in file
1277        position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
1278        self.file.seek(position)
1279        return self.doAction(self.file.read(size), action)
1280
1281    def upperCase1(self, word):
1282        word = word.decode('utf8')
1283        word = word[0].upper()+word[1:]
1284        return word.encode('utf8')
1285
1286
1287    #Super compact form of action table.
1288    #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
1289    actionTable = r"""
1290        0:w        25:w+_for_     50:w+\n\t       75:w+. This_100:w+ize_
1291        1:w+_      26:w[3:]       51:w+:          76:w+,      101:w.U+.
1292        2:_+w+_    27:w[:-2]      52:_+w+._       77:.+w+_    102:\xc2\xa0+w
1293        3:w[1:]    28:w+_a_       53:w+ed_        78:U(w)+(   103:_+w+,
1294        4:U(w)+_   29:w+_that_    54:w[9:]        79:U(w)+.   104:U(w)+="
1295        5:w+_the_  30:_+U(w)      55:w[7:]        80:w+_not_  105:w.U+="
1296        6:_+w      31:w+._        56:w[:-6]       81:_+w+="   106:w+ous_
1297        7:s_+w+_   32:.+w         57:w+(          82:w+er_    107:w.U+,_
1298        8:w+_of_   33:_+w+,_      58:U(w)+,_      83:_+w.U+_  108:U(w)+=\'
1299        9:U(w)     34:w[4:]       59:w[:-8]       84:w+al_    109:_+U(w)+,
1300       10:w+_and_  35:w+_with_    60:w+_at_       85:_+w.U    110:_+w.U+="
1301       11:w[2:]    36:w+\'        61:w+ly_        86:w+=\'    111:_+w.U+,_
1302       12:w[:-1]   37:w+_from_    62:_the_+w+_of_ 87:w.U+"    112:_+w.U+,
1303       13:,_+w+_   38:w+_by_      63:w[:-5]       88:U(w)+._  113:w.U+(
1304       14:w+,_     39:w[5:]       64:w[:-9]       89:_+w+(    114:w.U+._
1305       15:_+U(w)+_ 40:w[6:]       65:_+U(w)+,_    90:w+ful_   115:_+w.U+.
1306       16:w+_in_   41:_the_+w     66:U(w)+"       91:_+U(w)+._116:w.U+=\'
1307       17:w+_to_   42:w[:-4]      67:.+w+(        92:w+ive_   117:_+w.U+._
1308       18:e_+w+_   43:w+. The_    68:w.U+_        93:w+less_  118:_+U(w)+="
1309       19:w+"      44:w.U         69:U(w)+">      94:w.U+\'   119:_+w.U+=\'
1310       20:w+.      45:w+_on_      70:w+="         95:w+est_   120:_+U(w)+=\'
1311       21:w+">     46:w+_as_      71:_+w+.        96:_+U(w)+.
1312       22:w+\n     47:w+_is_      72:.com/+w      97:w.U+">
1313       23:w[:-3]   48:w[:-7]                      98:_+w+=\'
1314       24:w+]      49:w[:-1]+ing_ 74:U(w)+\'      99:U(w)+,
1315        """
1316
1317    def compileActions(self):
1318        """Build the action table from the text above
1319        """
1320        import re
1321        self.actionList = actions = [None]*121
1322        #Action 73, which is too long, looks like this when expanded:
1323        actions[73] = "b' the '+w+b' of the '"
1324        #find out what the columns are
1325        actionLines = self.actionTable.splitlines()
1326        colonPositions = [m.start()
1327            for m in re.finditer(':',actionLines[1])
1328            ]+[100]
1329        columns = [(colonPositions[i]-3,colonPositions[i+1]-3)
1330            for i in range(len(colonPositions)-1)]
1331        for line in self.actionTable.splitlines(keepends=False):
1332            for start,end in columns:
1333                action = line[start:end]
1334                #skip empty actions
1335                if not action or action.isspace(): continue
1336                #chop it up, and check if the colon is properly placed
1337                index, colon, action = action[:3], action[3], action[4:]
1338                assert colon==':'
1339                #remove filler spaces at right
1340                action = action.rstrip()
1341                #replace space symbols
1342                action = action.replace('_', ' ')
1343                wPos = action.index('w')
1344                #add quotes around left string when present
1345                #translation: any pattern from beginning, up to
1346                #(but not including) a + following by a w later on
1347                action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
1348                #add quotes around right string when present
1349                #translation: anything with a w in it, followed by a +
1350                #and a pattern up to the end
1351                #(there is no variable lookbehind assertion,
1352                #so we have to copy the pattern)
1353                action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
1354                #expand shortcut for uppercaseAll
1355                action = action.replace(".U", ".upper()")
1356                #store action
1357                actions[int(index)] = action
1358
1359    def doAction(self, w, action):
1360        """Perform the proper action
1361        """
1362        #set environment for the UpperCaseFirst
1363        U = self.upperCase1
1364        return eval(self.actionList[action], locals())
1365
1366class Layout:
1367    """Class to layout the output.
1368    """
1369    #display width of hexdata+bitdata
1370    width = 25
1371    #general
1372    def __init__(self, stream):
1373        self.stream = stream
1374        self.bitPtr = self.width
1375
1376    def makeHexData(self, pos):
1377        """Produce hex dump of all data containing the bits
1378        from pos to stream.pos
1379        """
1380        firstAddress = pos+7>>3
1381        lastAddress = self.stream.pos+7>>3
1382        return ''.join(map('{:02x} '.format,
1383            self.stream.data[firstAddress:lastAddress]))
1384
1385    def formatBitData(self, pos, width1, width2=0):
1386        """Show formatted bit data:
1387        Bytes are separated by commas
1388        whole bytes are displayed in hex
1389        >>> Layout(olleke).formatBitData(6, 2, 16)
1390        '|00h|2Eh,|00'
1391        >>> Layout(olleke).formatBitData(4, 1, 0)
1392        '1'
1393        """
1394        result = []
1395        #make empty prefix code explicit
1396        if width1==0: result = ['()', ',']
1397        for width in width1, width2:
1398            #skip empty width2
1399            if width==0: continue
1400            #build result backwards in a list
1401            while width>0:
1402                availableBits = 8-(pos&7)
1403                if width<availableBits:
1404                    #read partial byte, beginning nor ending at boundary
1405                    data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
1406                    result.append('{:0{}b}'.format(data, width))
1407                elif availableBits<8:
1408                    #read rest of byte, ending at boundary
1409                    data = self.stream.data[pos>>3] >> (pos&7)
1410                    result.append('|{:0{}b}'.format(data, availableBits))
1411                else:
1412                    #read whole byte (in hex), beginning and ending at boundary
1413                    data = self.stream.data[pos>>3]
1414                    result.append('|{:02X}h'.format(data))
1415                width -= availableBits
1416                pos += availableBits
1417            #if width overshot from the availableBits subtraction, fix it
1418            pos += width
1419            #add comma to separate fields
1420            result.append(',')
1421        #concatenate pieces, reversed, skipping the last space
1422        return ''.join(result[-2::-1])
1423
1424    def readPrefixCode(self, alphabet):
1425        """give alphabet the prefix code that is read from the stream
1426        Called for the following alphabets, in this order:
1427        The alphabet in question must have a "logical" order,
1428        otherwise the assignment of symbols doesn't work.
1429        """
1430        mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name))
1431        if mode=='Complex':
1432            #for a complex code, numberOfSymbols means hskip
1433            self.readComplexCode(numberOfSymbols, alphabet)
1434            return alphabet
1435        else:
1436            table = []
1437            #Set table of lengths for mnemonic function
1438            lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1]
1439            #adjust mnemonic function of alphabet class
1440            def myMnemonic(index):
1441                return '{} bit{}: {}'.format(
1442                    lengths[i],
1443                    '' if lengths[i]==1 else 's',
1444                    alphabet.__class__.mnemonic(alphabet, index)
1445                    )
1446            alphabet.mnemonic = myMnemonic
1447            for i in range(numberOfSymbols):
1448                table.append(self.verboseRead(alphabet, skipExtra=True).index)
1449            #restore mnemonic
1450            del alphabet.mnemonic
1451            if numberOfSymbols==4:
1452                #read tree shape to redefine lengths
1453                lengths = self.verboseRead(TreeShapeAlhabet())
1454            #construct the alphabet prefix code
1455            alphabet.setLength(dict(zip(table, lengths)))
1456        return alphabet
1457
1458    def readComplexCode(self, hskip, alphabet):
1459        """Read complex code"""
1460        stream = self.stream
1461        #read the lengths for the length code
1462        lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
1463        codeLengths = {}
1464        total = 0
1465        lol = LengthOfLengthAlphabet('##'+alphabet.name)
1466        #lengthCode will be used for coding the lengths of the new code
1467        #we use it for display until now; definition comes below
1468        lengthCode = LengthAlphabet('#'+alphabet.name)
1469        lengthIter = iter(lengths)
1470        while total<32:
1471            newSymbol = next(lengthIter)
1472            lol.description = str(lengthCode[newSymbol])
1473            length = self.verboseRead(lol)
1474            if length:
1475                codeLengths[newSymbol] = length
1476                total += 32>>length
1477            if total>=32:
1478                break
1479        if total>32: raise ValueError("Stream format")
1480        #Now set the encoding of the lengthCode
1481        lengthCode.setLength(codeLengths)
1482        print("***** Lengths for {} will be coded as:".format(alphabet.name))
1483        lengthCode.showCode()
1484        #Now determine the symbol lengths with the lengthCode
1485        symbolLengths = {}
1486        total = 0
1487        lastLength = 8
1488        alphabetIter = iter(alphabet)
1489        while total<32768:
1490            #look ahead to see what is going to happen
1491            length = lengthCode.decodePeek(
1492                self.stream.peek(lengthCode.maxLength))[1].index
1493            #in every branch, set lengthCode.description to explanatory text
1494            #lengthCode calls format(symbol, extra) with this string
1495            if length==0:
1496                symbol = next(alphabetIter)
1497                lengthCode.description = 'symbol {} unused'.format(symbol)
1498                self.verboseRead(lengthCode)
1499                #unused symbol
1500                continue
1501            if length==16:
1502                lengthCode.description = \
1503                    '{1}+3 symbols of length '+str(lastLength)
1504                extra = self.verboseRead(lengthCode)
1505                #scan series of 16s (repeat counts)
1506                #start with repeat count 2
1507                repeat = 2
1508                startSymbol = next(alphabetIter)
1509                endSymbol = next(alphabetIter)
1510                symbolLengths[startSymbol.index] = \
1511                    symbolLengths[endSymbol.index] = lastLength
1512                #count the two just defined symbols
1513                total += 2*32768>>lastLength
1514                #note: loop may end because we're there
1515                #even if a 16 _appears_ to follow
1516                while True:
1517                    #determine last symbol
1518                    oldRepeat = repeat
1519                    repeat = (repeat-2<<2)+extra+3
1520                    #read as many symbols as repeat increased
1521                    for i in range(oldRepeat, repeat):
1522                        endSymbol = next(alphabetIter)
1523                        symbolLengths[endSymbol.index] = lastLength
1524                    #compute new total; it may be end of loop
1525                    total += (repeat-oldRepeat)*32768>>lastLength
1526                    if total>=32768: break
1527                    #see if there is more to do
1528                    length = lengthCode.decodePeek(
1529                        self.stream.peek(lengthCode.maxLength))[1].index
1530                    if length!=16: break
1531                    lengthCode.description = 'total {}+{{1}} symbols'.format(
1532                        (repeat-2<<2)+3)
1533                    extra = self.verboseRead(lengthCode)
1534            elif length==17:
1535                #read, and show explanation
1536                lengthCode.description = '{1}+3 unused'
1537                extra = self.verboseRead(lengthCode)
1538                #scan series of 17s (groups of zero counts)
1539                #start with repeat count 2
1540                repeat = 2
1541                startSymbol = next(alphabetIter)
1542                endSymbol = next(alphabetIter)
1543                #note: loop will not end with total==32768,
1544                #since total doesn't change here
1545                while True:
1546                    #determine last symbol
1547                    oldRepeat = repeat
1548                    repeat = (repeat-2<<3)+extra+3
1549                    #read as many symbols as repeat increases
1550                    for i in range(repeat-oldRepeat):
1551                        endSymbol = next(alphabetIter)
1552                    #see if there is more to do
1553                    length = lengthCode.decodePeek(
1554                        self.stream.peek(lengthCode.maxLength))[1].index
1555                    if length!=17: break
1556                    lengthCode.description = 'total {}+{{1}} unused'.format(
1557                        (repeat-2<<3)+3)
1558                    extra = self.verboseRead(lengthCode)
1559            else:
1560                symbol = next(alphabetIter)
1561                #double braces for format
1562                char = str(symbol)
1563                if char in '{}': char *= 2
1564                lengthCode.description = \
1565                    'Length for {} is {{0.index}} bits'.format(char)
1566                #output is not needed (will be 0)
1567                self.verboseRead(lengthCode)
1568                symbolLengths[symbol.index] = length
1569                total += 32768>>length
1570                lastLength = length
1571        assert total==32768
1572        alphabet.setLength(symbolLengths)
1573        print('End of table. Prefix code '+alphabet.name+':')
1574        alphabet.showCode()
1575
1576    #stream
1577    def processStream(self):
1578        """Process a brotli stream.
1579        """
1580        print('addr  hex{:{}s}binary context explanation'.format(
1581            '', self.width-10))
1582        print('Stream header'.center(60, '-'))
1583        self.windowSize = self.verboseRead(WindowSizeAlphabet())
1584        print('Metablock header'.center(60, '='))
1585        self.ISLAST = False
1586        self.output = bytearray()
1587        while not self.ISLAST:
1588            self.ISLAST = self.verboseRead(
1589                BoolCode('LAST', description="Last block"))
1590            if self.ISLAST:
1591                if self.verboseRead(
1592                    BoolCode('EMPTY', description="Empty block")): break
1593            if self.metablockLength(): continue
1594            if not self.ISLAST and self.uncompressed(): continue
1595            print('Block type descriptors'.center(60, '-'))
1596            self.numberOfBlockTypes = {}
1597            self.currentBlockCounts = {}
1598            self.blockTypeCodes = {}
1599            self.blockCountCodes = {}
1600            for blockType in (L,I,D): self.blockType(blockType)
1601            print('Distance code parameters'.center(60, '-'))
1602            self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
1603            self.readLiteralContextModes()
1604            print('Context maps'.center(60, '-'))
1605            self.cmaps = {}
1606            #keep the number of each kind of prefix tree for the last loop
1607            numberOfTrees = {I: self.numberOfBlockTypes[I]}
1608            for blockType in (L,D):
1609                numberOfTrees[blockType] = self.contextMap(blockType)
1610            print('Prefix code lists'.center(60, '-'))
1611            self.prefixCodes = {}
1612            for blockType in (L,I,D):
1613                self.readPrefixArray(blockType, numberOfTrees[blockType])
1614            self.metablock()
1615
1616    #metablock header
1617    def verboseRead(self, alphabet, context='', skipExtra=False):
1618        """Read symbol and extra from stream and explain what happens.
1619        Returns the value of the symbol
1620        >>> olleke.pos = 0
1621        >>> l = Layout(olleke)
1622        >>> l.verboseRead(WindowSizeAlphabet())
1623        0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
1624        4194288
1625        """
1626        #TODO 2: verbosity level, e.g. show only codes and maps in header
1627        stream = self.stream
1628        pos = stream.pos
1629        if skipExtra:
1630            length, symbol = alphabet.readTuple(stream)
1631            extraBits, extra = 0, None
1632        else:
1633            length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
1634                stream)
1635        #fields: address, hex data, binary data, name of alphabet, explanation
1636        hexdata = self.makeHexData(pos)
1637        addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
1638        bitdata = self.formatBitData(pos, length, extraBits)
1639        #bitPtr moves bitdata so that the bytes are easier to read
1640        #jump back to right if a new byte starts
1641        if '|' in bitdata[1:]:
1642            #start over on the right side
1643            self.bitPtr = self.width
1644        fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
1645        if fillWidth<0: fillWidth = 0
1646        print('{:<5s} {:<{}s} {:7s} {}'.format(
1647            addressField,
1648            hexdata+' '*fillWidth+bitdata, self.width,
1649            context+alphabet.name,
1650            symbol if skipExtra else symbol.explanation(extra),
1651            ))
1652        #jump to the right if we started with a '|'
1653        #because we didn't jump before printing
1654        if bitdata.startswith('|'): self.bitPtr = self.width
1655        else: self.bitPtr -= len(bitdata)
1656        return symbol if skipExtra else symbol.value(extra)
1657
1658    def metablockLength(self):
1659        """Read MNIBBLES and meta block length;
1660        if empty block, skip block and return true.
1661        """
1662        self.MLEN = self.verboseRead(MetablockLengthAlphabet())
1663        if self.MLEN:
1664            return False
1665        #empty block; skip and return False
1666        self.verboseRead(ReservedAlphabet())
1667        MSKIP = self.verboseRead(SkipLengthAlphabet())
1668        self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
1669        self.stream.pos += 8*MSKIP
1670        print("Skipping to {:x}".format(self.stream.pos>>3))
1671        return True
1672
1673    def uncompressed(self):
1674        """If true, handle uncompressed data
1675        """
1676        ISUNCOMPRESSED = self.verboseRead(
1677            BoolCode('UNCMPR', description='Is uncompressed?'))
1678        if ISUNCOMPRESSED:
1679            self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
1680            print('Uncompressed data:')
1681            self.output += self.stream.readBytes(self.MLEN)
1682            print(outputFormatter(self.output[-self.MLEN:]))
1683        return ISUNCOMPRESSED
1684
1685    def blockType(self, kind):
1686        """Read block type switch descriptor for given kind of blockType."""
1687        NBLTYPES = self.verboseRead(TypeCountAlphabet(
1688            'BT#'+kind[0].upper(),
1689            description='{} block types'.format(kind),
1690            ))
1691        self.numberOfBlockTypes[kind] = NBLTYPES
1692        if NBLTYPES>=2:
1693            self.blockTypeCodes[kind] = self.readPrefixCode(
1694                BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES))
1695            self.blockCountCodes[kind] = self.readPrefixCode(
1696                BlockCountAlphabet('BC'+kind[0].upper()))
1697            blockCount = self.verboseRead(self.blockCountCodes[kind])
1698        else:
1699            blockCount = 1<<24
1700        self.currentBlockCounts[kind] = blockCount
1701
1702    def readLiteralContextModes(self):
1703        """Read literal context modes.
1704        LSB6: lower 6 bits of last char
1705        MSB6: upper 6 bits of last char
1706        UTF8: rougly dependent on categories:
1707            upper 4 bits depend on category of last char:
1708                control/whitespace/space/ punctuation/quote/%/open/close/
1709                comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
1710            lower 2 bits depend on category of 2nd last char:
1711                space/punctuation/digit or upper/lowercase
1712        signed: hamming weight of last 2 chars
1713        """
1714        print('Context modes'.center(60, '-'))
1715        self.literalContextModes = []
1716        for i in range(self.numberOfBlockTypes[L]):
1717            self.literalContextModes.append(
1718                self.verboseRead(LiteralContextMode(number=i)))
1719
1720    def contextMap(self, kind):
1721        """Read context maps
1722        Returns the number of differnt values on the context map
1723        (In other words, the number of prefix trees)
1724        """
1725        NTREES = self.verboseRead(TypeCountAlphabet(
1726            kind[0].upper()+'T#',
1727            description='{} prefix trees'.format(kind)))
1728        mapSize = {L:64, D:4}[kind]
1729        if NTREES<2:
1730            self.cmaps[kind] = [0]*mapSize
1731        else:
1732            #read CMAPkind
1733            RLEMAX = self.verboseRead(RLEmaxAlphabet(
1734                'RLE#'+kind[0].upper(),
1735                description=kind+' context map'))
1736            alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
1737            cmapCode = self.readPrefixCode(alphabet)
1738            tableSize = mapSize*self.numberOfBlockTypes[kind]
1739            cmap = []
1740            while len(cmap)<tableSize:
1741                cmapCode.description = 'map {}, entry {}'.format(
1742                    *divmod(len(cmap), mapSize))
1743                count, value = self.verboseRead(cmapCode)
1744                cmap.extend([value]*count)
1745            assert len(cmap)==tableSize
1746            IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
1747            if IMTF:
1748                self.IMTF(cmap)
1749            if kind==L:
1750                print('Context maps for literal data:')
1751                for i in range(0, len(cmap), 64):
1752                    print(*(
1753                        ''.join(map(str, cmap[j:j+8]))
1754                        for j in range(i, i+64, 8)
1755                        ))
1756            else:
1757                print('Context map for distances:')
1758                print(*(
1759                    ''.join(map('{:x}'.format, cmap[i:i+4]))
1760                    for i in range(0, len(cmap), 4)
1761                    ))
1762            self.cmaps[kind] = cmap
1763        return NTREES
1764
1765    @staticmethod
1766    def IMTF(v):
1767        """In place inverse move to front transform.
1768        """
1769        #mtf is initialized virtually with range(infinity)
1770        mtf = []
1771        for i, vi in enumerate(v):
1772            #get old value from mtf. If never seen, take virtual value
1773            try: value = mtf.pop(vi)
1774            except IndexError: value = vi
1775            #put value at front
1776            mtf.insert(0, value)
1777            #replace transformed value
1778            v[i] = value
1779
1780    def readPrefixArray(self, kind, numberOfTrees):
1781        """Read prefix code array"""
1782        prefixes = []
1783        for i in range(numberOfTrees):
1784            if kind==L: alphabet = LiteralAlphabet(i)
1785            elif kind==I: alphabet = InsertAndCopyAlphabet(i)
1786            elif kind==D: alphabet = DistanceAlphabet(
1787                i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
1788            self.readPrefixCode(alphabet)
1789            prefixes.append(alphabet)
1790        self.prefixCodes[kind] = prefixes
1791
1792    #metablock data
1793    def metablock(self):
1794        """Process the data.
1795        Relevant variables of self:
1796        numberOfBlockTypes[kind]: number of block types
1797        currentBlockTypes[kind]: current block types (=0)
1798        literalContextModes: the context modes for the literal block types
1799        currentBlockCounts[kind]: counters for block types
1800        blockTypeCodes[kind]: code for block type
1801        blockCountCodes[kind]: code for block count
1802        cmaps[kind]: the context maps (not for I)
1803        prefixCodes[kind][#]: the prefix codes
1804        lastDistances: the last four distances
1805        lastChars: the last two chars
1806        output: the result
1807        """
1808        print('Meta block contents'.center(60, '='))
1809        self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
1810        self.lastDistances = deque([17,16,11,4], maxlen=4)
1811        #the current context mode is for block type 0
1812        self.contextMode = ContextModeKeeper(self.literalContextModes[0])
1813        wordList = WordList()
1814
1815        #setup distance callback function
1816        def distanceCallback(symbol, extra):
1817            "callback function for displaying decoded distance"
1818            index, offset = symbol.value(extra)
1819            if index:
1820                #recent distance
1821                distance = self.lastDistances[-index]+offset
1822                return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
1823            #absolute value
1824            if offset<=maxDistance:
1825                return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset)
1826            #word list value
1827            action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen])
1828            return '{}-{} gives word {},{} action {}'.format(
1829                offset, maxDistance, copyLen, word, action)
1830        for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback
1831
1832        blockLen = 0
1833        #there we go
1834        while blockLen<self.MLEN:
1835            #get insert&copy command
1836            litLen, copyLen, dist0Flag = self.verboseRead(
1837                self.prefixCodes[I][
1838                    self.figureBlockType(I)])
1839            #literal data
1840            for i in range(litLen):
1841                bt = self.figureBlockType(L)
1842                cm = self.contextMode.getIndex()
1843                ct = self.cmaps[L][bt<<6|cm]
1844                char = self.verboseRead(
1845                    self.prefixCodes[L][ct],
1846                    context='{},{}='.format(bt,cm))
1847                self.contextMode.add(char)
1848                self.output.append(char)
1849            blockLen += litLen
1850            #check if we're done
1851            if blockLen>=self.MLEN: return
1852            #distance
1853            #distances are computed relative to output length, at most window size
1854            maxDistance = min(len(self.output), self.windowSize)
1855            if dist0Flag:
1856                distance = self.lastDistances[-1]
1857            else:
1858                bt = self.figureBlockType(D)
1859                cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
1860                ct = self.cmaps[D][bt<<2|cm]
1861                index, offset = self.verboseRead(
1862                    self.prefixCodes[D][ct],
1863                    context='{},{}='.format(bt,cm))
1864                distance = self.lastDistances[-index]+offset if index else offset
1865                if index==1 and offset==0:
1866                    #to make sure distance is not put in last distance list
1867                    dist0Flag = True
1868            if distance<=maxDistance:
1869                #copy from output
1870                for i in range(
1871                        maxDistance-distance,
1872                        maxDistance-distance+copyLen):
1873                    self.output.append(self.output[i])
1874                if not dist0Flag: self.lastDistances.append(distance)
1875                comment = 'Seen before'
1876            else:
1877                #fetch from wordlist
1878                newWord = wordList.word(copyLen, distance-maxDistance-1)
1879                self.output.extend(newWord)
1880                #adjust copyLen to reflect actual new data
1881                copyLen = len(newWord)
1882                comment = 'From wordlist'
1883            blockLen += copyLen
1884            print(' '*40,
1885                comment,
1886                ': "',
1887                outputFormatter(self.output[-copyLen:]),
1888                '"',
1889                sep='')
1890            self.contextMode.add(self.output[-2])
1891            self.contextMode.add(self.output[-1])
1892
1893    def figureBlockType(self, kind):
1894        counts, types = self.currentBlockCounts, self.currentBlockTypes
1895        if counts[kind]==0:
1896            newType = self.verboseRead(self.blockTypeCodes[kind])
1897            if newType==-2: newType = types['P'+kind]
1898            elif newType==-1:
1899                newType = (types[kind]+1)%self.numberOfBlockTypes[kind]
1900            types['P'+kind] = types[kind]
1901            types[kind] = newType
1902            counts[kind] = self.verboseRead(self.blockCountCodes[kind])
1903        counts[kind] -=1
1904        return types[kind]
1905
1906__test__ = {
1907'BitStream': """
1908    >>> bs = BitStream(b'Jurjen')
1909    >>> bs.readBytes(2)
1910    b'Ju'
1911    >>> bs.read(6) #r=01110010
1912    50
1913    >>> bs
1914    BitStream(pos=2:6)
1915    >>> bs.peek(5)  #j=01101010
1916    9
1917    >>> bs.readBytes(2)
1918    Traceback (most recent call last):
1919        ...
1920    ValueError: readBytes: need byte boundary
1921    """,
1922
1923'Symbol': """
1924    >>> a=Symbol(MetablockLengthAlphabet(),5)
1925    >>> len(a)
1926    2
1927    >>> int(a)
1928    5
1929    >>> a.bitPattern()
1930    '01'
1931    >>> a.value(200000)
1932    200001
1933    >>> a.explanation(300000)
1934    'data length: 493e0h+1=300001'
1935    """,
1936
1937'RangeDecoder': """
1938    >>> a=RangeDecoder(bitLength=3)
1939    >>> len(a)
1940    8
1941    >>> a.name='t'
1942    >>> list(a)
1943    [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
1944    >>> a[2]
1945    Symbol(t, 2)
1946    >>> a.bitPattern(4)
1947    '100'
1948    >>> a.length(2)
1949    3
1950    >>> a.decodePeek(15)
1951    (3, Symbol(t, 7))
1952    >>>
1953
1954    """,
1955
1956'PrefixDecoder': """
1957    >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
1958    >>> len(a)
1959    4
1960    >>> a.name='t'
1961    >>> list(a)
1962    [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
1963    >>> a.decodePeek(22)
1964    (1, Symbol(t, 1))
1965    >>> a.decodePeek(27)
1966    (3, Symbol(t, 3))
1967    >>> a.length(1)
1968    1
1969    >>> a.length(4)
1970    3
1971    """,
1972
1973'Code': """
1974    >>> a=Code('t',alphabetSize=10)
1975    >>> len(a)
1976    10
1977    >>> a.showCode()
1978    0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
1979    >>> a.setLength({2:1,3:2,5:3,6:3})
1980    >>> a.showCode()
1981      0:2  01:3 011:5 111:6
1982    >>> len(a)
1983    4
1984    >>> def callback(i): return 'call{}back'.format(i)
1985    >>> a=Code('t',callback=callback,bitLength=3)
1986    >>> a[6].explanation()
1987    'call6back'
1988    """,
1989
1990'WithExtra': """
1991    >>> class A(WithExtra):
1992    ...    extraTable = [0,1,1,2,2]
1993    >>> a=A('t',alphabetSize=5)
1994    >>> a[1]
1995    Symbol(t, 1)
1996    >>> a.extraBits(2)
1997    1
1998    >>> a.mnemonic(4)
1999    '4'
2000    >>> a.readTupleAndExtra(BitStream(b'\x5b'))
2001    (3, Symbol(t, 3), 2, 3)
2002    """,
2003
2004'BoolCode': """
2005    >>> BoolCode('test')[0].explanation()
2006    '0: False'
2007    """,
2008
2009'Enumerator': """
2010    >>> class A(Enumerator):
2011    ...    extraTable = [0,1,1,2,2]
2012    ...    value0=3
2013    >>> a=A(alphabetLength=5)
2014    >>> a.value(3)
2015    Traceback (most recent call last):
2016        ...
2017    TypeError: value() missing 1 required positional argument: 'extra'
2018    >>> a.explanation(3,4)
2019    'xx 011: 8-11; 8+4=12'
2020    """,
2021
2022'WindowSizeAlphabet': """
2023    >>> windowSizeAlphabet = WindowSizeAlphabet()
2024    >>> windowSizeAlphabet[0]
2025    Traceback (most recent call last):
2026        ...
2027    ValueError: No symbol WindowSizeAlphabet[0]
2028    >>> len(windowSizeAlphabet)
2029    16
2030    >>> windowSizeAlphabet[21]
2031    Symbol(WSIZE, 21)
2032    >>> windowSizeAlphabet[21].bitPattern()
2033    '1001'
2034    >>> windowSizeAlphabet[21].extraBits()
2035    0
2036    >>> windowSizeAlphabet[21].index
2037    21
2038    >>> windowSizeAlphabet[10].value()
2039    1008
2040    >>> windowSizeAlphabet[10].explanation()
2041    'windowsize=(1<<10)-16=1008'
2042    >>> windowSizeAlphabet.showCode()
2043          0:65520    1100001:16368    1110001:32752       0011:262128
2044    0000001:131056   0010001:None        1001:2097136     1011:4194288
2045    1000001:4080     1010001:8176        0101:524272      0111:1048560
2046    0100001:1008     0110001:2032        1101:8388592     1111:16777200
2047    """,
2048
2049'TypeCountAlphabet': """
2050    >>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
2051    >>> len(typeCountAlphabet)
2052    9
2053    >>> typeCountAlphabet[3]
2054    Symbol(BT#, 3)
2055    >>> typeCountAlphabet[9]
2056    Traceback (most recent call last):
2057        ...
2058    ValueError: No symbol TypeCountAlphabet[9]
2059    >>> print(typeCountAlphabet[3])
2060    xx,0101
2061    >>> typeCountAlphabet[8].value(127)
2062    256
2063    >>> typeCountAlphabet[4].explanation(2)
2064    'xxx,0111: 11 bananas'
2065    >>> typeCountAlphabet[0].explanation()
2066    '0: 1 banana'
2067    """,
2068
2069'DistanceParamAlphabet': """
2070    >>> dpa = DistanceParamAlphabet()
2071    >>> dpa.showCode()
2072    00:PF0 01:PF1 10:PF2 11:PF3
2073    >>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
2074    (2, Symbol(DIST, 1), 4, 10)
2075    >>> dpa.explanation(2, 5)
2076    '2 postfix bits and 0101<<2=20 direct codes'
2077    """,
2078
2079'LiteralAlphabet': """
2080    >>> LiteralAlphabet(-1).showCode()   #doctest: +ELLIPSIS
2081    00000000:\\x00 00110100:4    01101000:h    10011100:\\x9c 11010000:\\xd0
2082    00000001:\\x01 00110101:5    01101001:i    10011101:\\x9d 11010001:\\xd1
2083    00000010:\\x02 00110110:6    01101010:j    10011110:\\x9e 11010010:\\xd2
2084    ...
2085    00101111:/    01100011:c    10010111:\\x97 11001011:\\xcb 11111111:\\xff
2086    00110000:0    01100100:d    10011000:\\x98 11001100:\\xcc
2087    00110001:1    01100101:e    10011001:\\x99 11001101:\\xcd
2088    00110010:2    01100110:f    10011010:\\x9a 11001110:\\xce
2089    00110011:3    01100111:g    10011011:\\x9b 11001111:\\xcf
2090    """,
2091
2092'BlockCountAlphabet': """
2093    >>> bc=BlockCountAlphabet('BCL')
2094    >>> len(bc)
2095    26
2096    >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
2097    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2098    'Block count: xx 00000: 1-4; 1+2=3'
2099    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2100    'Block count: xxx 00110: 33-40; 33+0=33'
2101    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2102    'Block count: xxxxxx 10001: 305-368; 305+28=333'
2103    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2104    'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
2105    """,
2106
2107'Layout': """
2108    >>> olleke.pos = 0
2109    >>> l = Layout(olleke)
2110    >>> l.verboseRead(WindowSizeAlphabet())
2111    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2112    4194288
2113    >>> l.verboseRead(BoolCode('LAST', description="Last block"))
2114                              1     LAST    Last block: 1: True
2115    True
2116    >>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
2117                             0      EMPTY   Empty block: 0: False
2118    False
2119    >>> l.verboseRead(MetablockLengthAlphabet())
2120    0001  2e 00        |00h|2Eh,|00 MLEN    data length: 002eh+1=47
2121    47
2122    >>> olleke.pos = 76
2123    >>> l = Layout(olleke)
2124    >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
2125    000a  82                10|1100 D0      10[15*x]-3
2126    >>> x.explanation(0x86a3)
2127    '10[1000011010100011]-3: [0]+100000'
2128    """,
2129
2130'olleke': """
2131    >>> olleke.pos = 0
2132    >>> try: Layout(olleke).processStream()
2133    ... except NotImplementedError: pass
2134    ... #doctest: +REPORT_NDIFF
2135    addr  hex               binary context explanation
2136    -----------------------Stream header------------------------
2137    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2138    ======================Metablock header======================
2139                              1     LAST    Last block: 1: True
2140                             0      EMPTY   Empty block: 0: False
2141    0001  2e 00        |00h|2Eh,|00 MLEN    data length: 002eh+1=47
2142    -------------------Block type descriptors-------------------
2143    0003  00                      0 BT#L    0: 1 literal block type
2144                                 0  BT#I    0: 1 insert&copy block type
2145                                0   BT#D    0: 1 distance block type
2146    ------------------Distance code parameters------------------
2147    0004  44               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
2148    -----------------------Context modes------------------------
2149                         10         LC0     Context mode for type 0: 2(UTF8)
2150    ------------------------Context maps------------------------
2151                        0           LT#     0: 1 literal prefix tree
2152                       0            DT#     0: 1 distance prefix tree
2153    ---------------------Prefix code lists----------------------
2154                     10             PFX     L0 is complex with lengths 3,4,0,5,17...
2155    0005  4f                    1|0 ##L0    len 3: coded with 3 bits
2156                            0111    ##L0    len 4: coded with 1 bits
2157                          10        ##L0    unused: coded with 3 bits
2158    0006  d6                    0|0 ##L0    len 5: skipped
2159                             011    ##L0    zero xxx: coded with 2 bits
2160    ***** Lengths for L0 will be coded as:
2161      0:len 4     01:zero xxx 011:unused   111:len 3
2162    0007  95                1|11,01 #L0     7+3 unused
2163                           0        #L0     Length for \\n is 4 bits
2164                     001,01         #L0     1+3 unused
2165    0008  44                010,0|1 #L0     total 19+2 unused
2166                           0        #L0     Length for " " is 4 bits
2167                          0         #L0     Length for ! is 4 bits
2168    0009  cb                011,|01 #L0     3+3 unused
2169                     |110,01        #L0     total 35+6 unused
2170    000a  82                      0 #L0     Length for K is 4 bits
2171                            000,01  #L0     0+3 unused
2172                           0        #L0     Length for O is 4 bits
2173    000b  4d                   01|1 #L0     symbol P unused
2174                            011     #L0     symbol Q unused
2175                           0        #L0     Length for R is 4 bits
2176    000c  88                000,|01 #L0     0+3 unused
2177                     |100,01        #L0     total 11+4 unused
2178    000d  b6                      0 #L0     Length for b is 4 bits
2179                               011  #L0     symbol c unused
2180                            011     #L0     symbol d unused
2181    000e  27                   11|1 #L0     Length for e is 3 bits
2182                         010,01     #L0     2+3 unused
2183                       |0           #L0     Length for k is 4 bits
2184    000f  1f                    111 #L0     Length for l is 3 bits
2185                             011    #L0     symbol m unused
2186                            0       #L0     Length for n is 4 bits
2187                          |0        #L0     Length for o is 4 bits
2188    0010  c1                 000,01 #L0     0+3 unused
2189                            0       #L0     Length for s is 4 bits
2190    0011  b4                   0|11 #L0     symbol t unused
2191                              0     #L0     Length for u is 4 bits
2192    End of table. Prefix code L0:
2193     000:e   0010:\\n  0110:!   0001:O   0101:b   0011:n   0111:s
2194     100:l   1010:" " 1110:K   1001:R   1101:k   1011:o   1111:u
2195                         11,01      PFX     IC0 is simple with 4 code words
2196    0012  2a                |2Ah|10 IC0     ? bits: I5C4
2197    0013  b5 ec              00|B5h IC0     ? bits: I6+xC7
2198    0015  22            0010|111011 IC0     ? bits: I8+xC5
2199    0016  8c            001100|0010 IC0     ? bits: I0C14+xx
2200                       0            SHAPE   False: lengths 2,2,2,2
2201    0017  74                 10,0|1 PFX     D0 is simple with 3 code words
2202    0018  a6                0|01110 D0      1 bit: 2last-3
2203                      010011        D0      2 bits: 11xx-3
2204    0019  aa                01010|1 D0      2 bits: 11xxx-3
2205    ====================Meta block contents=====================
2206                       |1,01        IC0     Literal: 9, copy: 5
2207    001a  41                   0001 0,0=L0  O
2208                            100     0,48=L0 l
2209    001b  a2                   10|0 0,62=L0 l
2210                            000     0,63=L0 e
2211    001c  a1                  1|101 0,59=L0 k
2212                           000      0,63=L0 e
2213                      |1010         0,59=L0 " "
2214    001d  b5                   0101 0,11=L0 b
2215                          |1011     0,60=L0 o
2216    001e  24                      0 0,3=D0  Distance: 2last-3=8
2217                                            Seen before: "lleke"
2218                              0,10  IC0     Literal: 6, copy: 7
2219                         |0010      0,59=L0 \\n
2220    001f  89                   1001 0,7=L0  R
2221                            000     0,52=L0 e
2222    0020  fa                  010|1 0,58=L0 b
2223                          1111      0,63=L0 u
2224    0021  eb                  011|1 0,59=L0 s
2225                         11,01      0,3=D0  Absolute value: 12 (pos 8)
2226                                            Seen before: "olleke\\n"
2227    0022  db                 01,1|1 IC0     Literal: 0, copy: 15
2228                      |110,11       0,3=D0  Absolute value: 27 (pos 0)
2229                                            Seen before: "Olleke bolleke\\n"
2230    0023  f8                     00 IC0     Literal: 5, copy: 4
2231                             1110   0,7=L0  K
2232    0024  2c                  00|11 0,52=L0 n
2233                          1011      0,62=L0 o
2234    0025  0d                   1|00 0,59=L0 l
2235                           0110     0,63=L0 !
2236    """,
2237
2238'file': """
2239    >>> try: Layout(BitStream(
2240    ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb')
2241    ...     .read())).processStream()
2242    ... except NotImplementedError: pass
2243    addr  hex               binary context explanation
2244    -----------------------Stream header------------------------
2245    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2246    ======================Metablock header======================
2247                              1     LAST    Last block: 1: True
2248                             0      EMPTY   Empty block: 0: False
2249    0001  13 00        |00h|13h,|00 MLEN    data length: 0013h+1=20
2250    -------------------Block type descriptors-------------------
2251    0003  00                      0 BT#L    0: 1 literal block type
2252                                 0  BT#I    0: 1 insert&copy block type
2253                                0   BT#D    0: 1 distance block type
2254    ------------------Distance code parameters------------------
2255    0004  a4               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
2256    -----------------------Context modes------------------------
2257                         10         LC0     Context mode for type 0: 2(UTF8)
2258    ------------------------Context maps------------------------
2259                        0           LT#     0: 1 literal prefix tree
2260                       0            DT#     0: 1 distance prefix tree
2261    ---------------------Prefix code lists----------------------
2262    0005  b0                 0|1,01 PFX     L0 is simple with 2 code words
2263    0006  b2              0|1011000 L0      1 bit: X
2264    0007  ea              0|1011001 L0      1 bit: Y
2265                     01,01          PFX     IC0 is simple with 2 code words
2266    0008  81            0000001|111 IC0     1 bit: I1C9&D=0
2267    0009  47 02             0|47h|1 IC0     1 bit: I1C9
2268                       00,01        PFX     D0 is simple with 1 code word
2269    000b  8a                010|000 D0      0 bits: 10x-3
2270    ====================Meta block contents=====================
2271                           1        IC0     Literal: 1, copy: 9
2272                          0         0,0=L0  X
2273                      0,()          0,3=D0  Absolute value: 1 (pos 0)
2274                                            Seen before: "XXXXXXXXX"
2275                     0              IC0     Literal: 1, copy: 9, same distance
2276                   |1               0,54=L0 Y
2277                                            Seen before: "YYYYYYYYY"
2278    """,
2279
2280'XY': """
2281    >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
2282    ... except NotImplementedError: pass
2283    addr  hex               binary context explanation
2284    -----------------------Stream header------------------------
2285    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2286    ======================Metablock header======================
2287                              1     LAST    Last block: 1: True
2288                             0      EMPTY   Empty block: 0: False
2289    0001  13 00        |00h|13h,|00 MLEN    data length: 0013h+1=20
2290    -------------------Block type descriptors-------------------
2291    0003  00                      0 BT#L    0: 1 literal block type
2292                                 0  BT#I    0: 1 insert&copy block type
2293                                0   BT#D    0: 1 distance block type
2294    ------------------Distance code parameters------------------
2295    0004  a4               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
2296    -----------------------Context modes------------------------
2297                         10         LC0     Context mode for type 0: 2(UTF8)
2298    ------------------------Context maps------------------------
2299                        0           LT#     0: 1 literal prefix tree
2300                       0            DT#     0: 1 distance prefix tree
2301    ---------------------Prefix code lists----------------------
2302    0005  b0                 0|1,01 PFX     L0 is simple with 2 code words
2303    0006  b2              0|1011000 L0      1 bit: X
2304    0007  82              0|1011001 L0      1 bit: Y
2305                     00,01          PFX     IC0 is simple with 1 code word
2306    0008  84            0000100|100 IC0     0 bits: I4C6&D=0
2307    0009  00                 00,0|1 PFX     D0 is simple with 1 code word
2308    000a  e0                0|00000 D0      0 bits: last
2309    ====================Meta block contents=====================
2310                          ()        IC0     Literal: 4, copy: 6, same distance
2311                         0          0,0=L0  X
2312                        0           0,52=L0 X
2313                       0            0,54=L0 X
2314                      0             0,54=L0 X
2315                                            Seen before: "XXXXXX"
2316                    ()              IC0     Literal: 4, copy: 6, same distance
2317                   1                0,54=L0 Y
2318                  1                 0,54=L0 Y
2319                |1                  0,54=L0 Y
2320    000b  01                      1 0,54=L0 Y
2321                                            Seen before: "YYYYYY"
2322    """,
2323
2324'empty': """
2325    >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
2326    ... except NotImplementedError: pass
2327    addr  hex               binary context explanation
2328    -----------------------Stream header------------------------
2329    0000  81                0000001 WSIZE   windowsize=(1<<17)-16=131056
2330    ======================Metablock header======================
2331                          |1        LAST    Last block: 1: True
2332    0001  16                      0 EMPTY   Empty block: 0: False
2333                                11  MLEN    11: empty block
2334                               0    RSVD    Reserved (must be zero)
2335    0002  00           000000|00,01 SKIP    skip length: 0h+1=1
2336                    |00             SKIP    2 bits ignored
2337    Skipping to 4
2338    """,
2339
2340}
2341
2342if __name__=='__main__':
2343    import sys
2344    if len(sys.argv)>1:
2345        l = Layout(BitStream(open(sys.argv[1],'rb').read()))
2346        l.processStream()
2347    else:
2348        sys.path.append("h:/Persoonlijk/bin")
2349        try:
2350            import brotli
2351            open('brotlidump.br', 'wb').write(
2352                brotli.compress(
2353                    open('brotlidump.py', 'r').read()
2354                ))
2355            olleke = BitStream(brotli.compress(
2356                'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
2357        except ImportError: pass
2358        import doctest
2359        doctest.testmod(optionflags=doctest.REPORT_NDIFF
2360            #|doctest.FAIL_FAST
2361            )
2362