1
2/*-------------------------------------------------------------*/
3/*--- Block sorting machinery                               ---*/
4/*---                                           blocksort.c ---*/
5/*-------------------------------------------------------------*/
6
7/* ------------------------------------------------------------------
8   This file is part of bzip2/libbzip2, a program and library for
9   lossless, block-sorting data compression.
10
11   bzip2/libbzip2 version 1.0.5 of 10 December 2007
12   Copyright (C) 1996-2007 Julian Seward <jseward@bzip.org>
13
14   Please read the WARNING, DISCLAIMER and PATENTS sections in the
15   README file.
16
17   This program is released under the terms of the license contained
18   in the file LICENSE.
19   ------------------------------------------------------------------ */
20
21
22#include "bzlib_private.h"
23
24/*---------------------------------------------*/
25/*--- Fallback O(N log(N)^2) sorting        ---*/
26/*--- algorithm, for repetitive blocks      ---*/
27/*---------------------------------------------*/
28
29/*---------------------------------------------*/
30static
31__inline__
32void fallbackSimpleSort ( UInt32* fmap,
33                          UInt32* eclass,
34                          Int32   lo,
35                          Int32   hi )
36{
37   Int32 i, j, tmp;
38   UInt32 ec_tmp;
39
40   if (lo == hi) return;
41
42   if (hi - lo > 3) {
43      for ( i = hi-4; i >= lo; i-- ) {
44         tmp = fmap[i];
45         ec_tmp = eclass[tmp];
46         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
47            fmap[j-4] = fmap[j];
48         fmap[j-4] = tmp;
49      }
50   }
51
52   for ( i = hi-1; i >= lo; i-- ) {
53      tmp = fmap[i];
54      ec_tmp = eclass[tmp];
55      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
56         fmap[j-1] = fmap[j];
57      fmap[j-1] = tmp;
58   }
59}
60
61
62/*---------------------------------------------*/
63#define fswap(zz1, zz2) \
64   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
65
66#define fvswap(zzp1, zzp2, zzn)       \
67{                                     \
68   Int32 yyp1 = (zzp1);               \
69   Int32 yyp2 = (zzp2);               \
70   Int32 yyn  = (zzn);                \
71   while (yyn > 0) {                  \
72      fswap(fmap[yyp1], fmap[yyp2]);  \
73      yyp1++; yyp2++; yyn--;          \
74   }                                  \
75}
76
77
78#define fmin(a,b) ((a) < (b)) ? (a) : (b)
79
80#define fpush(lz,hz) { stackLo[sp] = lz; \
81                       stackHi[sp] = hz; \
82                       sp++; }
83
84#define fpop(lz,hz) { sp--;              \
85                      lz = stackLo[sp];  \
86                      hz = stackHi[sp]; }
87
88#define FALLBACK_QSORT_SMALL_THRESH 10
89#define FALLBACK_QSORT_STACK_SIZE   100
90
91
92static
93void fallbackQSort3 ( UInt32* fmap,
94                      UInt32* eclass,
95                      Int32   loSt,
96                      Int32   hiSt )
97{
98   Int32 unLo, unHi, ltLo, gtHi, n, m;
99   Int32 sp, lo, hi;
100   UInt32 med, r, r3;
101   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
102   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
103
104   r = 0;
105
106   sp = 0;
107   fpush ( loSt, hiSt );
108
109   while (sp > 0) {
110
111      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
112
113      fpop ( lo, hi );
114      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
115         fallbackSimpleSort ( fmap, eclass, lo, hi );
116         continue;
117      }
118
119      /* Random partitioning.  Median of 3 sometimes fails to
120         avoid bad cases.  Median of 9 seems to help but
121         looks rather expensive.  This too seems to work but
122         is cheaper.  Guidance for the magic constants
123         7621 and 32768 is taken from Sedgewick's algorithms
124         book, chapter 35.
125      */
126      r = ((r * 7621) + 1) % 32768;
127      r3 = r % 3;
128      if (r3 == 0) med = eclass[fmap[lo]]; else
129      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
130                   med = eclass[fmap[hi]];
131
132      unLo = ltLo = lo;
133      unHi = gtHi = hi;
134
135      while (1) {
136         while (1) {
137            if (unLo > unHi) break;
138            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
139            if (n == 0) {
140               fswap(fmap[unLo], fmap[ltLo]);
141               ltLo++; unLo++;
142               continue;
143            };
144            if (n > 0) break;
145            unLo++;
146         }
147         while (1) {
148            if (unLo > unHi) break;
149            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
150            if (n == 0) {
151               fswap(fmap[unHi], fmap[gtHi]);
152               gtHi--; unHi--;
153               continue;
154            };
155            if (n < 0) break;
156            unHi--;
157         }
158         if (unLo > unHi) break;
159         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
160      }
161
162      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
163
164      if (gtHi < ltLo) continue;
165
166      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
167      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
168
169      n = lo + unLo - ltLo - 1;
170      m = hi - (gtHi - unHi) + 1;
171
172      if (n - lo > hi - m) {
173         fpush ( lo, n );
174         fpush ( m, hi );
175      } else {
176         fpush ( m, hi );
177         fpush ( lo, n );
178      }
179   }
180}
181
182#undef fmin
183#undef fpush
184#undef fpop
185#undef fswap
186#undef fvswap
187#undef FALLBACK_QSORT_SMALL_THRESH
188#undef FALLBACK_QSORT_STACK_SIZE
189
190
191/*---------------------------------------------*/
192/* Pre:
193      nblock > 0
194      eclass exists for [0 .. nblock-1]
195      ((UChar*)eclass) [0 .. nblock-1] holds block
196      ptr exists for [0 .. nblock-1]
197
198   Post:
199      ((UChar*)eclass) [0 .. nblock-1] holds block
200      All other areas of eclass destroyed
201      fmap [0 .. nblock-1] holds sorted order
202      bhtab [ 0 .. 2+(nblock/32) ] destroyed
203*/
204
205#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
206#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
207#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
208#define      WORD_BH(zz)  bhtab[(zz) >> 5]
209#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
210
211static
212void fallbackSort ( UInt32* fmap,
213                    UInt32* eclass,
214                    UInt32* bhtab,
215                    Int32   nblock,
216                    Int32   verb )
217{
218   Int32 ftab[257];
219   Int32 ftabCopy[256];
220   Int32 H, i, j, k, l, r, cc, cc1;
221   Int32 nNotDone;
222   Int32 nBhtab;
223   UChar* eclass8 = (UChar*)eclass;
224
225   /*--
226      Initial 1-char radix sort to generate
227      initial fmap and initial BH bits.
228   --*/
229   if (verb >= 4)
230      VPrintf0 ( "        bucket sorting ...\n" );
231   for (i = 0; i < 257;    i++) ftab[i] = 0;
232   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
233   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
234   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
235
236   for (i = 0; i < nblock; i++) {
237      j = eclass8[i];
238      k = ftab[j] - 1;
239      ftab[j] = k;
240      fmap[k] = i;
241   }
242
243   nBhtab = 2 + (nblock / 32);
244   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
245   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
246
247   /*--
248      Inductively refine the buckets.  Kind-of an
249      "exponential radix sort" (!), inspired by the
250      Manber-Myers suffix array construction algorithm.
251   --*/
252
253   /*-- set sentinel bits for block-end detection --*/
254   for (i = 0; i < 32; i++) {
255      SET_BH(nblock + 2*i);
256      CLEAR_BH(nblock + 2*i + 1);
257   }
258
259   /*-- the log(N) loop --*/
260   H = 1;
261   while (1) {
262
263      if (verb >= 4)
264         VPrintf1 ( "        depth %6d has ", H );
265
266      j = 0;
267      for (i = 0; i < nblock; i++) {
268         if (ISSET_BH(i)) j = i;
269         k = fmap[i] - H; if (k < 0) k += nblock;
270         eclass[k] = j;
271      }
272
273      nNotDone = 0;
274      r = -1;
275      while (1) {
276
277	 /*-- find the next non-singleton bucket --*/
278         k = r + 1;
279         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
280         if (ISSET_BH(k)) {
281            while (WORD_BH(k) == 0xffffffff) k += 32;
282            while (ISSET_BH(k)) k++;
283         }
284         l = k - 1;
285         if (l >= nblock) break;
286         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
287         if (!ISSET_BH(k)) {
288            while (WORD_BH(k) == 0x00000000) k += 32;
289            while (!ISSET_BH(k)) k++;
290         }
291         r = k - 1;
292         if (r >= nblock) break;
293
294         /*-- now [l, r] bracket current bucket --*/
295         if (r > l) {
296            nNotDone += (r - l + 1);
297            fallbackQSort3 ( fmap, eclass, l, r );
298
299            /*-- scan bucket and generate header bits-- */
300            cc = -1;
301            for (i = l; i <= r; i++) {
302               cc1 = eclass[fmap[i]];
303               if (cc != cc1) { SET_BH(i); cc = cc1; };
304            }
305         }
306      }
307
308      if (verb >= 4)
309         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
310
311      H *= 2;
312      if (H > nblock || nNotDone == 0) break;
313   }
314
315   /*--
316      Reconstruct the original block in
317      eclass8 [0 .. nblock-1], since the
318      previous phase destroyed it.
319   --*/
320   if (verb >= 4)
321      VPrintf0 ( "        reconstructing block ...\n" );
322   j = 0;
323   for (i = 0; i < nblock; i++) {
324      while (ftabCopy[j] == 0) j++;
325      ftabCopy[j]--;
326      eclass8[fmap[i]] = (UChar)j;
327   }
328   AssertH ( j < 256, 1005 );
329}
330
331#undef       SET_BH
332#undef     CLEAR_BH
333#undef     ISSET_BH
334#undef      WORD_BH
335#undef UNALIGNED_BH
336
337
338/*---------------------------------------------*/
339/*--- The main, O(N^2 log(N)) sorting       ---*/
340/*--- algorithm.  Faster for "normal"       ---*/
341/*--- non-repetitive blocks.                ---*/
342/*---------------------------------------------*/
343
344/*---------------------------------------------*/
345static
346__inline__
347Bool mainGtU ( UInt32  i1,
348               UInt32  i2,
349               UChar*  block,
350               UInt16* quadrant,
351               UInt32  nblock,
352               Int32*  budget )
353{
354   Int32  k;
355   UChar  c1, c2;
356   UInt16 s1, s2;
357
358   AssertD ( i1 != i2, "mainGtU" );
359   /* 1 */
360   c1 = block[i1]; c2 = block[i2];
361   if (c1 != c2) return (c1 > c2);
362   i1++; i2++;
363   /* 2 */
364   c1 = block[i1]; c2 = block[i2];
365   if (c1 != c2) return (c1 > c2);
366   i1++; i2++;
367   /* 3 */
368   c1 = block[i1]; c2 = block[i2];
369   if (c1 != c2) return (c1 > c2);
370   i1++; i2++;
371   /* 4 */
372   c1 = block[i1]; c2 = block[i2];
373   if (c1 != c2) return (c1 > c2);
374   i1++; i2++;
375   /* 5 */
376   c1 = block[i1]; c2 = block[i2];
377   if (c1 != c2) return (c1 > c2);
378   i1++; i2++;
379   /* 6 */
380   c1 = block[i1]; c2 = block[i2];
381   if (c1 != c2) return (c1 > c2);
382   i1++; i2++;
383   /* 7 */
384   c1 = block[i1]; c2 = block[i2];
385   if (c1 != c2) return (c1 > c2);
386   i1++; i2++;
387   /* 8 */
388   c1 = block[i1]; c2 = block[i2];
389   if (c1 != c2) return (c1 > c2);
390   i1++; i2++;
391   /* 9 */
392   c1 = block[i1]; c2 = block[i2];
393   if (c1 != c2) return (c1 > c2);
394   i1++; i2++;
395   /* 10 */
396   c1 = block[i1]; c2 = block[i2];
397   if (c1 != c2) return (c1 > c2);
398   i1++; i2++;
399   /* 11 */
400   c1 = block[i1]; c2 = block[i2];
401   if (c1 != c2) return (c1 > c2);
402   i1++; i2++;
403   /* 12 */
404   c1 = block[i1]; c2 = block[i2];
405   if (c1 != c2) return (c1 > c2);
406   i1++; i2++;
407
408   k = nblock + 8;
409
410   do {
411      /* 1 */
412      c1 = block[i1]; c2 = block[i2];
413      if (c1 != c2) return (c1 > c2);
414      s1 = quadrant[i1]; s2 = quadrant[i2];
415      if (s1 != s2) return (s1 > s2);
416      i1++; i2++;
417      /* 2 */
418      c1 = block[i1]; c2 = block[i2];
419      if (c1 != c2) return (c1 > c2);
420      s1 = quadrant[i1]; s2 = quadrant[i2];
421      if (s1 != s2) return (s1 > s2);
422      i1++; i2++;
423      /* 3 */
424      c1 = block[i1]; c2 = block[i2];
425      if (c1 != c2) return (c1 > c2);
426      s1 = quadrant[i1]; s2 = quadrant[i2];
427      if (s1 != s2) return (s1 > s2);
428      i1++; i2++;
429      /* 4 */
430      c1 = block[i1]; c2 = block[i2];
431      if (c1 != c2) return (c1 > c2);
432      s1 = quadrant[i1]; s2 = quadrant[i2];
433      if (s1 != s2) return (s1 > s2);
434      i1++; i2++;
435      /* 5 */
436      c1 = block[i1]; c2 = block[i2];
437      if (c1 != c2) return (c1 > c2);
438      s1 = quadrant[i1]; s2 = quadrant[i2];
439      if (s1 != s2) return (s1 > s2);
440      i1++; i2++;
441      /* 6 */
442      c1 = block[i1]; c2 = block[i2];
443      if (c1 != c2) return (c1 > c2);
444      s1 = quadrant[i1]; s2 = quadrant[i2];
445      if (s1 != s2) return (s1 > s2);
446      i1++; i2++;
447      /* 7 */
448      c1 = block[i1]; c2 = block[i2];
449      if (c1 != c2) return (c1 > c2);
450      s1 = quadrant[i1]; s2 = quadrant[i2];
451      if (s1 != s2) return (s1 > s2);
452      i1++; i2++;
453      /* 8 */
454      c1 = block[i1]; c2 = block[i2];
455      if (c1 != c2) return (c1 > c2);
456      s1 = quadrant[i1]; s2 = quadrant[i2];
457      if (s1 != s2) return (s1 > s2);
458      i1++; i2++;
459
460      if (i1 >= nblock) i1 -= nblock;
461      if (i2 >= nblock) i2 -= nblock;
462
463      k -= 8;
464      (*budget)--;
465   }
466      while (k >= 0);
467
468   return False;
469}
470
471
472/*---------------------------------------------*/
473/*--
474   Knuth's increments seem to work better
475   than Incerpi-Sedgewick here.  Possibly
476   because the number of elems to sort is
477   usually small, typically <= 20.
478--*/
479static
480Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
481                   9841, 29524, 88573, 265720,
482                   797161, 2391484 };
483
484static
485void mainSimpleSort ( UInt32* ptr,
486                      UChar*  block,
487                      UInt16* quadrant,
488                      Int32   nblock,
489                      Int32   lo,
490                      Int32   hi,
491                      Int32   d,
492                      Int32*  budget )
493{
494   Int32 i, j, h, bigN, hp;
495   UInt32 v;
496
497   bigN = hi - lo + 1;
498   if (bigN < 2) return;
499
500   hp = 0;
501   while (incs[hp] < bigN) hp++;
502   hp--;
503
504   for (; hp >= 0; hp--) {
505      h = incs[hp];
506
507      i = lo + h;
508      while (True) {
509
510         /*-- copy 1 --*/
511         if (i > hi) break;
512         v = ptr[i];
513         j = i;
514         while ( mainGtU (
515                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
516                 ) ) {
517            ptr[j] = ptr[j-h];
518            j = j - h;
519            if (j <= (lo + h - 1)) break;
520         }
521         ptr[j] = v;
522         i++;
523
524         /*-- copy 2 --*/
525         if (i > hi) break;
526         v = ptr[i];
527         j = i;
528         while ( mainGtU (
529                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
530                 ) ) {
531            ptr[j] = ptr[j-h];
532            j = j - h;
533            if (j <= (lo + h - 1)) break;
534         }
535         ptr[j] = v;
536         i++;
537
538         /*-- copy 3 --*/
539         if (i > hi) break;
540         v = ptr[i];
541         j = i;
542         while ( mainGtU (
543                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
544                 ) ) {
545            ptr[j] = ptr[j-h];
546            j = j - h;
547            if (j <= (lo + h - 1)) break;
548         }
549         ptr[j] = v;
550         i++;
551
552         if (*budget < 0) return;
553      }
554   }
555}
556
557
558/*---------------------------------------------*/
559/*--
560   The following is an implementation of
561   an elegant 3-way quicksort for strings,
562   described in a paper "Fast Algorithms for
563   Sorting and Searching Strings", by Robert
564   Sedgewick and Jon L. Bentley.
565--*/
566
567#define mswap(zz1, zz2) \
568   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
569
570#define mvswap(zzp1, zzp2, zzn)       \
571{                                     \
572   Int32 yyp1 = (zzp1);               \
573   Int32 yyp2 = (zzp2);               \
574   Int32 yyn  = (zzn);                \
575   while (yyn > 0) {                  \
576      mswap(ptr[yyp1], ptr[yyp2]);    \
577      yyp1++; yyp2++; yyn--;          \
578   }                                  \
579}
580
581static
582__inline__
583UChar mmed3 ( UChar a, UChar b, UChar c )
584{
585   UChar t;
586   if (a > b) { t = a; a = b; b = t; };
587   if (b > c) {
588      b = c;
589      if (a > b) b = a;
590   }
591   return b;
592}
593
594#define mmin(a,b) ((a) < (b)) ? (a) : (b)
595
596#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
597                          stackHi[sp] = hz; \
598                          stackD [sp] = dz; \
599                          sp++; }
600
601#define mpop(lz,hz,dz) { sp--;             \
602                         lz = stackLo[sp]; \
603                         hz = stackHi[sp]; \
604                         dz = stackD [sp]; }
605
606
607#define mnextsize(az) (nextHi[az]-nextLo[az])
608
609#define mnextswap(az,bz)                                        \
610   { Int32 tz;                                                  \
611     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
612     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
613     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
614
615
616#define MAIN_QSORT_SMALL_THRESH 20
617#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
618#define MAIN_QSORT_STACK_SIZE 100
619
620static
621void mainQSort3 ( UInt32* ptr,
622                  UChar*  block,
623                  UInt16* quadrant,
624                  Int32   nblock,
625                  Int32   loSt,
626                  Int32   hiSt,
627                  Int32   dSt,
628                  Int32*  budget )
629{
630   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
631   Int32 sp, lo, hi, d;
632
633   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
634   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
635   Int32 stackD [MAIN_QSORT_STACK_SIZE];
636
637   Int32 nextLo[3];
638   Int32 nextHi[3];
639   Int32 nextD [3];
640
641   sp = 0;
642   mpush ( loSt, hiSt, dSt );
643
644   while (sp > 0) {
645
646      AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
647
648      mpop ( lo, hi, d );
649      if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
650          d > MAIN_QSORT_DEPTH_THRESH) {
651         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
652         if (*budget < 0) return;
653         continue;
654      }
655
656      med = (Int32)
657            mmed3 ( block[ptr[ lo         ]+d],
658                    block[ptr[ hi         ]+d],
659                    block[ptr[ (lo+hi)>>1 ]+d] );
660
661      unLo = ltLo = lo;
662      unHi = gtHi = hi;
663
664      while (True) {
665         while (True) {
666            if (unLo > unHi) break;
667            n = ((Int32)block[ptr[unLo]+d]) - med;
668            if (n == 0) {
669               mswap(ptr[unLo], ptr[ltLo]);
670               ltLo++; unLo++; continue;
671            };
672            if (n >  0) break;
673            unLo++;
674         }
675         while (True) {
676            if (unLo > unHi) break;
677            n = ((Int32)block[ptr[unHi]+d]) - med;
678            if (n == 0) {
679               mswap(ptr[unHi], ptr[gtHi]);
680               gtHi--; unHi--; continue;
681            };
682            if (n <  0) break;
683            unHi--;
684         }
685         if (unLo > unHi) break;
686         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
687      }
688
689      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
690
691      if (gtHi < ltLo) {
692         mpush(lo, hi, d+1 );
693         continue;
694      }
695
696      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
697      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
698
699      n = lo + unLo - ltLo - 1;
700      m = hi - (gtHi - unHi) + 1;
701
702      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
703      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
704      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
705
706      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
707      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
708      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
709
710      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
711      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
712
713      mpush (nextLo[0], nextHi[0], nextD[0]);
714      mpush (nextLo[1], nextHi[1], nextD[1]);
715      mpush (nextLo[2], nextHi[2], nextD[2]);
716   }
717}
718
719#undef mswap
720#undef mvswap
721#undef mpush
722#undef mpop
723#undef mmin
724#undef mnextsize
725#undef mnextswap
726#undef MAIN_QSORT_SMALL_THRESH
727#undef MAIN_QSORT_DEPTH_THRESH
728#undef MAIN_QSORT_STACK_SIZE
729
730
731/*---------------------------------------------*/
732/* Pre:
733      nblock > N_OVERSHOOT
734      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
735      ((UChar*)block32) [0 .. nblock-1] holds block
736      ptr exists for [0 .. nblock-1]
737
738   Post:
739      ((UChar*)block32) [0 .. nblock-1] holds block
740      All other areas of block32 destroyed
741      ftab [0 .. 65536 ] destroyed
742      ptr [0 .. nblock-1] holds sorted order
743      if (*budget < 0), sorting was abandoned
744*/
745
746#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
747#define SETMASK (1 << 21)
748#define CLEARMASK (~(SETMASK))
749
750static
751void mainSort ( UInt32* ptr,
752                UChar*  block,
753                UInt16* quadrant,
754                UInt32* ftab,
755                Int32   nblock,
756                Int32   verb,
757                Int32*  budget )
758{
759   Int32  i, j, k, ss, sb;
760   Int32  runningOrder[256];
761   Bool   bigDone[256];
762   Int32  copyStart[256];
763   Int32  copyEnd  [256];
764   UChar  c1;
765   Int32  numQSorted;
766   UInt16 s;
767   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
768
769   /*-- set up the 2-byte frequency table --*/
770   for (i = 65536; i >= 0; i--) ftab[i] = 0;
771
772   j = block[0] << 8;
773   i = nblock-1;
774   for (; i >= 3; i -= 4) {
775      quadrant[i] = 0;
776      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
777      ftab[j]++;
778      quadrant[i-1] = 0;
779      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
780      ftab[j]++;
781      quadrant[i-2] = 0;
782      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
783      ftab[j]++;
784      quadrant[i-3] = 0;
785      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
786      ftab[j]++;
787   }
788   for (; i >= 0; i--) {
789      quadrant[i] = 0;
790      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
791      ftab[j]++;
792   }
793
794   /*-- (emphasises close relationship of block & quadrant) --*/
795   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
796      block   [nblock+i] = block[i];
797      quadrant[nblock+i] = 0;
798   }
799
800   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
801
802   /*-- Complete the initial radix sort --*/
803   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
804
805   s = block[0] << 8;
806   i = nblock-1;
807   for (; i >= 3; i -= 4) {
808      s = (s >> 8) | (block[i] << 8);
809      j = ftab[s] -1;
810      ftab[s] = j;
811      ptr[j] = i;
812      s = (s >> 8) | (block[i-1] << 8);
813      j = ftab[s] -1;
814      ftab[s] = j;
815      ptr[j] = i-1;
816      s = (s >> 8) | (block[i-2] << 8);
817      j = ftab[s] -1;
818      ftab[s] = j;
819      ptr[j] = i-2;
820      s = (s >> 8) | (block[i-3] << 8);
821      j = ftab[s] -1;
822      ftab[s] = j;
823      ptr[j] = i-3;
824   }
825   for (; i >= 0; i--) {
826      s = (s >> 8) | (block[i] << 8);
827      j = ftab[s] -1;
828      ftab[s] = j;
829      ptr[j] = i;
830   }
831
832   /*--
833      Now ftab contains the first loc of every small bucket.
834      Calculate the running order, from smallest to largest
835      big bucket.
836   --*/
837   for (i = 0; i <= 255; i++) {
838      bigDone     [i] = False;
839      runningOrder[i] = i;
840   }
841
842   {
843      Int32 vv;
844      Int32 h = 1;
845      do h = 3 * h + 1; while (h <= 256);
846      do {
847         h = h / 3;
848         for (i = h; i <= 255; i++) {
849            vv = runningOrder[i];
850            j = i;
851            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
852               runningOrder[j] = runningOrder[j-h];
853               j = j - h;
854               if (j <= (h - 1)) goto zero;
855            }
856            zero:
857            runningOrder[j] = vv;
858         }
859      } while (h != 1);
860   }
861
862   /*--
863      The main sorting loop.
864   --*/
865
866   numQSorted = 0;
867
868   for (i = 0; i <= 255; i++) {
869
870      /*--
871         Process big buckets, starting with the least full.
872         Basically this is a 3-step process in which we call
873         mainQSort3 to sort the small buckets [ss, j], but
874         also make a big effort to avoid the calls if we can.
875      --*/
876      ss = runningOrder[i];
877
878      /*--
879         Step 1:
880         Complete the big bucket [ss] by quicksorting
881         any unsorted small buckets [ss, j], for j != ss.
882         Hopefully previous pointer-scanning phases have already
883         completed many of the small buckets [ss, j], so
884         we don't have to sort them at all.
885      --*/
886      for (j = 0; j <= 255; j++) {
887         if (j != ss) {
888            sb = (ss << 8) + j;
889            if ( ! (ftab[sb] & SETMASK) ) {
890               Int32 lo = ftab[sb]   & CLEARMASK;
891               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
892               if (hi > lo) {
893                  if (verb >= 4)
894                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
895                                "done %d   this %d\n",
896                                ss, j, numQSorted, hi - lo + 1 );
897                  mainQSort3 (
898                     ptr, block, quadrant, nblock,
899                     lo, hi, BZ_N_RADIX, budget
900                  );
901                  numQSorted += (hi - lo + 1);
902                  if (*budget < 0) return;
903               }
904            }
905            ftab[sb] |= SETMASK;
906         }
907      }
908
909      AssertH ( !bigDone[ss], 1006 );
910
911      /*--
912         Step 2:
913         Now scan this big bucket [ss] so as to synthesise the
914         sorted order for small buckets [t, ss] for all t,
915         including, magically, the bucket [ss,ss] too.
916         This will avoid doing Real Work in subsequent Step 1's.
917      --*/
918      {
919         for (j = 0; j <= 255; j++) {
920            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
921            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
922         }
923         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
924            k = ptr[j]-1; if (k < 0) k += nblock;
925            c1 = block[k];
926            if (!bigDone[c1])
927               ptr[ copyStart[c1]++ ] = k;
928         }
929         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
930            k = ptr[j]-1; if (k < 0) k += nblock;
931            c1 = block[k];
932            if (!bigDone[c1])
933               ptr[ copyEnd[c1]-- ] = k;
934         }
935      }
936
937      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
938                ||
939                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
940                   Necessity for this case is demonstrated by compressing
941                   a sequence of approximately 48.5 million of character
942                   251; 1.0.0/1.0.1 will then die here. */
943                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
944                1007 )
945
946      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
947
948      /*--
949         Step 3:
950         The [ss] big bucket is now done.  Record this fact,
951         and update the quadrant descriptors.  Remember to
952         update quadrants in the overshoot area too, if
953         necessary.  The "if (i < 255)" test merely skips
954         this updating for the last bucket processed, since
955         updating for the last bucket is pointless.
956
957         The quadrant array provides a way to incrementally
958         cache sort orderings, as they appear, so as to
959         make subsequent comparisons in fullGtU() complete
960         faster.  For repetitive blocks this makes a big
961         difference (but not big enough to be able to avoid
962         the fallback sorting mechanism, exponential radix sort).
963
964         The precise meaning is: at all times:
965
966            for 0 <= i < nblock and 0 <= j <= nblock
967
968            if block[i] != block[j],
969
970               then the relative values of quadrant[i] and
971                    quadrant[j] are meaningless.
972
973               else {
974                  if quadrant[i] < quadrant[j]
975                     then the string starting at i lexicographically
976                     precedes the string starting at j
977
978                  else if quadrant[i] > quadrant[j]
979                     then the string starting at j lexicographically
980                     precedes the string starting at i
981
982                  else
983                     the relative ordering of the strings starting
984                     at i and j has not yet been determined.
985               }
986      --*/
987      bigDone[ss] = True;
988
989      if (i < 255) {
990         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
991         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
992         Int32 shifts   = 0;
993
994         while ((bbSize >> shifts) > 65534) shifts++;
995
996         for (j = bbSize-1; j >= 0; j--) {
997            Int32 a2update     = ptr[bbStart + j];
998            UInt16 qVal        = (UInt16)(j >> shifts);
999            quadrant[a2update] = qVal;
1000            if (a2update < BZ_N_OVERSHOOT)
1001               quadrant[a2update + nblock] = qVal;
1002         }
1003         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004      }
1005
1006   }
1007
1008   if (verb >= 4)
1009      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1010                 nblock, numQSorted, nblock - numQSorted );
1011}
1012
1013#undef BIGFREQ
1014#undef SETMASK
1015#undef CLEARMASK
1016
1017
1018/*---------------------------------------------*/
1019/* Pre:
1020      nblock > 0
1021      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022      ((UChar*)arr2)  [0 .. nblock-1] holds block
1023      arr1 exists for [0 .. nblock-1]
1024
1025   Post:
1026      ((UChar*)arr2) [0 .. nblock-1] holds block
1027      All other areas of block destroyed
1028      ftab [ 0 .. 65536 ] destroyed
1029      arr1 [0 .. nblock-1] holds sorted order
1030*/
1031void BZ2_blockSort ( EState* s )
1032{
1033   UInt32* ptr    = s->ptr;
1034   UChar*  block  = s->block;
1035   UInt32* ftab   = s->ftab;
1036   Int32   nblock = s->nblock;
1037   Int32   verb   = s->verbosity;
1038   Int32   wfact  = s->workFactor;
1039   UInt16* quadrant;
1040   Int32   budget;
1041   Int32   budgetInit;
1042   Int32   i;
1043
1044   if (nblock < 10000) {
1045      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046   } else {
1047      /* Calculate the location for quadrant, remembering to get
1048         the alignment right.  Assumes that &(block[0]) is at least
1049         2-byte aligned -- this should be ok since block is really
1050         the first section of arr2.
1051      */
1052      i = nblock+BZ_N_OVERSHOOT;
1053      if (i & 1) i++;
1054      quadrant = (UInt16*)(&(block[i]));
1055
1056      /* (wfact-1) / 3 puts the default-factor-30
1057         transition point at very roughly the same place as
1058         with v0.1 and v0.9.0.
1059         Not that it particularly matters any more, since the
1060         resulting compressed stream is now the same regardless
1061         of whether or not we use the main sort or fallback sort.
1062      */
1063      if (wfact < 1  ) wfact = 1;
1064      if (wfact > 100) wfact = 100;
1065      budgetInit = nblock * ((wfact-1) / 3);
1066      budget = budgetInit;
1067
1068      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069      if (verb >= 3)
1070         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1071                    budgetInit - budget,
1072                    nblock,
1073                    (float)(budgetInit - budget) /
1074                    (float)(nblock==0 ? 1 : nblock) );
1075      if (budget < 0) {
1076         if (verb >= 2)
1077            VPrintf0 ( "    too repetitive; using fallback"
1078                       " sorting algorithm\n" );
1079         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080      }
1081   }
1082
1083   s->origPtr = -1;
1084   for (i = 0; i < s->nblock; i++)
1085      if (ptr[i] == 0)
1086         { s->origPtr = i; break; };
1087
1088   AssertH( s->origPtr != -1, 1003 );
1089}
1090
1091
1092/*-------------------------------------------------------------*/
1093/*--- end                                       blocksort.c ---*/
1094/*-------------------------------------------------------------*/
1095