lossless.c revision 0406ce1417f76f2034833414dcecc9f56253640c
1// Copyright 2012 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// Image transforms and color space conversion methods for lossless decoder.
11//
12// Authors: Vikas Arora (vikaas.arora@gmail.com)
13//          Jyrki Alakuijala (jyrki@google.com)
14//          Urvang Joshi (urvang@google.com)
15
16#include "./dsp.h"
17
18// Define the following if target arch is sure to have SSE2
19// #define WEBP_TARGET_HAS_SSE2
20
21#if defined(__cplusplus) || defined(c_plusplus)
22extern "C" {
23#endif
24
25#if defined(WEBP_TARGET_HAS_SSE2)
26#include <emmintrin.h>
27#endif
28
29#include <math.h>
30#include <stdlib.h>
31#include "./lossless.h"
32#include "../dec/vp8li.h"
33#include "./yuv.h"
34
35#define MAX_DIFF_COST (1e30f)
36
37// lookup table for small values of log2(int)
38#define APPROX_LOG_MAX  4096
39#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
40const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
41  0.0000000000000000f, 0.0000000000000000f,
42  1.0000000000000000f, 1.5849625007211560f,
43  2.0000000000000000f, 2.3219280948873621f,
44  2.5849625007211560f, 2.8073549220576041f,
45  3.0000000000000000f, 3.1699250014423121f,
46  3.3219280948873621f, 3.4594316186372973f,
47  3.5849625007211560f, 3.7004397181410921f,
48  3.8073549220576041f, 3.9068905956085187f,
49  4.0000000000000000f, 4.0874628412503390f,
50  4.1699250014423121f, 4.2479275134435852f,
51  4.3219280948873626f, 4.3923174227787606f,
52  4.4594316186372973f, 4.5235619560570130f,
53  4.5849625007211560f, 4.6438561897747243f,
54  4.7004397181410917f, 4.7548875021634682f,
55  4.8073549220576037f, 4.8579809951275718f,
56  4.9068905956085187f, 4.9541963103868749f,
57  5.0000000000000000f, 5.0443941193584533f,
58  5.0874628412503390f, 5.1292830169449663f,
59  5.1699250014423121f, 5.2094533656289501f,
60  5.2479275134435852f, 5.2854022188622487f,
61  5.3219280948873626f, 5.3575520046180837f,
62  5.3923174227787606f, 5.4262647547020979f,
63  5.4594316186372973f, 5.4918530963296747f,
64  5.5235619560570130f, 5.5545888516776376f,
65  5.5849625007211560f, 5.6147098441152083f,
66  5.6438561897747243f, 5.6724253419714951f,
67  5.7004397181410917f, 5.7279204545631987f,
68  5.7548875021634682f, 5.7813597135246599f,
69  5.8073549220576037f, 5.8328900141647412f,
70  5.8579809951275718f, 5.8826430493618415f,
71  5.9068905956085187f, 5.9307373375628866f,
72  5.9541963103868749f, 5.9772799234999167f,
73  6.0000000000000000f, 6.0223678130284543f,
74  6.0443941193584533f, 6.0660891904577720f,
75  6.0874628412503390f, 6.1085244567781691f,
76  6.1292830169449663f, 6.1497471195046822f,
77  6.1699250014423121f, 6.1898245588800175f,
78  6.2094533656289501f, 6.2288186904958804f,
79  6.2479275134435852f, 6.2667865406949010f,
80  6.2854022188622487f, 6.3037807481771030f,
81  6.3219280948873626f, 6.3398500028846243f,
82  6.3575520046180837f, 6.3750394313469245f,
83  6.3923174227787606f, 6.4093909361377017f,
84  6.4262647547020979f, 6.4429434958487279f,
85  6.4594316186372973f, 6.4757334309663976f,
86  6.4918530963296747f, 6.5077946401986963f,
87  6.5235619560570130f, 6.5391588111080309f,
88  6.5545888516776376f, 6.5698556083309478f,
89  6.5849625007211560f, 6.5999128421871278f,
90  6.6147098441152083f, 6.6293566200796094f,
91  6.6438561897747243f, 6.6582114827517946f,
92  6.6724253419714951f, 6.6865005271832185f,
93  6.7004397181410917f, 6.7142455176661224f,
94  6.7279204545631987f, 6.7414669864011464f,
95  6.7548875021634682f, 6.7681843247769259f,
96  6.7813597135246599f, 6.7944158663501061f,
97  6.8073549220576037f, 6.8201789624151878f,
98  6.8328900141647412f, 6.8454900509443747f,
99  6.8579809951275718f, 6.8703647195834047f,
100  6.8826430493618415f, 6.8948177633079437f,
101  6.9068905956085187f, 6.9188632372745946f,
102  6.9307373375628866f, 6.9425145053392398f,
103  6.9541963103868749f, 6.9657842846620869f,
104  6.9772799234999167f, 6.9886846867721654f,
105  7.0000000000000000f, 7.0112272554232539f,
106  7.0223678130284543f, 7.0334230015374501f,
107  7.0443941193584533f, 7.0552824355011898f,
108  7.0660891904577720f, 7.0768155970508308f,
109  7.0874628412503390f, 7.0980320829605263f,
110  7.1085244567781691f, 7.1189410727235076f,
111  7.1292830169449663f, 7.1395513523987936f,
112  7.1497471195046822f, 7.1598713367783890f,
113  7.1699250014423121f, 7.1799090900149344f,
114  7.1898245588800175f, 7.1996723448363644f,
115  7.2094533656289501f, 7.2191685204621611f,
116  7.2288186904958804f, 7.2384047393250785f,
117  7.2479275134435852f, 7.2573878426926521f,
118  7.2667865406949010f, 7.2761244052742375f,
119  7.2854022188622487f, 7.2946207488916270f,
120  7.3037807481771030f, 7.3128829552843557f,
121  7.3219280948873626f, 7.3309168781146167f,
122  7.3398500028846243f, 7.3487281542310771f,
123  7.3575520046180837f, 7.3663222142458160f,
124  7.3750394313469245f, 7.3837042924740519f,
125  7.3923174227787606f, 7.4008794362821843f,
126  7.4093909361377017f, 7.4178525148858982f,
127  7.4262647547020979f, 7.4346282276367245f,
128  7.4429434958487279f, 7.4512111118323289f,
129  7.4594316186372973f, 7.4676055500829976f,
130  7.4757334309663976f, 7.4838157772642563f,
131  7.4918530963296747f, 7.4998458870832056f,
132  7.5077946401986963f, 7.5156998382840427f,
133  7.5235619560570130f, 7.5313814605163118f,
134  7.5391588111080309f, 7.5468944598876364f,
135  7.5545888516776376f, 7.5622424242210728f,
136  7.5698556083309478f, 7.5774288280357486f,
137  7.5849625007211560f, 7.5924570372680806f,
138  7.5999128421871278f, 7.6073303137496104f,
139  7.6147098441152083f, 7.6220518194563764f,
140  7.6293566200796094f, 7.6366246205436487f,
141  7.6438561897747243f, 7.6510516911789281f,
142  7.6582114827517946f, 7.6653359171851764f,
143  7.6724253419714951f, 7.6794800995054464f,
144  7.6865005271832185f, 7.6934869574993252f,
145  7.7004397181410917f, 7.7073591320808825f,
146  7.7142455176661224f, 7.7210991887071855f,
147  7.7279204545631987f, 7.7347096202258383f,
148  7.7414669864011464f, 7.7481928495894605f,
149  7.7548875021634682f, 7.7615512324444795f,
150  7.7681843247769259f, 7.7747870596011736f,
151  7.7813597135246599f, 7.7879025593914317f,
152  7.7944158663501061f, 7.8008998999203047f,
153  7.8073549220576037f, 7.8137811912170374f,
154  7.8201789624151878f, 7.8265484872909150f,
155  7.8328900141647412f, 7.8392037880969436f,
156  7.8454900509443747f, 7.8517490414160571f,
157  7.8579809951275718f, 7.8641861446542797f,
158  7.8703647195834047f, 7.8765169465649993f,
159  7.8826430493618415f, 7.8887432488982591f,
160  7.8948177633079437f, 7.9008668079807486f,
161  7.9068905956085187f, 7.9128893362299619f,
162  7.9188632372745946f, 7.9248125036057812f,
163  7.9307373375628866f, 7.9366379390025709f,
164  7.9425145053392398f, 7.9483672315846778f,
165  7.9541963103868749f, 7.9600019320680805f,
166  7.9657842846620869f, 7.9715435539507719f,
167  7.9772799234999167f, 7.9829935746943103f,
168  7.9886846867721654f, 7.9943534368588577f
169};
170
171const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
172  0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
173  8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
174  24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
175  43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
176  64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
177  86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
178  110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
179  134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
180  160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
181  186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
182  212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
183  240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
184  268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
185  296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
186  325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
187  354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
188  384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
189  413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
190  444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
191  474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
192  505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
193  536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
194  568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
195  600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
196  632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
197  664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
198  696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
199  729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
200  762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
201  795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
202  828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
203  862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
204  896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
205  929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
206  963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
207  998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
208  1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
209  1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
210  1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
211  1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
212  1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
213  1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
214  1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
215  1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
216  1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
217  1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
218  1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
219  1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
220  1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
221  1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
222  1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
223  1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
224  1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
225  1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
226  1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
227  1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
228  1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
229  1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
230  1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
231  1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
232  1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
233  1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
234  1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
235  2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
236};
237
238float VP8LFastSLog2Slow(int v) {
239  assert(v >= LOG_LOOKUP_IDX_MAX);
240  if (v < APPROX_LOG_MAX) {
241    int log_cnt = 0;
242    const float v_f = (float)v;
243    while (v >= LOG_LOOKUP_IDX_MAX) {
244      ++log_cnt;
245      v = v >> 1;
246    }
247    return v_f * (kLog2Table[v] + log_cnt);
248  } else {
249    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
250  }
251}
252
253float VP8LFastLog2Slow(int v) {
254  assert(v >= LOG_LOOKUP_IDX_MAX);
255  if (v < APPROX_LOG_MAX) {
256    int log_cnt = 0;
257    while (v >= LOG_LOOKUP_IDX_MAX) {
258      ++log_cnt;
259      v = v >> 1;
260    }
261    return kLog2Table[v] + log_cnt;
262  } else {
263    return (float)(LOG_2_RECIPROCAL * log((double)v));
264  }
265}
266
267//------------------------------------------------------------------------------
268// Image transforms.
269
270// In-place sum of each component with mod 256.
271static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
272  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
273  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
274  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
275}
276
277static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
278  return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
279}
280
281static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
282  return Average2(Average2(a0, a2), a1);
283}
284
285static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
286                                     uint32_t a2, uint32_t a3) {
287  return Average2(Average2(a0, a1), Average2(a2, a3));
288}
289
290#if defined(WEBP_TARGET_HAS_SSE2)
291static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
292                                                   uint32_t c2) {
293  const __m128i zero = _mm_setzero_si128();
294  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
295  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
296  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
297  const __m128i V1 = _mm_add_epi16(C0, C1);
298  const __m128i V2 = _mm_sub_epi16(V1, C2);
299  const __m128i b = _mm_packus_epi16(V2, V2);
300  const uint32_t output = _mm_cvtsi128_si32(b);
301  return output;
302}
303
304static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
305                                                   uint32_t c2) {
306  const uint32_t ave = Average2(c0, c1);
307  const __m128i zero = _mm_setzero_si128();
308  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
309  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
310  const __m128i A1 = _mm_sub_epi16(A0, B0);
311  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
312  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
313  const __m128i A3 = _mm_srai_epi16(A2, 1);
314  const __m128i A4 = _mm_add_epi16(A0, A3);
315  const __m128i A5 = _mm_packus_epi16(A4, A4);
316  const uint32_t output = _mm_cvtsi128_si32(A5);
317  return output;
318}
319
320static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
321  int pa_minus_pb;
322  const __m128i zero = _mm_setzero_si128();
323  const __m128i A0 = _mm_cvtsi32_si128(a);
324  const __m128i B0 = _mm_cvtsi32_si128(b);
325  const __m128i C0 = _mm_cvtsi32_si128(c);
326  const __m128i AC0 = _mm_subs_epu8(A0, C0);
327  const __m128i CA0 = _mm_subs_epu8(C0, A0);
328  const __m128i BC0 = _mm_subs_epu8(B0, C0);
329  const __m128i CB0 = _mm_subs_epu8(C0, B0);
330  const __m128i AC = _mm_or_si128(AC0, CA0);
331  const __m128i BC = _mm_or_si128(BC0, CB0);
332  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
333  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
334  const __m128i diff = _mm_sub_epi16(pb, pa);
335  {
336    int16_t out[8];
337    _mm_storeu_si128((__m128i*)out, diff);
338    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
339  }
340  return (pa_minus_pb <= 0) ? a : b;
341}
342
343#else
344
345static WEBP_INLINE uint32_t Clip255(uint32_t a) {
346  if (a < 256) {
347    return a;
348  }
349  // return 0, when a is a negative integer.
350  // return 255, when a is positive.
351  return ~a >> 24;
352}
353
354static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
355  return Clip255(a + b - c);
356}
357
358static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
359                                                   uint32_t c2) {
360  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
361  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
362                                         (c1 >> 16) & 0xff,
363                                         (c2 >> 16) & 0xff);
364  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
365                                         (c1 >> 8) & 0xff,
366                                         (c2 >> 8) & 0xff);
367  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
368  return (a << 24) | (r << 16) | (g << 8) | b;
369}
370
371static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
372  return Clip255(a + (a - b) / 2);
373}
374
375static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
376                                                   uint32_t c2) {
377  const uint32_t ave = Average2(c0, c1);
378  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
379  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
380  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
381  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
382  return (a << 24) | (r << 16) | (g << 8) | b;
383}
384
385static WEBP_INLINE int Sub3(int a, int b, int c) {
386  const int pb = b - c;
387  const int pa = a - c;
388  return abs(pb) - abs(pa);
389}
390
391static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
392  const int pa_minus_pb =
393      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
394      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
395      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
396      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
397  return (pa_minus_pb <= 0) ? a : b;
398}
399#endif
400
401//------------------------------------------------------------------------------
402// Predictors
403
404static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
405  (void)top;
406  (void)left;
407  return ARGB_BLACK;
408}
409static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
410  (void)top;
411  return left;
412}
413static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
414  (void)left;
415  return top[0];
416}
417static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
418  (void)left;
419  return top[1];
420}
421static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
422  (void)left;
423  return top[-1];
424}
425static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
426  const uint32_t pred = Average3(left, top[0], top[1]);
427  return pred;
428}
429static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
430  const uint32_t pred = Average2(left, top[-1]);
431  return pred;
432}
433static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
434  const uint32_t pred = Average2(left, top[0]);
435  return pred;
436}
437static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
438  const uint32_t pred = Average2(top[-1], top[0]);
439  (void)left;
440  return pred;
441}
442static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
443  const uint32_t pred = Average2(top[0], top[1]);
444  (void)left;
445  return pred;
446}
447static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
448  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
449  return pred;
450}
451static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
452  const uint32_t pred = Select(top[0], left, top[-1]);
453  return pred;
454}
455static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
456  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
457  return pred;
458}
459static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
460  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
461  return pred;
462}
463
464typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
465static const PredictorFunc kPredictors[16] = {
466  Predictor0, Predictor1, Predictor2, Predictor3,
467  Predictor4, Predictor5, Predictor6, Predictor7,
468  Predictor8, Predictor9, Predictor10, Predictor11,
469  Predictor12, Predictor13,
470  Predictor0, Predictor0    // <- padding security sentinels
471};
472
473// TODO(vikasa): Replace 256 etc with defines.
474static float PredictionCostSpatial(const int* counts,
475                                   int weight_0, double exp_val) {
476  const int significant_symbols = 16;
477  const double exp_decay_factor = 0.6;
478  double bits = weight_0 * counts[0];
479  int i;
480  for (i = 1; i < significant_symbols; ++i) {
481    bits += exp_val * (counts[i] + counts[256 - i]);
482    exp_val *= exp_decay_factor;
483  }
484  return (float)(-0.1 * bits);
485}
486
487// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
488static float CombinedShannonEntropy(const int* const X,
489                                    const int* const Y, int n) {
490  int i;
491  double retval = 0.;
492  int sumX = 0, sumXY = 0;
493  for (i = 0; i < n; ++i) {
494    const int x = X[i];
495    const int xy = X[i] + Y[i];
496    if (x != 0) {
497      sumX += x;
498      retval -= VP8LFastSLog2(x);
499    }
500    if (xy != 0) {
501      sumXY += xy;
502      retval -= VP8LFastSLog2(xy);
503    }
504  }
505  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
506  return (float)retval;
507}
508
509static float PredictionCostSpatialHistogram(int accumulated[4][256],
510                                            int tile[4][256]) {
511  int i;
512  double retval = 0;
513  for (i = 0; i < 4; ++i) {
514    const double kExpValue = 0.94;
515    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
516    retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
517  }
518  return (float)retval;
519}
520
521static int GetBestPredictorForTile(int width, int height,
522                                   int tile_x, int tile_y, int bits,
523                                   int accumulated[4][256],
524                                   const uint32_t* const argb_scratch) {
525  const int kNumPredModes = 14;
526  const int col_start = tile_x << bits;
527  const int row_start = tile_y << bits;
528  const int tile_size = 1 << bits;
529  const int ymax = (tile_size <= height - row_start) ?
530      tile_size : height - row_start;
531  const int xmax = (tile_size <= width - col_start) ?
532      tile_size : width - col_start;
533  int histo[4][256];
534  float best_diff = MAX_DIFF_COST;
535  int best_mode = 0;
536
537  int mode;
538  for (mode = 0; mode < kNumPredModes; ++mode) {
539    const uint32_t* current_row = argb_scratch;
540    const PredictorFunc pred_func = kPredictors[mode];
541    float cur_diff;
542    int y;
543    memset(&histo[0][0], 0, sizeof(histo));
544    for (y = 0; y < ymax; ++y) {
545      int x;
546      const int row = row_start + y;
547      const uint32_t* const upper_row = current_row;
548      current_row = upper_row + width;
549      for (x = 0; x < xmax; ++x) {
550        const int col = col_start + x;
551        uint32_t predict;
552        uint32_t predict_diff;
553        if (row == 0) {
554          predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
555        } else if (col == 0) {
556          predict = upper_row[col];  // Top.
557        } else {
558          predict = pred_func(current_row[col - 1], upper_row + col);
559        }
560        predict_diff = VP8LSubPixels(current_row[col], predict);
561        ++histo[0][predict_diff >> 24];
562        ++histo[1][((predict_diff >> 16) & 0xff)];
563        ++histo[2][((predict_diff >> 8) & 0xff)];
564        ++histo[3][(predict_diff & 0xff)];
565      }
566    }
567    cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
568    if (cur_diff < best_diff) {
569      best_diff = cur_diff;
570      best_mode = mode;
571    }
572  }
573
574  return best_mode;
575}
576
577static void CopyTileWithPrediction(int width, int height,
578                                   int tile_x, int tile_y, int bits, int mode,
579                                   const uint32_t* const argb_scratch,
580                                   uint32_t* const argb) {
581  const int col_start = tile_x << bits;
582  const int row_start = tile_y << bits;
583  const int tile_size = 1 << bits;
584  const int ymax = (tile_size <= height - row_start) ?
585      tile_size : height - row_start;
586  const int xmax = (tile_size <= width - col_start) ?
587      tile_size : width - col_start;
588  const PredictorFunc pred_func = kPredictors[mode];
589  const uint32_t* current_row = argb_scratch;
590
591  int y;
592  for (y = 0; y < ymax; ++y) {
593    int x;
594    const int row = row_start + y;
595    const uint32_t* const upper_row = current_row;
596    current_row = upper_row + width;
597    for (x = 0; x < xmax; ++x) {
598      const int col = col_start + x;
599      const int pix = row * width + col;
600      uint32_t predict;
601      if (row == 0) {
602        predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
603      } else if (col == 0) {
604        predict = upper_row[col];  // Top.
605      } else {
606        predict = pred_func(current_row[col - 1], upper_row + col);
607      }
608      argb[pix] = VP8LSubPixels(current_row[col], predict);
609    }
610  }
611}
612
613void VP8LResidualImage(int width, int height, int bits,
614                       uint32_t* const argb, uint32_t* const argb_scratch,
615                       uint32_t* const image) {
616  const int max_tile_size = 1 << bits;
617  const int tiles_per_row = VP8LSubSampleSize(width, bits);
618  const int tiles_per_col = VP8LSubSampleSize(height, bits);
619  uint32_t* const upper_row = argb_scratch;
620  uint32_t* const current_tile_rows = argb_scratch + width;
621  int tile_y;
622  int histo[4][256];
623  memset(histo, 0, sizeof(histo));
624  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
625    const int tile_y_offset = tile_y * max_tile_size;
626    const int this_tile_height =
627        (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
628    int tile_x;
629    if (tile_y > 0) {
630      memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
631             width * sizeof(*upper_row));
632    }
633    memcpy(current_tile_rows, &argb[tile_y_offset * width],
634           this_tile_height * width * sizeof(*current_tile_rows));
635    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
636      int pred;
637      int y;
638      const int tile_x_offset = tile_x * max_tile_size;
639      int all_x_max = tile_x_offset + max_tile_size;
640      if (all_x_max > width) {
641        all_x_max = width;
642      }
643      pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
644                                     argb_scratch);
645      image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
646      CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
647                             argb_scratch, argb);
648      for (y = 0; y < max_tile_size; ++y) {
649        int ix;
650        int all_x;
651        int all_y = tile_y_offset + y;
652        if (all_y >= height) {
653          break;
654        }
655        ix = all_y * width + tile_x_offset;
656        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
657          const uint32_t a = argb[ix];
658          ++histo[0][a >> 24];
659          ++histo[1][((a >> 16) & 0xff)];
660          ++histo[2][((a >> 8) & 0xff)];
661          ++histo[3][(a & 0xff)];
662        }
663      }
664    }
665  }
666}
667
668// Inverse prediction.
669static void PredictorInverseTransform(const VP8LTransform* const transform,
670                                      int y_start, int y_end, uint32_t* data) {
671  const int width = transform->xsize_;
672  if (y_start == 0) {  // First Row follows the L (mode=1) mode.
673    int x;
674    const uint32_t pred0 = Predictor0(data[-1], NULL);
675    AddPixelsEq(data, pred0);
676    for (x = 1; x < width; ++x) {
677      const uint32_t pred1 = Predictor1(data[x - 1], NULL);
678      AddPixelsEq(data + x, pred1);
679    }
680    data += width;
681    ++y_start;
682  }
683
684  {
685    int y = y_start;
686    const int mask = (1 << transform->bits_) - 1;
687    const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
688    const uint32_t* pred_mode_base =
689        transform->data_ + (y >> transform->bits_) * tiles_per_row;
690
691    while (y < y_end) {
692      int x;
693      const uint32_t pred2 = Predictor2(data[-1], data - width);
694      const uint32_t* pred_mode_src = pred_mode_base;
695      PredictorFunc pred_func;
696
697      // First pixel follows the T (mode=2) mode.
698      AddPixelsEq(data, pred2);
699
700      // .. the rest:
701      pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
702      for (x = 1; x < width; ++x) {
703        uint32_t pred;
704        if ((x & mask) == 0) {    // start of tile. Read predictor function.
705          pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
706        }
707        pred = pred_func(data[x - 1], data + x - width);
708        AddPixelsEq(data + x, pred);
709      }
710      data += width;
711      ++y;
712      if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
713        pred_mode_base += tiles_per_row;
714      }
715    }
716  }
717}
718
719void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
720  int i = 0;
721#if defined(WEBP_TARGET_HAS_SSE2)
722  const __m128i mask = _mm_set1_epi32(0x0000ff00);
723  for (; i + 4 < num_pixs; i += 4) {
724    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
725    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
726    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
727    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
728    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
729    const __m128i out = _mm_sub_epi8(in, in_0g0g);
730    _mm_storeu_si128((__m128i*)&argb_data[i], out);
731  }
732  // fallthrough and finish off with plain-C
733#endif
734  for (; i < num_pixs; ++i) {
735    const uint32_t argb = argb_data[i];
736    const uint32_t green = (argb >> 8) & 0xff;
737    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
738    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
739    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
740  }
741}
742
743// Add green to blue and red channels (i.e. perform the inverse transform of
744// 'subtract green').
745static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
746                                 int y_start, int y_end, uint32_t* data) {
747  const int width = transform->xsize_;
748  const uint32_t* const data_end = data + (y_end - y_start) * width;
749#if defined(WEBP_TARGET_HAS_SSE2)
750  const __m128i mask = _mm_set1_epi32(0x0000ff00);
751  for (; data + 4 < data_end; data += 4) {
752    const __m128i in = _mm_loadu_si128((__m128i*)data);
753    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
754    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
755    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
756    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
757    const __m128i out = _mm_add_epi8(in, in_0g0g);
758    _mm_storeu_si128((__m128i*)data, out);
759  }
760  // fallthrough and finish off with plain-C
761#endif
762  while (data < data_end) {
763    const uint32_t argb = *data;
764    const uint32_t green = ((argb >> 8) & 0xff);
765    uint32_t red_blue = (argb & 0x00ff00ffu);
766    red_blue += (green << 16) | green;
767    red_blue &= 0x00ff00ffu;
768    *data++ = (argb & 0xff00ff00u) | red_blue;
769  }
770}
771
772typedef struct {
773  // Note: the members are uint8_t, so that any negative values are
774  // automatically converted to "mod 256" values.
775  uint8_t green_to_red_;
776  uint8_t green_to_blue_;
777  uint8_t red_to_blue_;
778} Multipliers;
779
780static WEBP_INLINE void MultipliersClear(Multipliers* m) {
781  m->green_to_red_ = 0;
782  m->green_to_blue_ = 0;
783  m->red_to_blue_ = 0;
784}
785
786static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
787                                                int8_t color) {
788  return (uint32_t)((int)(color_pred) * color) >> 5;
789}
790
791static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
792                                               Multipliers* const m) {
793  m->green_to_red_  = (color_code >>  0) & 0xff;
794  m->green_to_blue_ = (color_code >>  8) & 0xff;
795  m->red_to_blue_   = (color_code >> 16) & 0xff;
796}
797
798static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
799  return 0xff000000u |
800         ((uint32_t)(m->red_to_blue_) << 16) |
801         ((uint32_t)(m->green_to_blue_) << 8) |
802         m->green_to_red_;
803}
804
805static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
806                                           uint32_t argb, int inverse) {
807  const uint32_t green = argb >> 8;
808  const uint32_t red = argb >> 16;
809  uint32_t new_red = red;
810  uint32_t new_blue = argb;
811
812  if (inverse) {
813    new_red += ColorTransformDelta(m->green_to_red_, green);
814    new_red &= 0xff;
815    new_blue += ColorTransformDelta(m->green_to_blue_, green);
816    new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
817    new_blue &= 0xff;
818  } else {
819    new_red -= ColorTransformDelta(m->green_to_red_, green);
820    new_red &= 0xff;
821    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
822    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
823    new_blue &= 0xff;
824  }
825  return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
826}
827
828static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
829                                             uint32_t argb) {
830  const uint32_t green = argb >> 8;
831  uint32_t new_red = argb >> 16;
832  new_red -= ColorTransformDelta(green_to_red, green);
833  return (new_red & 0xff);
834}
835
836static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
837                                              uint8_t red_to_blue,
838                                              uint32_t argb) {
839  const uint32_t green = argb >> 8;
840  const uint32_t red = argb >> 16;
841  uint8_t new_blue = argb;
842  new_blue -= ColorTransformDelta(green_to_blue, green);
843  new_blue -= ColorTransformDelta(red_to_blue, red);
844  return (new_blue & 0xff);
845}
846
847static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
848                                          int ix, int xsize) {
849  const uint32_t v = argb[ix];
850  if (ix >= xsize + 3) {
851    if (v == argb[ix - xsize] &&
852        argb[ix - 1] == argb[ix - xsize - 1] &&
853        argb[ix - 2] == argb[ix - xsize - 2] &&
854        argb[ix - 3] == argb[ix - xsize - 3]) {
855      return 1;
856    }
857    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
858  } else if (ix >= 3) {
859    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
860  }
861  return 0;
862}
863
864static float PredictionCostCrossColor(const int accumulated[256],
865                                      const int counts[256]) {
866  // Favor low entropy, locally and globally.
867  // Favor small absolute values for PredictionCostSpatial
868  static const double kExpValue = 2.4;
869  return CombinedShannonEntropy(counts, accumulated, 256) +
870         PredictionCostSpatial(counts, 3, kExpValue);
871}
872
873static Multipliers GetBestColorTransformForTile(
874    int tile_x, int tile_y, int bits,
875    Multipliers prevX,
876    Multipliers prevY,
877    int step, int xsize, int ysize,
878    int* accumulated_red_histo,
879    int* accumulated_blue_histo,
880    const uint32_t* const argb) {
881  float best_diff = MAX_DIFF_COST;
882  float cur_diff;
883  const int halfstep = step / 2;
884  const int max_tile_size = 1 << bits;
885  const int tile_y_offset = tile_y * max_tile_size;
886  const int tile_x_offset = tile_x * max_tile_size;
887  int green_to_red;
888  int green_to_blue;
889  int red_to_blue;
890  int all_x_max = tile_x_offset + max_tile_size;
891  int all_y_max = tile_y_offset + max_tile_size;
892  Multipliers best_tx;
893  MultipliersClear(&best_tx);
894  if (all_x_max > xsize) {
895    all_x_max = xsize;
896  }
897  if (all_y_max > ysize) {
898    all_y_max = ysize;
899  }
900
901  for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
902    int histo[256] = { 0 };
903    int all_y;
904
905    for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
906      int ix = all_y * xsize + tile_x_offset;
907      int all_x;
908      for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
909        if (SkipRepeatedPixels(argb, ix, xsize)) {
910          continue;
911        }
912        ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
913      }
914    }
915    cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
916    if ((uint8_t)green_to_red == prevX.green_to_red_) {
917      cur_diff -= 3;  // favor keeping the areas locally similar
918    }
919    if ((uint8_t)green_to_red == prevY.green_to_red_) {
920      cur_diff -= 3;  // favor keeping the areas locally similar
921    }
922    if (green_to_red == 0) {
923      cur_diff -= 3;
924    }
925    if (cur_diff < best_diff) {
926      best_diff = cur_diff;
927      best_tx.green_to_red_ = green_to_red;
928    }
929  }
930  best_diff = MAX_DIFF_COST;
931  for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
932    for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
933      int all_y;
934      int histo[256] = { 0 };
935      for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
936        int all_x;
937        int ix = all_y * xsize + tile_x_offset;
938        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
939          if (SkipRepeatedPixels(argb, ix, xsize)) {
940            continue;
941          }
942          ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
943        }
944      }
945      cur_diff =
946          PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
947      if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
948        cur_diff -= 3;  // favor keeping the areas locally similar
949      }
950      if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
951        cur_diff -= 3;  // favor keeping the areas locally similar
952      }
953      if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
954        cur_diff -= 3;  // favor keeping the areas locally similar
955      }
956      if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
957        cur_diff -= 3;  // favor keeping the areas locally similar
958      }
959      if (green_to_blue == 0) {
960        cur_diff -= 3;
961      }
962      if (red_to_blue == 0) {
963        cur_diff -= 3;
964      }
965      if (cur_diff < best_diff) {
966        best_diff = cur_diff;
967        best_tx.green_to_blue_ = green_to_blue;
968        best_tx.red_to_blue_ = red_to_blue;
969      }
970    }
971  }
972  return best_tx;
973}
974
975static void CopyTileWithColorTransform(int xsize, int ysize,
976                                       int tile_x, int tile_y, int bits,
977                                       Multipliers color_transform,
978                                       uint32_t* const argb) {
979  int y;
980  int xscan = 1 << bits;
981  int yscan = 1 << bits;
982  tile_x <<= bits;
983  tile_y <<= bits;
984  if (xscan > xsize - tile_x) {
985    xscan = xsize - tile_x;
986  }
987  if (yscan > ysize - tile_y) {
988    yscan = ysize - tile_y;
989  }
990  yscan += tile_y;
991  for (y = tile_y; y < yscan; ++y) {
992    int ix = y * xsize + tile_x;
993    const int end_ix = ix + xscan;
994    for (; ix < end_ix; ++ix) {
995      argb[ix] = TransformColor(&color_transform, argb[ix], 0);
996    }
997  }
998}
999
1000void VP8LColorSpaceTransform(int width, int height, int bits, int step,
1001                             uint32_t* const argb, uint32_t* image) {
1002  const int max_tile_size = 1 << bits;
1003  int tile_xsize = VP8LSubSampleSize(width, bits);
1004  int tile_ysize = VP8LSubSampleSize(height, bits);
1005  int accumulated_red_histo[256] = { 0 };
1006  int accumulated_blue_histo[256] = { 0 };
1007  int tile_y;
1008  int tile_x;
1009  Multipliers prevX;
1010  Multipliers prevY;
1011  MultipliersClear(&prevY);
1012  MultipliersClear(&prevX);
1013  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
1014    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
1015      Multipliers color_transform;
1016      int all_x_max;
1017      int y;
1018      const int tile_y_offset = tile_y * max_tile_size;
1019      const int tile_x_offset = tile_x * max_tile_size;
1020      if (tile_y != 0) {
1021        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
1022        ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
1023                               &prevY);
1024      } else if (tile_x != 0) {
1025        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
1026      }
1027      color_transform =
1028          GetBestColorTransformForTile(tile_x, tile_y, bits,
1029                                       prevX, prevY,
1030                                       step, width, height,
1031                                       &accumulated_red_histo[0],
1032                                       &accumulated_blue_histo[0],
1033                                       argb);
1034      image[tile_y * tile_xsize + tile_x] =
1035          MultipliersToColorCode(&color_transform);
1036      CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
1037                                 color_transform, argb);
1038
1039      // Gather accumulated histogram data.
1040      all_x_max = tile_x_offset + max_tile_size;
1041      if (all_x_max > width) {
1042        all_x_max = width;
1043      }
1044      for (y = 0; y < max_tile_size; ++y) {
1045        int ix;
1046        int all_x;
1047        int all_y = tile_y_offset + y;
1048        if (all_y >= height) {
1049          break;
1050        }
1051        ix = all_y * width + tile_x_offset;
1052        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
1053          if (ix >= 2 &&
1054              argb[ix] == argb[ix - 2] &&
1055              argb[ix] == argb[ix - 1]) {
1056            continue;  // repeated pixels are handled by backward references
1057          }
1058          if (ix >= width + 2 &&
1059              argb[ix - 2] == argb[ix - width - 2] &&
1060              argb[ix - 1] == argb[ix - width - 1] &&
1061              argb[ix] == argb[ix - width]) {
1062            continue;  // repeated pixels are handled by backward references
1063          }
1064          ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
1065          ++accumulated_blue_histo[argb[ix] & 0xff];
1066        }
1067      }
1068    }
1069  }
1070}
1071
1072// Color space inverse transform.
1073static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
1074                                       int y_start, int y_end, uint32_t* data) {
1075  const int width = transform->xsize_;
1076  const int mask = (1 << transform->bits_) - 1;
1077  const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
1078  int y = y_start;
1079  const uint32_t* pred_row =
1080      transform->data_ + (y >> transform->bits_) * tiles_per_row;
1081
1082  while (y < y_end) {
1083    const uint32_t* pred = pred_row;
1084    Multipliers m = { 0, 0, 0 };
1085    int x;
1086
1087    for (x = 0; x < width; ++x) {
1088      if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
1089      data[x] = TransformColor(&m, data[x], 1);
1090    }
1091    data += width;
1092    ++y;
1093    if ((y & mask) == 0) pred_row += tiles_per_row;;
1094  }
1095}
1096
1097// Separate out pixels packed together using pixel-bundling.
1098// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
1099#define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)             \
1100void FUNC_NAME(const VP8LTransform* const transform,                           \
1101               int y_start, int y_end, const TYPE* src, TYPE* dst) {           \
1102  int y;                                                                       \
1103  const int bits_per_pixel = 8 >> transform->bits_;                            \
1104  const int width = transform->xsize_;                                         \
1105  const uint32_t* const color_map = transform->data_;                          \
1106  if (bits_per_pixel < 8) {                                                    \
1107    const int pixels_per_byte = 1 << transform->bits_;                         \
1108    const int count_mask = pixels_per_byte - 1;                                \
1109    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
1110    for (y = y_start; y < y_end; ++y) {                                        \
1111      uint32_t packed_pixels = 0;                                              \
1112      int x;                                                                   \
1113      for (x = 0; x < width; ++x) {                                            \
1114        /* We need to load fresh 'packed_pixels' once every                */  \
1115        /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
1116        /* is a power of 2, so can just use a mask for that, instead of    */  \
1117        /* decrementing a counter.                                         */  \
1118        if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
1119        *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
1120        packed_pixels >>= bits_per_pixel;                                      \
1121      }                                                                        \
1122    }                                                                          \
1123  } else {                                                                     \
1124    for (y = y_start; y < y_end; ++y) {                                        \
1125      int x;                                                                   \
1126      for (x = 0; x < width; ++x) {                                            \
1127        *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                      \
1128      }                                                                        \
1129    }                                                                          \
1130  }                                                                            \
1131}
1132
1133static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
1134  return (idx >> 8) & 0xff;
1135}
1136
1137static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
1138  return idx;
1139}
1140
1141static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
1142  return val;
1143}
1144
1145static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
1146  return (val >> 8) & 0xff;
1147}
1148
1149static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
1150                           GetARGBValue)
1151COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
1152                    GetAlphaValue)
1153
1154#undef COLOR_INDEX_INVERSE
1155
1156void VP8LInverseTransform(const VP8LTransform* const transform,
1157                          int row_start, int row_end,
1158                          const uint32_t* const in, uint32_t* const out) {
1159  assert(row_start < row_end);
1160  assert(row_end <= transform->ysize_);
1161  switch (transform->type_) {
1162    case SUBTRACT_GREEN:
1163      AddGreenToBlueAndRed(transform, row_start, row_end, out);
1164      break;
1165    case PREDICTOR_TRANSFORM:
1166      PredictorInverseTransform(transform, row_start, row_end, out);
1167      if (row_end != transform->ysize_) {
1168        // The last predicted row in this iteration will be the top-pred row
1169        // for the first row in next iteration.
1170        const int width = transform->xsize_;
1171        memcpy(out - width, out + (row_end - row_start - 1) * width,
1172               width * sizeof(*out));
1173      }
1174      break;
1175    case CROSS_COLOR_TRANSFORM:
1176      ColorSpaceInverseTransform(transform, row_start, row_end, out);
1177      break;
1178    case COLOR_INDEXING_TRANSFORM:
1179      if (in == out && transform->bits_ > 0) {
1180        // Move packed pixels to the end of unpacked region, so that unpacking
1181        // can occur seamlessly.
1182        // Also, note that this is the only transform that applies on
1183        // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
1184        // transforms work on effective width of xsize_.
1185        const int out_stride = (row_end - row_start) * transform->xsize_;
1186        const int in_stride = (row_end - row_start) *
1187            VP8LSubSampleSize(transform->xsize_, transform->bits_);
1188        uint32_t* const src = out + out_stride - in_stride;
1189        memmove(src, out, in_stride * sizeof(*src));
1190        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
1191      } else {
1192        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
1193      }
1194      break;
1195  }
1196}
1197
1198//------------------------------------------------------------------------------
1199// Color space conversion.
1200
1201static int is_big_endian(void) {
1202  static const union {
1203    uint16_t w;
1204    uint8_t b[2];
1205  } tmp = { 1 };
1206  return (tmp.b[0] != 1);
1207}
1208
1209static void ConvertBGRAToRGB(const uint32_t* src,
1210                             int num_pixels, uint8_t* dst) {
1211  const uint32_t* const src_end = src + num_pixels;
1212  while (src < src_end) {
1213    const uint32_t argb = *src++;
1214    *dst++ = (argb >> 16) & 0xff;
1215    *dst++ = (argb >>  8) & 0xff;
1216    *dst++ = (argb >>  0) & 0xff;
1217  }
1218}
1219
1220static void ConvertBGRAToRGBA(const uint32_t* src,
1221                              int num_pixels, uint8_t* dst) {
1222  const uint32_t* const src_end = src + num_pixels;
1223  while (src < src_end) {
1224    const uint32_t argb = *src++;
1225    *dst++ = (argb >> 16) & 0xff;
1226    *dst++ = (argb >>  8) & 0xff;
1227    *dst++ = (argb >>  0) & 0xff;
1228    *dst++ = (argb >> 24) & 0xff;
1229  }
1230}
1231
1232static void ConvertBGRAToRGBA4444(const uint32_t* src,
1233                                  int num_pixels, uint8_t* dst) {
1234  const uint32_t* const src_end = src + num_pixels;
1235  while (src < src_end) {
1236    const uint32_t argb = *src++;
1237    const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
1238    const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
1239#ifdef WEBP_SWAP_16BIT_CSP
1240    *dst++ = ba;
1241    *dst++ = rg;
1242#else
1243    *dst++ = rg;
1244    *dst++ = ba;
1245#endif
1246  }
1247}
1248
1249static void ConvertBGRAToRGB565(const uint32_t* src,
1250                                int num_pixels, uint8_t* dst) {
1251  const uint32_t* const src_end = src + num_pixels;
1252  while (src < src_end) {
1253    const uint32_t argb = *src++;
1254    const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
1255    const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
1256#ifdef WEBP_SWAP_16BIT_CSP
1257    *dst++ = gb;
1258    *dst++ = rg;
1259#else
1260    *dst++ = rg;
1261    *dst++ = gb;
1262#endif
1263  }
1264}
1265
1266static void ConvertBGRAToBGR(const uint32_t* src,
1267                             int num_pixels, uint8_t* dst) {
1268  const uint32_t* const src_end = src + num_pixels;
1269  while (src < src_end) {
1270    const uint32_t argb = *src++;
1271    *dst++ = (argb >>  0) & 0xff;
1272    *dst++ = (argb >>  8) & 0xff;
1273    *dst++ = (argb >> 16) & 0xff;
1274  }
1275}
1276
1277static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
1278                       int swap_on_big_endian) {
1279  if (is_big_endian() == swap_on_big_endian) {
1280    const uint32_t* const src_end = src + num_pixels;
1281    while (src < src_end) {
1282      uint32_t argb = *src++;
1283
1284#if !defined(__BIG_ENDIAN__)
1285#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
1286#if defined(__i386__) || defined(__x86_64__)
1287      __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
1288      *(uint32_t*)dst = argb;
1289#elif defined(_MSC_VER)
1290      argb = _byteswap_ulong(argb);
1291      *(uint32_t*)dst = argb;
1292#else
1293      dst[0] = (argb >> 24) & 0xff;
1294      dst[1] = (argb >> 16) & 0xff;
1295      dst[2] = (argb >>  8) & 0xff;
1296      dst[3] = (argb >>  0) & 0xff;
1297#endif
1298#else  // WEBP_REFERENCE_IMPLEMENTATION
1299      dst[0] = (argb >> 24) & 0xff;
1300      dst[1] = (argb >> 16) & 0xff;
1301      dst[2] = (argb >>  8) & 0xff;
1302      dst[3] = (argb >>  0) & 0xff;
1303#endif
1304#else  // __BIG_ENDIAN__
1305      dst[0] = (argb >>  0) & 0xff;
1306      dst[1] = (argb >>  8) & 0xff;
1307      dst[2] = (argb >> 16) & 0xff;
1308      dst[3] = (argb >> 24) & 0xff;
1309#endif
1310      dst += sizeof(argb);
1311    }
1312  } else {
1313    memcpy(dst, src, num_pixels * sizeof(*src));
1314  }
1315}
1316
1317void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
1318                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
1319  switch (out_colorspace) {
1320    case MODE_RGB:
1321      ConvertBGRAToRGB(in_data, num_pixels, rgba);
1322      break;
1323    case MODE_RGBA:
1324      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
1325      break;
1326    case MODE_rgbA:
1327      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
1328      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1329      break;
1330    case MODE_BGR:
1331      ConvertBGRAToBGR(in_data, num_pixels, rgba);
1332      break;
1333    case MODE_BGRA:
1334      CopyOrSwap(in_data, num_pixels, rgba, 1);
1335      break;
1336    case MODE_bgrA:
1337      CopyOrSwap(in_data, num_pixels, rgba, 1);
1338      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1339      break;
1340    case MODE_ARGB:
1341      CopyOrSwap(in_data, num_pixels, rgba, 0);
1342      break;
1343    case MODE_Argb:
1344      CopyOrSwap(in_data, num_pixels, rgba, 0);
1345      WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
1346      break;
1347    case MODE_RGBA_4444:
1348      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1349      break;
1350    case MODE_rgbA_4444:
1351      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1352      WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
1353      break;
1354    case MODE_RGB_565:
1355      ConvertBGRAToRGB565(in_data, num_pixels, rgba);
1356      break;
1357    default:
1358      assert(0);          // Code flow should not reach here.
1359  }
1360}
1361
1362// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
1363void VP8LBundleColorMap(const uint8_t* const row, int width,
1364                        int xbits, uint32_t* const dst) {
1365  int x;
1366  if (xbits > 0) {
1367    const int bit_depth = 1 << (3 - xbits);
1368    const int mask = (1 << xbits) - 1;
1369    uint32_t code = 0xff000000;
1370    for (x = 0; x < width; ++x) {
1371      const int xsub = x & mask;
1372      if (xsub == 0) {
1373        code = 0xff000000;
1374      }
1375      code |= row[x] << (8 + bit_depth * xsub);
1376      dst[x >> xbits] = code;
1377    }
1378  } else {
1379    for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
1380  }
1381}
1382
1383//------------------------------------------------------------------------------
1384
1385#if defined(__cplusplus) || defined(c_plusplus)
1386}    // extern "C"
1387#endif
1388