lossless.c revision 8b720228d581a84fd173b6dcb2fa295b59db489a
1// Copyright 2012 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// Image transforms and color space conversion methods for lossless decoder.
11//
12// Authors: Vikas Arora (vikaas.arora@gmail.com)
13//          Jyrki Alakuijala (jyrki@google.com)
14//          Urvang Joshi (urvang@google.com)
15
16#include "./dsp.h"
17
18#if defined(WEBP_USE_SSE2)
19#include <emmintrin.h>
20#endif
21
22#include <math.h>
23#include <stdlib.h>
24#include "./lossless.h"
25#include "../dec/vp8li.h"
26#include "./yuv.h"
27
28#define MAX_DIFF_COST (1e30f)
29
30// lookup table for small values of log2(int)
31#define APPROX_LOG_MAX  4096
32#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
33const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
34  0.0000000000000000f, 0.0000000000000000f,
35  1.0000000000000000f, 1.5849625007211560f,
36  2.0000000000000000f, 2.3219280948873621f,
37  2.5849625007211560f, 2.8073549220576041f,
38  3.0000000000000000f, 3.1699250014423121f,
39  3.3219280948873621f, 3.4594316186372973f,
40  3.5849625007211560f, 3.7004397181410921f,
41  3.8073549220576041f, 3.9068905956085187f,
42  4.0000000000000000f, 4.0874628412503390f,
43  4.1699250014423121f, 4.2479275134435852f,
44  4.3219280948873626f, 4.3923174227787606f,
45  4.4594316186372973f, 4.5235619560570130f,
46  4.5849625007211560f, 4.6438561897747243f,
47  4.7004397181410917f, 4.7548875021634682f,
48  4.8073549220576037f, 4.8579809951275718f,
49  4.9068905956085187f, 4.9541963103868749f,
50  5.0000000000000000f, 5.0443941193584533f,
51  5.0874628412503390f, 5.1292830169449663f,
52  5.1699250014423121f, 5.2094533656289501f,
53  5.2479275134435852f, 5.2854022188622487f,
54  5.3219280948873626f, 5.3575520046180837f,
55  5.3923174227787606f, 5.4262647547020979f,
56  5.4594316186372973f, 5.4918530963296747f,
57  5.5235619560570130f, 5.5545888516776376f,
58  5.5849625007211560f, 5.6147098441152083f,
59  5.6438561897747243f, 5.6724253419714951f,
60  5.7004397181410917f, 5.7279204545631987f,
61  5.7548875021634682f, 5.7813597135246599f,
62  5.8073549220576037f, 5.8328900141647412f,
63  5.8579809951275718f, 5.8826430493618415f,
64  5.9068905956085187f, 5.9307373375628866f,
65  5.9541963103868749f, 5.9772799234999167f,
66  6.0000000000000000f, 6.0223678130284543f,
67  6.0443941193584533f, 6.0660891904577720f,
68  6.0874628412503390f, 6.1085244567781691f,
69  6.1292830169449663f, 6.1497471195046822f,
70  6.1699250014423121f, 6.1898245588800175f,
71  6.2094533656289501f, 6.2288186904958804f,
72  6.2479275134435852f, 6.2667865406949010f,
73  6.2854022188622487f, 6.3037807481771030f,
74  6.3219280948873626f, 6.3398500028846243f,
75  6.3575520046180837f, 6.3750394313469245f,
76  6.3923174227787606f, 6.4093909361377017f,
77  6.4262647547020979f, 6.4429434958487279f,
78  6.4594316186372973f, 6.4757334309663976f,
79  6.4918530963296747f, 6.5077946401986963f,
80  6.5235619560570130f, 6.5391588111080309f,
81  6.5545888516776376f, 6.5698556083309478f,
82  6.5849625007211560f, 6.5999128421871278f,
83  6.6147098441152083f, 6.6293566200796094f,
84  6.6438561897747243f, 6.6582114827517946f,
85  6.6724253419714951f, 6.6865005271832185f,
86  6.7004397181410917f, 6.7142455176661224f,
87  6.7279204545631987f, 6.7414669864011464f,
88  6.7548875021634682f, 6.7681843247769259f,
89  6.7813597135246599f, 6.7944158663501061f,
90  6.8073549220576037f, 6.8201789624151878f,
91  6.8328900141647412f, 6.8454900509443747f,
92  6.8579809951275718f, 6.8703647195834047f,
93  6.8826430493618415f, 6.8948177633079437f,
94  6.9068905956085187f, 6.9188632372745946f,
95  6.9307373375628866f, 6.9425145053392398f,
96  6.9541963103868749f, 6.9657842846620869f,
97  6.9772799234999167f, 6.9886846867721654f,
98  7.0000000000000000f, 7.0112272554232539f,
99  7.0223678130284543f, 7.0334230015374501f,
100  7.0443941193584533f, 7.0552824355011898f,
101  7.0660891904577720f, 7.0768155970508308f,
102  7.0874628412503390f, 7.0980320829605263f,
103  7.1085244567781691f, 7.1189410727235076f,
104  7.1292830169449663f, 7.1395513523987936f,
105  7.1497471195046822f, 7.1598713367783890f,
106  7.1699250014423121f, 7.1799090900149344f,
107  7.1898245588800175f, 7.1996723448363644f,
108  7.2094533656289501f, 7.2191685204621611f,
109  7.2288186904958804f, 7.2384047393250785f,
110  7.2479275134435852f, 7.2573878426926521f,
111  7.2667865406949010f, 7.2761244052742375f,
112  7.2854022188622487f, 7.2946207488916270f,
113  7.3037807481771030f, 7.3128829552843557f,
114  7.3219280948873626f, 7.3309168781146167f,
115  7.3398500028846243f, 7.3487281542310771f,
116  7.3575520046180837f, 7.3663222142458160f,
117  7.3750394313469245f, 7.3837042924740519f,
118  7.3923174227787606f, 7.4008794362821843f,
119  7.4093909361377017f, 7.4178525148858982f,
120  7.4262647547020979f, 7.4346282276367245f,
121  7.4429434958487279f, 7.4512111118323289f,
122  7.4594316186372973f, 7.4676055500829976f,
123  7.4757334309663976f, 7.4838157772642563f,
124  7.4918530963296747f, 7.4998458870832056f,
125  7.5077946401986963f, 7.5156998382840427f,
126  7.5235619560570130f, 7.5313814605163118f,
127  7.5391588111080309f, 7.5468944598876364f,
128  7.5545888516776376f, 7.5622424242210728f,
129  7.5698556083309478f, 7.5774288280357486f,
130  7.5849625007211560f, 7.5924570372680806f,
131  7.5999128421871278f, 7.6073303137496104f,
132  7.6147098441152083f, 7.6220518194563764f,
133  7.6293566200796094f, 7.6366246205436487f,
134  7.6438561897747243f, 7.6510516911789281f,
135  7.6582114827517946f, 7.6653359171851764f,
136  7.6724253419714951f, 7.6794800995054464f,
137  7.6865005271832185f, 7.6934869574993252f,
138  7.7004397181410917f, 7.7073591320808825f,
139  7.7142455176661224f, 7.7210991887071855f,
140  7.7279204545631987f, 7.7347096202258383f,
141  7.7414669864011464f, 7.7481928495894605f,
142  7.7548875021634682f, 7.7615512324444795f,
143  7.7681843247769259f, 7.7747870596011736f,
144  7.7813597135246599f, 7.7879025593914317f,
145  7.7944158663501061f, 7.8008998999203047f,
146  7.8073549220576037f, 7.8137811912170374f,
147  7.8201789624151878f, 7.8265484872909150f,
148  7.8328900141647412f, 7.8392037880969436f,
149  7.8454900509443747f, 7.8517490414160571f,
150  7.8579809951275718f, 7.8641861446542797f,
151  7.8703647195834047f, 7.8765169465649993f,
152  7.8826430493618415f, 7.8887432488982591f,
153  7.8948177633079437f, 7.9008668079807486f,
154  7.9068905956085187f, 7.9128893362299619f,
155  7.9188632372745946f, 7.9248125036057812f,
156  7.9307373375628866f, 7.9366379390025709f,
157  7.9425145053392398f, 7.9483672315846778f,
158  7.9541963103868749f, 7.9600019320680805f,
159  7.9657842846620869f, 7.9715435539507719f,
160  7.9772799234999167f, 7.9829935746943103f,
161  7.9886846867721654f, 7.9943534368588577f
162};
163
164const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
165  0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
166  8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
167  24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
168  43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
169  64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
170  86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
171  110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
172  134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
173  160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
174  186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
175  212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
176  240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
177  268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
178  296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
179  325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
180  354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
181  384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
182  413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
183  444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
184  474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
185  505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
186  536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
187  568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
188  600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
189  632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
190  664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
191  696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
192  729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
193  762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
194  795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
195  828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
196  862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
197  896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
198  929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
199  963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
200  998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
201  1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
202  1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
203  1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
204  1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
205  1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
206  1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
207  1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
208  1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
209  1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
210  1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
211  1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
212  1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
213  1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
214  1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
215  1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
216  1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
217  1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
218  1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
219  1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
220  1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
221  1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
222  1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
223  1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
224  1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
225  1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
226  1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
227  1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
228  2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
229};
230
231const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
232  { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
233  { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
234  { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
235  { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
236  { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
237  {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
238  {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
239  {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
240  {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
241  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
242  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
243  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
244  {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
245  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
246  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
247  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
248  {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
249  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
250  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
251  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
252  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
253  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
254  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
255  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
256  {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
257  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
258  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
259  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
260  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
261  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
262  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
263  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
264  {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
265  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
266  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
267  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
268  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
269  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
270  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
271  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
272  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
273  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
274  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
275  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
276  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
277  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
278  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
279  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
280  {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
281  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
282  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
283  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
284  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
285  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
286  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
287  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
288  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
289  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
290  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
291  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
292  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
293  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
294  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
295  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
296};
297
298const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
299   0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
300   0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
301   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
302   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
303   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
304  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
305   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
306  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
307   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
308  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
309  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
310  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
311   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
312  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
313  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
314  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
315   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
316  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
317  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
318  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
319  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
320  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
321  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
322  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
323  127,
324   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
325  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
326  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
327  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
328  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
329  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
330  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
331  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
332};
333
334float VP8LFastSLog2Slow(int v) {
335  assert(v >= LOG_LOOKUP_IDX_MAX);
336  if (v < APPROX_LOG_MAX) {
337    int log_cnt = 0;
338    const float v_f = (float)v;
339    while (v >= LOG_LOOKUP_IDX_MAX) {
340      ++log_cnt;
341      v = v >> 1;
342    }
343    return v_f * (kLog2Table[v] + log_cnt);
344  } else {
345    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
346  }
347}
348
349float VP8LFastLog2Slow(int v) {
350  assert(v >= LOG_LOOKUP_IDX_MAX);
351  if (v < APPROX_LOG_MAX) {
352    int log_cnt = 0;
353    while (v >= LOG_LOOKUP_IDX_MAX) {
354      ++log_cnt;
355      v = v >> 1;
356    }
357    return kLog2Table[v] + log_cnt;
358  } else {
359    return (float)(LOG_2_RECIPROCAL * log((double)v));
360  }
361}
362
363//------------------------------------------------------------------------------
364// Image transforms.
365
366// In-place sum of each component with mod 256.
367static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
368  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
369  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
370  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
371}
372
373static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
374  return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
375}
376
377static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
378  return Average2(Average2(a0, a2), a1);
379}
380
381static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
382                                     uint32_t a2, uint32_t a3) {
383  return Average2(Average2(a0, a1), Average2(a2, a3));
384}
385
386static WEBP_INLINE uint32_t Clip255(uint32_t a) {
387  if (a < 256) {
388    return a;
389  }
390  // return 0, when a is a negative integer.
391  // return 255, when a is positive.
392  return ~a >> 24;
393}
394
395static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
396  return Clip255(a + b - c);
397}
398
399static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
400                                                   uint32_t c2) {
401  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
402  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
403                                         (c1 >> 16) & 0xff,
404                                         (c2 >> 16) & 0xff);
405  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
406                                         (c1 >> 8) & 0xff,
407                                         (c2 >> 8) & 0xff);
408  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
409  return (a << 24) | (r << 16) | (g << 8) | b;
410}
411
412static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
413  return Clip255(a + (a - b) / 2);
414}
415
416static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
417                                                   uint32_t c2) {
418  const uint32_t ave = Average2(c0, c1);
419  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
420  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
421  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
422  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
423  return (a << 24) | (r << 16) | (g << 8) | b;
424}
425
426static WEBP_INLINE int Sub3(int a, int b, int c) {
427  const int pb = b - c;
428  const int pa = a - c;
429  return abs(pb) - abs(pa);
430}
431
432static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
433  const int pa_minus_pb =
434      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
435      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
436      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
437      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
438  return (pa_minus_pb <= 0) ? a : b;
439}
440
441//------------------------------------------------------------------------------
442// Predictors
443
444static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
445  (void)top;
446  (void)left;
447  return ARGB_BLACK;
448}
449static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
450  (void)top;
451  return left;
452}
453static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
454  (void)left;
455  return top[0];
456}
457static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
458  (void)left;
459  return top[1];
460}
461static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
462  (void)left;
463  return top[-1];
464}
465static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
466  const uint32_t pred = Average3(left, top[0], top[1]);
467  return pred;
468}
469static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
470  const uint32_t pred = Average2(left, top[-1]);
471  return pred;
472}
473static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
474  const uint32_t pred = Average2(left, top[0]);
475  return pred;
476}
477static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
478  const uint32_t pred = Average2(top[-1], top[0]);
479  (void)left;
480  return pred;
481}
482static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
483  const uint32_t pred = Average2(top[0], top[1]);
484  (void)left;
485  return pred;
486}
487static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
488  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
489  return pred;
490}
491static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
492  const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
493  return pred;
494}
495static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
496  const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
497  return pred;
498}
499static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
500  const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
501  return pred;
502}
503
504// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
505typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
506static const PredictorFunc kPredictors[16] = {
507  Predictor0, Predictor1, Predictor2, Predictor3,
508  Predictor4, Predictor5, Predictor6, Predictor7,
509  Predictor8, Predictor9, Predictor10, Predictor11,
510  Predictor12, Predictor13,
511  Predictor0, Predictor0    // <- padding security sentinels
512};
513
514// TODO(vikasa): Replace 256 etc with defines.
515static float PredictionCostSpatial(const int* counts,
516                                   int weight_0, double exp_val) {
517  const int significant_symbols = 16;
518  const double exp_decay_factor = 0.6;
519  double bits = weight_0 * counts[0];
520  int i;
521  for (i = 1; i < significant_symbols; ++i) {
522    bits += exp_val * (counts[i] + counts[256 - i]);
523    exp_val *= exp_decay_factor;
524  }
525  return (float)(-0.1 * bits);
526}
527
528// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
529static float CombinedShannonEntropy(const int* const X,
530                                    const int* const Y, int n) {
531  int i;
532  double retval = 0.;
533  int sumX = 0, sumXY = 0;
534  for (i = 0; i < n; ++i) {
535    const int x = X[i];
536    const int xy = X[i] + Y[i];
537    if (x != 0) {
538      sumX += x;
539      retval -= VP8LFastSLog2(x);
540    }
541    if (xy != 0) {
542      sumXY += xy;
543      retval -= VP8LFastSLog2(xy);
544    }
545  }
546  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
547  return (float)retval;
548}
549
550static float PredictionCostSpatialHistogram(int accumulated[4][256],
551                                            int tile[4][256]) {
552  int i;
553  double retval = 0;
554  for (i = 0; i < 4; ++i) {
555    const double kExpValue = 0.94;
556    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
557    retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
558  }
559  return (float)retval;
560}
561
562static int GetBestPredictorForTile(int width, int height,
563                                   int tile_x, int tile_y, int bits,
564                                   int accumulated[4][256],
565                                   const uint32_t* const argb_scratch) {
566  const int kNumPredModes = 14;
567  const int col_start = tile_x << bits;
568  const int row_start = tile_y << bits;
569  const int tile_size = 1 << bits;
570  const int ymax = (tile_size <= height - row_start) ?
571      tile_size : height - row_start;
572  const int xmax = (tile_size <= width - col_start) ?
573      tile_size : width - col_start;
574  int histo[4][256];
575  float best_diff = MAX_DIFF_COST;
576  int best_mode = 0;
577
578  int mode;
579  for (mode = 0; mode < kNumPredModes; ++mode) {
580    const uint32_t* current_row = argb_scratch;
581    const PredictorFunc pred_func = kPredictors[mode];
582    float cur_diff;
583    int y;
584    memset(&histo[0][0], 0, sizeof(histo));
585    for (y = 0; y < ymax; ++y) {
586      int x;
587      const int row = row_start + y;
588      const uint32_t* const upper_row = current_row;
589      current_row = upper_row + width;
590      for (x = 0; x < xmax; ++x) {
591        const int col = col_start + x;
592        uint32_t predict;
593        uint32_t predict_diff;
594        if (row == 0) {
595          predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
596        } else if (col == 0) {
597          predict = upper_row[col];  // Top.
598        } else {
599          predict = pred_func(current_row[col - 1], upper_row + col);
600        }
601        predict_diff = VP8LSubPixels(current_row[col], predict);
602        ++histo[0][predict_diff >> 24];
603        ++histo[1][((predict_diff >> 16) & 0xff)];
604        ++histo[2][((predict_diff >> 8) & 0xff)];
605        ++histo[3][(predict_diff & 0xff)];
606      }
607    }
608    cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
609    if (cur_diff < best_diff) {
610      best_diff = cur_diff;
611      best_mode = mode;
612    }
613  }
614
615  return best_mode;
616}
617
618static void CopyTileWithPrediction(int width, int height,
619                                   int tile_x, int tile_y, int bits, int mode,
620                                   const uint32_t* const argb_scratch,
621                                   uint32_t* const argb) {
622  const int col_start = tile_x << bits;
623  const int row_start = tile_y << bits;
624  const int tile_size = 1 << bits;
625  const int ymax = (tile_size <= height - row_start) ?
626      tile_size : height - row_start;
627  const int xmax = (tile_size <= width - col_start) ?
628      tile_size : width - col_start;
629  const PredictorFunc pred_func = kPredictors[mode];
630  const uint32_t* current_row = argb_scratch;
631
632  int y;
633  for (y = 0; y < ymax; ++y) {
634    int x;
635    const int row = row_start + y;
636    const uint32_t* const upper_row = current_row;
637    current_row = upper_row + width;
638    for (x = 0; x < xmax; ++x) {
639      const int col = col_start + x;
640      const int pix = row * width + col;
641      uint32_t predict;
642      if (row == 0) {
643        predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
644      } else if (col == 0) {
645        predict = upper_row[col];  // Top.
646      } else {
647        predict = pred_func(current_row[col - 1], upper_row + col);
648      }
649      argb[pix] = VP8LSubPixels(current_row[col], predict);
650    }
651  }
652}
653
654void VP8LResidualImage(int width, int height, int bits,
655                       uint32_t* const argb, uint32_t* const argb_scratch,
656                       uint32_t* const image) {
657  const int max_tile_size = 1 << bits;
658  const int tiles_per_row = VP8LSubSampleSize(width, bits);
659  const int tiles_per_col = VP8LSubSampleSize(height, bits);
660  uint32_t* const upper_row = argb_scratch;
661  uint32_t* const current_tile_rows = argb_scratch + width;
662  int tile_y;
663  int histo[4][256];
664  memset(histo, 0, sizeof(histo));
665  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
666    const int tile_y_offset = tile_y * max_tile_size;
667    const int this_tile_height =
668        (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
669    int tile_x;
670    if (tile_y > 0) {
671      memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
672             width * sizeof(*upper_row));
673    }
674    memcpy(current_tile_rows, &argb[tile_y_offset * width],
675           this_tile_height * width * sizeof(*current_tile_rows));
676    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
677      int pred;
678      int y;
679      const int tile_x_offset = tile_x * max_tile_size;
680      int all_x_max = tile_x_offset + max_tile_size;
681      if (all_x_max > width) {
682        all_x_max = width;
683      }
684      pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
685                                     argb_scratch);
686      image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
687      CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
688                             argb_scratch, argb);
689      for (y = 0; y < max_tile_size; ++y) {
690        int ix;
691        int all_x;
692        int all_y = tile_y_offset + y;
693        if (all_y >= height) {
694          break;
695        }
696        ix = all_y * width + tile_x_offset;
697        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
698          const uint32_t a = argb[ix];
699          ++histo[0][a >> 24];
700          ++histo[1][((a >> 16) & 0xff)];
701          ++histo[2][((a >> 8) & 0xff)];
702          ++histo[3][(a & 0xff)];
703        }
704      }
705    }
706  }
707}
708
709// Inverse prediction.
710static void PredictorInverseTransform(const VP8LTransform* const transform,
711                                      int y_start, int y_end, uint32_t* data) {
712  const int width = transform->xsize_;
713  if (y_start == 0) {  // First Row follows the L (mode=1) mode.
714    int x;
715    const uint32_t pred0 = Predictor0(data[-1], NULL);
716    AddPixelsEq(data, pred0);
717    for (x = 1; x < width; ++x) {
718      const uint32_t pred1 = Predictor1(data[x - 1], NULL);
719      AddPixelsEq(data + x, pred1);
720    }
721    data += width;
722    ++y_start;
723  }
724
725  {
726    int y = y_start;
727    const int mask = (1 << transform->bits_) - 1;
728    const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
729    const uint32_t* pred_mode_base =
730        transform->data_ + (y >> transform->bits_) * tiles_per_row;
731
732    while (y < y_end) {
733      int x;
734      const uint32_t pred2 = Predictor2(data[-1], data - width);
735      const uint32_t* pred_mode_src = pred_mode_base;
736      PredictorFunc pred_func;
737
738      // First pixel follows the T (mode=2) mode.
739      AddPixelsEq(data, pred2);
740
741      // .. the rest:
742      pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
743      for (x = 1; x < width; ++x) {
744        uint32_t pred;
745        if ((x & mask) == 0) {    // start of tile. Read predictor function.
746          pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
747        }
748        pred = pred_func(data[x - 1], data + x - width);
749        AddPixelsEq(data + x, pred);
750      }
751      data += width;
752      ++y;
753      if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
754        pred_mode_base += tiles_per_row;
755      }
756    }
757  }
758}
759
760static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
761  int i = 0;
762  for (; i < num_pixs; ++i) {
763    const uint32_t argb = argb_data[i];
764    const uint32_t green = (argb >> 8) & 0xff;
765    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
766    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
767    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
768  }
769}
770
771// Add green to blue and red channels (i.e. perform the inverse transform of
772// 'subtract green').
773static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
774  while (data < data_end) {
775    const uint32_t argb = *data;
776    const uint32_t green = ((argb >> 8) & 0xff);
777    uint32_t red_blue = (argb & 0x00ff00ffu);
778    red_blue += (green << 16) | green;
779    red_blue &= 0x00ff00ffu;
780    *data++ = (argb & 0xff00ff00u) | red_blue;
781  }
782}
783
784typedef struct {
785  // Note: the members are uint8_t, so that any negative values are
786  // automatically converted to "mod 256" values.
787  uint8_t green_to_red_;
788  uint8_t green_to_blue_;
789  uint8_t red_to_blue_;
790} Multipliers;
791
792static WEBP_INLINE void MultipliersClear(Multipliers* m) {
793  m->green_to_red_ = 0;
794  m->green_to_blue_ = 0;
795  m->red_to_blue_ = 0;
796}
797
798static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
799                                                int8_t color) {
800  return (uint32_t)((int)(color_pred) * color) >> 5;
801}
802
803static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
804                                               Multipliers* const m) {
805  m->green_to_red_  = (color_code >>  0) & 0xff;
806  m->green_to_blue_ = (color_code >>  8) & 0xff;
807  m->red_to_blue_   = (color_code >> 16) & 0xff;
808}
809
810static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
811  return 0xff000000u |
812         ((uint32_t)(m->red_to_blue_) << 16) |
813         ((uint32_t)(m->green_to_blue_) << 8) |
814         m->green_to_red_;
815}
816
817static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
818                                           uint32_t argb, int inverse) {
819  const uint32_t green = argb >> 8;
820  const uint32_t red = argb >> 16;
821  uint32_t new_red = red;
822  uint32_t new_blue = argb;
823
824  if (inverse) {
825    new_red += ColorTransformDelta(m->green_to_red_, green);
826    new_red &= 0xff;
827    new_blue += ColorTransformDelta(m->green_to_blue_, green);
828    new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
829    new_blue &= 0xff;
830  } else {
831    new_red -= ColorTransformDelta(m->green_to_red_, green);
832    new_red &= 0xff;
833    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
834    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
835    new_blue &= 0xff;
836  }
837  return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
838}
839
840static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
841                                             uint32_t argb) {
842  const uint32_t green = argb >> 8;
843  uint32_t new_red = argb >> 16;
844  new_red -= ColorTransformDelta(green_to_red, green);
845  return (new_red & 0xff);
846}
847
848static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
849                                              uint8_t red_to_blue,
850                                              uint32_t argb) {
851  const uint32_t green = argb >> 8;
852  const uint32_t red = argb >> 16;
853  uint8_t new_blue = argb;
854  new_blue -= ColorTransformDelta(green_to_blue, green);
855  new_blue -= ColorTransformDelta(red_to_blue, red);
856  return (new_blue & 0xff);
857}
858
859static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
860                                          int ix, int xsize) {
861  const uint32_t v = argb[ix];
862  if (ix >= xsize + 3) {
863    if (v == argb[ix - xsize] &&
864        argb[ix - 1] == argb[ix - xsize - 1] &&
865        argb[ix - 2] == argb[ix - xsize - 2] &&
866        argb[ix - 3] == argb[ix - xsize - 3]) {
867      return 1;
868    }
869    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
870  } else if (ix >= 3) {
871    return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
872  }
873  return 0;
874}
875
876static float PredictionCostCrossColor(const int accumulated[256],
877                                      const int counts[256]) {
878  // Favor low entropy, locally and globally.
879  // Favor small absolute values for PredictionCostSpatial
880  static const double kExpValue = 2.4;
881  return CombinedShannonEntropy(counts, accumulated, 256) +
882         PredictionCostSpatial(counts, 3, kExpValue);
883}
884
885static Multipliers GetBestColorTransformForTile(
886    int tile_x, int tile_y, int bits,
887    Multipliers prevX,
888    Multipliers prevY,
889    int step, int xsize, int ysize,
890    int* accumulated_red_histo,
891    int* accumulated_blue_histo,
892    const uint32_t* const argb) {
893  float best_diff = MAX_DIFF_COST;
894  float cur_diff;
895  const int halfstep = step / 2;
896  const int max_tile_size = 1 << bits;
897  const int tile_y_offset = tile_y * max_tile_size;
898  const int tile_x_offset = tile_x * max_tile_size;
899  int green_to_red;
900  int green_to_blue;
901  int red_to_blue;
902  int all_x_max = tile_x_offset + max_tile_size;
903  int all_y_max = tile_y_offset + max_tile_size;
904  Multipliers best_tx;
905  MultipliersClear(&best_tx);
906  if (all_x_max > xsize) {
907    all_x_max = xsize;
908  }
909  if (all_y_max > ysize) {
910    all_y_max = ysize;
911  }
912
913  for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
914    int histo[256] = { 0 };
915    int all_y;
916
917    for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
918      int ix = all_y * xsize + tile_x_offset;
919      int all_x;
920      for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
921        if (SkipRepeatedPixels(argb, ix, xsize)) {
922          continue;
923        }
924        ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
925      }
926    }
927    cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
928    if ((uint8_t)green_to_red == prevX.green_to_red_) {
929      cur_diff -= 3;  // favor keeping the areas locally similar
930    }
931    if ((uint8_t)green_to_red == prevY.green_to_red_) {
932      cur_diff -= 3;  // favor keeping the areas locally similar
933    }
934    if (green_to_red == 0) {
935      cur_diff -= 3;
936    }
937    if (cur_diff < best_diff) {
938      best_diff = cur_diff;
939      best_tx.green_to_red_ = green_to_red;
940    }
941  }
942  best_diff = MAX_DIFF_COST;
943  for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
944    for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
945      int all_y;
946      int histo[256] = { 0 };
947      for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
948        int all_x;
949        int ix = all_y * xsize + tile_x_offset;
950        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
951          if (SkipRepeatedPixels(argb, ix, xsize)) {
952            continue;
953          }
954          ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
955        }
956      }
957      cur_diff =
958          PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
959      if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
960        cur_diff -= 3;  // favor keeping the areas locally similar
961      }
962      if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
963        cur_diff -= 3;  // favor keeping the areas locally similar
964      }
965      if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
966        cur_diff -= 3;  // favor keeping the areas locally similar
967      }
968      if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
969        cur_diff -= 3;  // favor keeping the areas locally similar
970      }
971      if (green_to_blue == 0) {
972        cur_diff -= 3;
973      }
974      if (red_to_blue == 0) {
975        cur_diff -= 3;
976      }
977      if (cur_diff < best_diff) {
978        best_diff = cur_diff;
979        best_tx.green_to_blue_ = green_to_blue;
980        best_tx.red_to_blue_ = red_to_blue;
981      }
982    }
983  }
984  return best_tx;
985}
986
987static void CopyTileWithColorTransform(int xsize, int ysize,
988                                       int tile_x, int tile_y, int bits,
989                                       Multipliers color_transform,
990                                       uint32_t* const argb) {
991  int y;
992  int xscan = 1 << bits;
993  int yscan = 1 << bits;
994  tile_x <<= bits;
995  tile_y <<= bits;
996  if (xscan > xsize - tile_x) {
997    xscan = xsize - tile_x;
998  }
999  if (yscan > ysize - tile_y) {
1000    yscan = ysize - tile_y;
1001  }
1002  yscan += tile_y;
1003  for (y = tile_y; y < yscan; ++y) {
1004    int ix = y * xsize + tile_x;
1005    const int end_ix = ix + xscan;
1006    for (; ix < end_ix; ++ix) {
1007      argb[ix] = TransformColor(&color_transform, argb[ix], 0);
1008    }
1009  }
1010}
1011
1012void VP8LColorSpaceTransform(int width, int height, int bits, int step,
1013                             uint32_t* const argb, uint32_t* image) {
1014  const int max_tile_size = 1 << bits;
1015  int tile_xsize = VP8LSubSampleSize(width, bits);
1016  int tile_ysize = VP8LSubSampleSize(height, bits);
1017  int accumulated_red_histo[256] = { 0 };
1018  int accumulated_blue_histo[256] = { 0 };
1019  int tile_y;
1020  int tile_x;
1021  Multipliers prevX;
1022  Multipliers prevY;
1023  MultipliersClear(&prevY);
1024  MultipliersClear(&prevX);
1025  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
1026    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
1027      Multipliers color_transform;
1028      int all_x_max;
1029      int y;
1030      const int tile_y_offset = tile_y * max_tile_size;
1031      const int tile_x_offset = tile_x * max_tile_size;
1032      if (tile_y != 0) {
1033        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
1034        ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
1035                               &prevY);
1036      } else if (tile_x != 0) {
1037        ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
1038      }
1039      color_transform =
1040          GetBestColorTransformForTile(tile_x, tile_y, bits,
1041                                       prevX, prevY,
1042                                       step, width, height,
1043                                       &accumulated_red_histo[0],
1044                                       &accumulated_blue_histo[0],
1045                                       argb);
1046      image[tile_y * tile_xsize + tile_x] =
1047          MultipliersToColorCode(&color_transform);
1048      CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
1049                                 color_transform, argb);
1050
1051      // Gather accumulated histogram data.
1052      all_x_max = tile_x_offset + max_tile_size;
1053      if (all_x_max > width) {
1054        all_x_max = width;
1055      }
1056      for (y = 0; y < max_tile_size; ++y) {
1057        int ix;
1058        int all_x;
1059        int all_y = tile_y_offset + y;
1060        if (all_y >= height) {
1061          break;
1062        }
1063        ix = all_y * width + tile_x_offset;
1064        for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
1065          if (ix >= 2 &&
1066              argb[ix] == argb[ix - 2] &&
1067              argb[ix] == argb[ix - 1]) {
1068            continue;  // repeated pixels are handled by backward references
1069          }
1070          if (ix >= width + 2 &&
1071              argb[ix - 2] == argb[ix - width - 2] &&
1072              argb[ix - 1] == argb[ix - width - 1] &&
1073              argb[ix] == argb[ix - width]) {
1074            continue;  // repeated pixels are handled by backward references
1075          }
1076          ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
1077          ++accumulated_blue_histo[argb[ix] & 0xff];
1078        }
1079      }
1080    }
1081  }
1082}
1083
1084// Color space inverse transform.
1085static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
1086                                       int y_start, int y_end, uint32_t* data) {
1087  const int width = transform->xsize_;
1088  const int mask = (1 << transform->bits_) - 1;
1089  const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
1090  int y = y_start;
1091  const uint32_t* pred_row =
1092      transform->data_ + (y >> transform->bits_) * tiles_per_row;
1093
1094  while (y < y_end) {
1095    const uint32_t* pred = pred_row;
1096    Multipliers m = { 0, 0, 0 };
1097    int x;
1098
1099    for (x = 0; x < width; ++x) {
1100      if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
1101      data[x] = TransformColor(&m, data[x], 1);
1102    }
1103    data += width;
1104    ++y;
1105    if ((y & mask) == 0) pred_row += tiles_per_row;;
1106  }
1107}
1108
1109// Separate out pixels packed together using pixel-bundling.
1110// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
1111#define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)             \
1112void FUNC_NAME(const VP8LTransform* const transform,                           \
1113               int y_start, int y_end, const TYPE* src, TYPE* dst) {           \
1114  int y;                                                                       \
1115  const int bits_per_pixel = 8 >> transform->bits_;                            \
1116  const int width = transform->xsize_;                                         \
1117  const uint32_t* const color_map = transform->data_;                          \
1118  if (bits_per_pixel < 8) {                                                    \
1119    const int pixels_per_byte = 1 << transform->bits_;                         \
1120    const int count_mask = pixels_per_byte - 1;                                \
1121    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
1122    for (y = y_start; y < y_end; ++y) {                                        \
1123      uint32_t packed_pixels = 0;                                              \
1124      int x;                                                                   \
1125      for (x = 0; x < width; ++x) {                                            \
1126        /* We need to load fresh 'packed_pixels' once every                */  \
1127        /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
1128        /* is a power of 2, so can just use a mask for that, instead of    */  \
1129        /* decrementing a counter.                                         */  \
1130        if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
1131        *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
1132        packed_pixels >>= bits_per_pixel;                                      \
1133      }                                                                        \
1134    }                                                                          \
1135  } else {                                                                     \
1136    for (y = y_start; y < y_end; ++y) {                                        \
1137      int x;                                                                   \
1138      for (x = 0; x < width; ++x) {                                            \
1139        *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                      \
1140      }                                                                        \
1141    }                                                                          \
1142  }                                                                            \
1143}
1144
1145static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
1146  return (idx >> 8) & 0xff;
1147}
1148
1149static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
1150  return idx;
1151}
1152
1153static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
1154  return val;
1155}
1156
1157static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
1158  return (val >> 8) & 0xff;
1159}
1160
1161static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
1162                           GetARGBValue)
1163COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
1164                    GetAlphaValue)
1165
1166#undef COLOR_INDEX_INVERSE
1167
1168void VP8LInverseTransform(const VP8LTransform* const transform,
1169                          int row_start, int row_end,
1170                          const uint32_t* const in, uint32_t* const out) {
1171  const int width = transform->xsize_;
1172  assert(row_start < row_end);
1173  assert(row_end <= transform->ysize_);
1174  switch (transform->type_) {
1175    case SUBTRACT_GREEN:
1176      VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
1177      break;
1178    case PREDICTOR_TRANSFORM:
1179      PredictorInverseTransform(transform, row_start, row_end, out);
1180      if (row_end != transform->ysize_) {
1181        // The last predicted row in this iteration will be the top-pred row
1182        // for the first row in next iteration.
1183        memcpy(out - width, out + (row_end - row_start - 1) * width,
1184               width * sizeof(*out));
1185      }
1186      break;
1187    case CROSS_COLOR_TRANSFORM:
1188      ColorSpaceInverseTransform(transform, row_start, row_end, out);
1189      break;
1190    case COLOR_INDEXING_TRANSFORM:
1191      if (in == out && transform->bits_ > 0) {
1192        // Move packed pixels to the end of unpacked region, so that unpacking
1193        // can occur seamlessly.
1194        // Also, note that this is the only transform that applies on
1195        // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
1196        // transforms work on effective width of xsize_.
1197        const int out_stride = (row_end - row_start) * width;
1198        const int in_stride = (row_end - row_start) *
1199            VP8LSubSampleSize(transform->xsize_, transform->bits_);
1200        uint32_t* const src = out + out_stride - in_stride;
1201        memmove(src, out, in_stride * sizeof(*src));
1202        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
1203      } else {
1204        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
1205      }
1206      break;
1207  }
1208}
1209
1210//------------------------------------------------------------------------------
1211// Color space conversion.
1212
1213static int is_big_endian(void) {
1214  static const union {
1215    uint16_t w;
1216    uint8_t b[2];
1217  } tmp = { 1 };
1218  return (tmp.b[0] != 1);
1219}
1220
1221static void ConvertBGRAToRGB(const uint32_t* src,
1222                             int num_pixels, uint8_t* dst) {
1223  const uint32_t* const src_end = src + num_pixels;
1224  while (src < src_end) {
1225    const uint32_t argb = *src++;
1226    *dst++ = (argb >> 16) & 0xff;
1227    *dst++ = (argb >>  8) & 0xff;
1228    *dst++ = (argb >>  0) & 0xff;
1229  }
1230}
1231
1232static void ConvertBGRAToRGBA(const uint32_t* src,
1233                              int num_pixels, uint8_t* dst) {
1234  const uint32_t* const src_end = src + num_pixels;
1235  while (src < src_end) {
1236    const uint32_t argb = *src++;
1237    *dst++ = (argb >> 16) & 0xff;
1238    *dst++ = (argb >>  8) & 0xff;
1239    *dst++ = (argb >>  0) & 0xff;
1240    *dst++ = (argb >> 24) & 0xff;
1241  }
1242}
1243
1244static void ConvertBGRAToRGBA4444(const uint32_t* src,
1245                                  int num_pixels, uint8_t* dst) {
1246  const uint32_t* const src_end = src + num_pixels;
1247  while (src < src_end) {
1248    const uint32_t argb = *src++;
1249    const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
1250    const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
1251#ifdef WEBP_SWAP_16BIT_CSP
1252    *dst++ = ba;
1253    *dst++ = rg;
1254#else
1255    *dst++ = rg;
1256    *dst++ = ba;
1257#endif
1258  }
1259}
1260
1261static void ConvertBGRAToRGB565(const uint32_t* src,
1262                                int num_pixels, uint8_t* dst) {
1263  const uint32_t* const src_end = src + num_pixels;
1264  while (src < src_end) {
1265    const uint32_t argb = *src++;
1266    const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
1267    const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
1268#ifdef WEBP_SWAP_16BIT_CSP
1269    *dst++ = gb;
1270    *dst++ = rg;
1271#else
1272    *dst++ = rg;
1273    *dst++ = gb;
1274#endif
1275  }
1276}
1277
1278static void ConvertBGRAToBGR(const uint32_t* src,
1279                             int num_pixels, uint8_t* dst) {
1280  const uint32_t* const src_end = src + num_pixels;
1281  while (src < src_end) {
1282    const uint32_t argb = *src++;
1283    *dst++ = (argb >>  0) & 0xff;
1284    *dst++ = (argb >>  8) & 0xff;
1285    *dst++ = (argb >> 16) & 0xff;
1286  }
1287}
1288
1289static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
1290                       int swap_on_big_endian) {
1291  if (is_big_endian() == swap_on_big_endian) {
1292    const uint32_t* const src_end = src + num_pixels;
1293    while (src < src_end) {
1294      uint32_t argb = *src++;
1295
1296#if !defined(__BIG_ENDIAN__)
1297#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
1298#if defined(__i386__) || defined(__x86_64__)
1299      __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
1300      *(uint32_t*)dst = argb;
1301#elif defined(_MSC_VER)
1302      argb = _byteswap_ulong(argb);
1303      *(uint32_t*)dst = argb;
1304#else
1305      dst[0] = (argb >> 24) & 0xff;
1306      dst[1] = (argb >> 16) & 0xff;
1307      dst[2] = (argb >>  8) & 0xff;
1308      dst[3] = (argb >>  0) & 0xff;
1309#endif
1310#else  // WEBP_REFERENCE_IMPLEMENTATION
1311      dst[0] = (argb >> 24) & 0xff;
1312      dst[1] = (argb >> 16) & 0xff;
1313      dst[2] = (argb >>  8) & 0xff;
1314      dst[3] = (argb >>  0) & 0xff;
1315#endif
1316#else  // __BIG_ENDIAN__
1317      dst[0] = (argb >>  0) & 0xff;
1318      dst[1] = (argb >>  8) & 0xff;
1319      dst[2] = (argb >> 16) & 0xff;
1320      dst[3] = (argb >> 24) & 0xff;
1321#endif
1322      dst += sizeof(argb);
1323    }
1324  } else {
1325    memcpy(dst, src, num_pixels * sizeof(*src));
1326  }
1327}
1328
1329void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
1330                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
1331  switch (out_colorspace) {
1332    case MODE_RGB:
1333      ConvertBGRAToRGB(in_data, num_pixels, rgba);
1334      break;
1335    case MODE_RGBA:
1336      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
1337      break;
1338    case MODE_rgbA:
1339      ConvertBGRAToRGBA(in_data, num_pixels, rgba);
1340      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1341      break;
1342    case MODE_BGR:
1343      ConvertBGRAToBGR(in_data, num_pixels, rgba);
1344      break;
1345    case MODE_BGRA:
1346      CopyOrSwap(in_data, num_pixels, rgba, 1);
1347      break;
1348    case MODE_bgrA:
1349      CopyOrSwap(in_data, num_pixels, rgba, 1);
1350      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1351      break;
1352    case MODE_ARGB:
1353      CopyOrSwap(in_data, num_pixels, rgba, 0);
1354      break;
1355    case MODE_Argb:
1356      CopyOrSwap(in_data, num_pixels, rgba, 0);
1357      WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
1358      break;
1359    case MODE_RGBA_4444:
1360      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1361      break;
1362    case MODE_rgbA_4444:
1363      ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1364      WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
1365      break;
1366    case MODE_RGB_565:
1367      ConvertBGRAToRGB565(in_data, num_pixels, rgba);
1368      break;
1369    default:
1370      assert(0);          // Code flow should not reach here.
1371  }
1372}
1373
1374// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
1375void VP8LBundleColorMap(const uint8_t* const row, int width,
1376                        int xbits, uint32_t* const dst) {
1377  int x;
1378  if (xbits > 0) {
1379    const int bit_depth = 1 << (3 - xbits);
1380    const int mask = (1 << xbits) - 1;
1381    uint32_t code = 0xff000000;
1382    for (x = 0; x < width; ++x) {
1383      const int xsub = x & mask;
1384      if (xsub == 0) {
1385        code = 0xff000000;
1386      }
1387      code |= row[x] << (8 + bit_depth * xsub);
1388      dst[x >> xbits] = code;
1389    }
1390  } else {
1391    for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
1392  }
1393}
1394
1395//------------------------------------------------------------------------------
1396
1397// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
1398// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
1399#if defined(WEBP_USE_SSE2)
1400static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
1401                                                       uint32_t c2) {
1402  const __m128i zero = _mm_setzero_si128();
1403  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
1404  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
1405  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
1406  const __m128i V1 = _mm_add_epi16(C0, C1);
1407  const __m128i V2 = _mm_sub_epi16(V1, C2);
1408  const __m128i b = _mm_packus_epi16(V2, V2);
1409  const uint32_t output = _mm_cvtsi128_si32(b);
1410  return output;
1411}
1412
1413static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
1414                                                       uint32_t c2) {
1415  const uint32_t ave = Average2(c0, c1);
1416  const __m128i zero = _mm_setzero_si128();
1417  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
1418  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
1419  const __m128i A1 = _mm_sub_epi16(A0, B0);
1420  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
1421  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
1422  const __m128i A3 = _mm_srai_epi16(A2, 1);
1423  const __m128i A4 = _mm_add_epi16(A0, A3);
1424  const __m128i A5 = _mm_packus_epi16(A4, A4);
1425  const uint32_t output = _mm_cvtsi128_si32(A5);
1426  return output;
1427}
1428
1429static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
1430  int pa_minus_pb;
1431  const __m128i zero = _mm_setzero_si128();
1432  const __m128i A0 = _mm_cvtsi32_si128(a);
1433  const __m128i B0 = _mm_cvtsi32_si128(b);
1434  const __m128i C0 = _mm_cvtsi32_si128(c);
1435  const __m128i AC0 = _mm_subs_epu8(A0, C0);
1436  const __m128i CA0 = _mm_subs_epu8(C0, A0);
1437  const __m128i BC0 = _mm_subs_epu8(B0, C0);
1438  const __m128i CB0 = _mm_subs_epu8(C0, B0);
1439  const __m128i AC = _mm_or_si128(AC0, CA0);
1440  const __m128i BC = _mm_or_si128(BC0, CB0);
1441  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
1442  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
1443  const __m128i diff = _mm_sub_epi16(pb, pa);
1444  {
1445    int16_t out[8];
1446    _mm_storeu_si128((__m128i*)out, diff);
1447    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
1448  }
1449  return (pa_minus_pb <= 0) ? a : b;
1450}
1451
1452static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
1453  int i = 0;
1454  const __m128i mask = _mm_set1_epi32(0x0000ff00);
1455  for (; i + 4 < num_pixs; i += 4) {
1456    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
1457    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
1458    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
1459    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
1460    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
1461    const __m128i out = _mm_sub_epi8(in, in_0g0g);
1462    _mm_storeu_si128((__m128i*)&argb_data[i], out);
1463  }
1464  // fallthrough and finish off with plain-C
1465  for (; i < num_pixs; ++i) {
1466    const uint32_t argb = argb_data[i];
1467    const uint32_t green = (argb >> 8) & 0xff;
1468    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
1469    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
1470    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
1471  }
1472}
1473
1474static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
1475  const __m128i mask = _mm_set1_epi32(0x0000ff00);
1476  for (; data + 4 < data_end; data += 4) {
1477    const __m128i in = _mm_loadu_si128((__m128i*)data);
1478    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
1479    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
1480    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
1481    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
1482    const __m128i out = _mm_add_epi8(in, in_0g0g);
1483    _mm_storeu_si128((__m128i*)data, out);
1484  }
1485  // fallthrough and finish off with plain-C
1486  while (data < data_end) {
1487    const uint32_t argb = *data;
1488    const uint32_t green = ((argb >> 8) & 0xff);
1489    uint32_t red_blue = (argb & 0x00ff00ffu);
1490    red_blue += (green << 16) | green;
1491    red_blue &= 0x00ff00ffu;
1492    *data++ = (argb & 0xff00ff00u) | red_blue;
1493  }
1494}
1495
1496extern void VP8LDspInitSSE2(void);
1497
1498void VP8LDspInitSSE2(void) {
1499  VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
1500  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
1501  VP8LSelect = SelectSSE2;
1502  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
1503  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
1504}
1505#endif
1506//------------------------------------------------------------------------------
1507
1508VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
1509VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
1510VP8LPredSelectFunc VP8LSelect;
1511VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
1512VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
1513
1514void VP8LDspInit(void) {
1515  VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
1516  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
1517  VP8LSelect = Select;
1518  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
1519  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
1520
1521  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
1522  if (VP8GetCPUInfo != NULL) {
1523#if defined(WEBP_USE_SSE2)
1524    if (VP8GetCPUInfo(kSSE2)) {
1525      VP8LDspInitSSE2();
1526    }
1527#endif
1528  }
1529}
1530
1531//------------------------------------------------------------------------------
1532
1533