1
2/* filter_sse2_intrinsics.c - SSE2 optimized filter functions
3 *
4 * Copyright (c) 2016 Google, Inc.
5 * Written by Mike Klein and Matt Sarett
6 * Derived from arm/filter_neon_intrinsics.c, which was
7 * Copyright (c) 2014,2016 Glenn Randers-Pehrson
8 *
9 * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
10 *
11 * This code is released under the libpng license.
12 * For conditions of distribution and use, see the disclaimer
13 * and license in png.h
14 */
15
16#include "../../pngpriv.h"
17
18#ifdef PNG_READ_SUPPORTED
19
20#if PNG_INTEL_SSE_IMPLEMENTATION > 0
21
#include <immintrin.h>
#include <string.h>
23
24/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
25 * They're positioned like this:
26 *    prev:  c b
27 *    row:   a d
28 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
29 * whichever of a, b, or c is closest to p=a+b-c.
30 */
31
static __m128i load4(const void* p) {
   /* Load 4 bytes into the low 32 bits of an XMM register.
    * memcpy replaces the previous *(const int*)p dereference: `p` points
    * into a byte buffer with no alignment guarantee, so reading it through
    * an int lvalue is undefined behavior (misaligned access plus a strict-
    * aliasing violation).  The compiler lowers this memcpy to a single
    * unaligned load.
    */
   int tmp;
   memcpy(&tmp, p, sizeof(tmp));
   return _mm_cvtsi32_si128(tmp);
}
35
static void store4(void* p, __m128i v) {
   /* Store the low 32 bits of an XMM register to `p`.
    * memcpy replaces the previous *(int*)p store: `p` points into a byte
    * buffer with no alignment guarantee, so writing through an int lvalue
    * is undefined behavior (misaligned access, strict aliasing).  The
    * compiler lowers this memcpy to a single unaligned store.
    */
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, sizeof(tmp));
}
39
40static __m128i load3(const void* p) {
41   /* We'll load 2 bytes, then 1 byte,
42    * then mask them together, and finally load into SSE.
43    */
44   const png_uint_16* p01 = p;
45   const png_byte*    p2  = (const png_byte*)(p01+1);
46
47   png_uint_32 v012 = (png_uint_32)(*p01)
48                    | (png_uint_32)(*p2) << 16;
49   return load4(&v012);
50}
51
static void store3(void* p, __m128i v) {
   /* Store the low 3 bytes of an XMM register to `p`, leaving the byte
    * after them untouched.  A single 3-byte memcpy replaces the earlier
    * png_uint_16 + png_byte pointer casts, which performed a potentially
    * misaligned, aliasing-violating 16-bit write.  Writing the low bytes
    * in memory order matches the old split store because SSE2 implies
    * x86, which is little-endian.
    */
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, 3);
}
64
65void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
66   png_const_bytep prev)
67{
68   /* The Sub filter predicts each pixel as the previous pixel, a.
69    * There is no pixel to the left of the first pixel.  It's encoded directly.
70    * That works with our main loop if we just say that left pixel was zero.
71    */
72   png_debug(1, "in png_read_filter_row_sub3_sse2");
73   __m128i a, d = _mm_setzero_si128();
74
75   int rb = row_info->rowbytes;
76   while (rb >= 4) {
77      a = d; d = load4(row);
78      d = _mm_add_epi8(d, a);
79      store3(row, d);
80
81      row += 3;
82      rb  -= 3;
83   }
84   if (rb > 0) {
85      a = d; d = load3(row);
86      d = _mm_add_epi8(d, a);
87      store3(row, d);
88
89      row += 3;
90      rb  -= 3;
91   }
92}
93
94void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
95   png_const_bytep prev)
96{
97   /* The Sub filter predicts each pixel as the previous pixel, a.
98    * There is no pixel to the left of the first pixel.  It's encoded directly.
99    * That works with our main loop if we just say that left pixel was zero.
100    */
101   png_debug(1, "in png_read_filter_row_sub4_sse2");
102   __m128i a, d = _mm_setzero_si128();
103
104   int rb = row_info->rowbytes;
105   while (rb > 0) {
106      a = d; d = load4(row);
107      d = _mm_add_epi8(d, a);
108      store4(row, d);
109
110      row += 4;
111      rb  -= 4;
112   }
113}
114
115void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
116   png_const_bytep prev)
117{
118   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
119    * There's no pixel to the left of the first pixel.  Luckily, it's
120    * predicted to be half of the pixel above it.  So again, this works
121    * perfectly with our loop if we make sure a starts at zero.
122    */
123   png_debug(1, "in png_read_filter_row_avg3_sse2");
124   const __m128i zero = _mm_setzero_si128();
125   __m128i    b;
126   __m128i a, d = zero;
127
128   int rb = row_info->rowbytes;
129   while (rb >= 4) {
130             b = load4(prev);
131      a = d; d = load4(row );
132
133      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
134      __m128i avg = _mm_avg_epu8(a,b);
135      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
136      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
137                                            _mm_set1_epi8(1)));
138      d = _mm_add_epi8(d, avg);
139      store3(row, d);
140
141      prev += 3;
142      row  += 3;
143      rb   -= 3;
144   }
145   if (rb > 0) {
146             b = load3(prev);
147      a = d; d = load3(row );
148
149      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
150      __m128i avg = _mm_avg_epu8(a,b);
151      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
152      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
153                                            _mm_set1_epi8(1)));
154
155      d = _mm_add_epi8(d, avg);
156      store3(row, d);
157
158      prev += 3;
159      row  += 3;
160      rb   -= 3;
161   }
162}
163
164void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
165   png_const_bytep prev)
166{
167   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
168    * There's no pixel to the left of the first pixel.  Luckily, it's
169    * predicted to be half of the pixel above it.  So again, this works
170    * perfectly with our loop if we make sure a starts at zero.
171    */
172   png_debug(1, "in png_read_filter_row_avg4_sse2");
173   const __m128i zero = _mm_setzero_si128();
174   __m128i    b;
175   __m128i a, d = zero;
176
177   int rb = row_info->rowbytes;
178   while (rb > 0) {
179             b = load4(prev);
180      a = d; d = load4(row );
181
182      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
183      __m128i avg = _mm_avg_epu8(a,b);
184      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
185      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
186                                            _mm_set1_epi8(1)));
187
188      d = _mm_add_epi8(d, avg);
189      store4(row, d);
190
191      prev += 4;
192      row  += 4;
193      rb   -= 4;
194   }
195}
196
/* Per-lane absolute value of signed 16-bit lanes. */
static __m128i abs_i16(__m128i x) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
   return _mm_abs_epi16(x);
#else
   /* SSE2 has no abs instruction, but max(x, -x) computes it: a
    * non-negative lane keeps x, a negative lane keeps -x.  INT16_MIN
    * negates to itself in two's complement, so that lane stays INT16_MIN
    * in both operands — the same result the flip-bits-and-add-one
    * formulation produces.
    */
   return _mm_max_epi16(x, _mm_sub_epi16(_mm_setzero_si128(), x));
#endif
}
215
/* Bitwise select: for each bit take t where c is set, else e.  Callers pass
 * lane-wise compare masks, making this a bytewise c ? t : e.
 */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
   return _mm_blendv_epi8(e,t,c);
#else
   /* e ^ (c & (t ^ e)): where a mask bit is 1 the two e's cancel and t
    * remains; where it is 0 the AND clears everything and e survives.
    * Bit-for-bit identical to the classic (c&t) | (~c&e) form.
    */
   return _mm_xor_si128(e, _mm_and_si128(c, _mm_xor_si128(t, e)));
#endif
}
224
225void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
226   png_const_bytep prev)
227{
228   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
229    * and two pixels from the previous row, b and c:
230    *   prev: c b
231    *   row:  a d
232    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
233    * p=a+b-c.
234    *
235    * The first pixel has no left context, and so uses an Up filter, p = b.
236    * This works naturally with our main loop's p = a+b-c if we force a and c
237    * to zero.
238    * Here we zero b and d, which become c and a respectively at the start of
239    * the loop.
240    */
241   png_debug(1, "in png_read_filter_row_paeth3_sse2");
242   const __m128i zero = _mm_setzero_si128();
243   __m128i c, b = zero,
244           a, d = zero;
245
246   int rb = row_info->rowbytes;
247   while (rb >= 4) {
248      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
249       * intermediates.
250       */
251      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
252      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
253
254      /* (p-a) == (a+b-c - a) == (b-c) */
255      __m128i pa = _mm_sub_epi16(b,c);
256
257      /* (p-b) == (a+b-c - b) == (a-c) */
258      __m128i pb = _mm_sub_epi16(a,c);
259
260      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
261      __m128i pc = _mm_add_epi16(pa,pb);
262
263      pa = abs_i16(pa);  /* |p-a| */
264      pb = abs_i16(pb);  /* |p-b| */
265      pc = abs_i16(pc);  /* |p-c| */
266
267      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
268
269      /* Paeth breaks ties favoring a over b over c. */
270      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
271                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
272                                                                     c));
273
274      /* Note `_epi8`: we need addition to wrap modulo 255. */
275      d = _mm_add_epi8(d, nearest);
276      store3(row, _mm_packus_epi16(d,d));
277
278      prev += 3;
279      row  += 3;
280      rb   -= 3;
281   }
282   if (rb > 0) {
283      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
284       * intermediates.
285       */
286      c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
287      a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
288
289      /* (p-a) == (a+b-c - a) == (b-c) */
290      __m128i pa = _mm_sub_epi16(b,c);
291
292      /* (p-b) == (a+b-c - b) == (a-c) */
293      __m128i pb = _mm_sub_epi16(a,c);
294
295      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
296      __m128i pc = _mm_add_epi16(pa,pb);
297
298      pa = abs_i16(pa);  /* |p-a| */
299      pb = abs_i16(pb);  /* |p-b| */
300      pc = abs_i16(pc);  /* |p-c| */
301
302      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
303
304      /* Paeth breaks ties favoring a over b over c. */
305      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
306                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
307                                                                     c));
308
309      /* Note `_epi8`: we need addition to wrap modulo 255. */
310      d = _mm_add_epi8(d, nearest);
311      store3(row, _mm_packus_epi16(d,d));
312
313      prev += 3;
314      row  += 3;
315      rb   -= 3;
316   }
317}
318
319void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
320   png_const_bytep prev)
321{
322   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
323    * and two pixels from the previous row, b and c:
324    *   prev: c b
325    *   row:  a d
326    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
327    * p=a+b-c.
328    *
329    * The first pixel has no left context, and so uses an Up filter, p = b.
330    * This works naturally with our main loop's p = a+b-c if we force a and c
331    * to zero.
332    * Here we zero b and d, which become c and a respectively at the start of
333    * the loop.
334    */
335   png_debug(1, "in png_read_filter_row_paeth4_sse2");
336   const __m128i zero = _mm_setzero_si128();
337   __m128i c, b = zero,
338           a, d = zero;
339
340   int rb = row_info->rowbytes;
341   while (rb > 0) {
342      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
343       * intermediates.
344       */
345      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
346      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
347
348      /* (p-a) == (a+b-c - a) == (b-c) */
349      __m128i pa = _mm_sub_epi16(b,c);
350
351      /* (p-b) == (a+b-c - b) == (a-c) */
352      __m128i pb = _mm_sub_epi16(a,c);
353
354      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
355      __m128i pc = _mm_add_epi16(pa,pb);
356
357      pa = abs_i16(pa);  /* |p-a| */
358      pb = abs_i16(pb);  /* |p-b| */
359      pc = abs_i16(pc);  /* |p-c| */
360
361      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
362
363      /* Paeth breaks ties favoring a over b over c. */
364      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
365                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
366                                                                     c));
367
368      /* Note `_epi8`: we need addition to wrap modulo 255. */
369      d = _mm_add_epi8(d, nearest);
370      store4(row, _mm_packus_epi16(d,d));
371
372      prev += 4;
373      row  += 4;
374      rb   -= 4;
375   }
376}
377
378#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
379#endif /* READ */
380