1 2/* filter_sse2_intrinsics.c - SSE2 optimized filter functions 3 * 4 * Copyright (c) 2016 Google, Inc. 5 * Written by Mike Klein and Matt Sarett 6 * Derived from arm/filter_neon_intrinsics.c, which was 7 * Copyright (c) 2014,2016 Glenn Randers-Pehrson 8 * 9 * Last changed in libpng 1.6.24 [August 4, 2016] 10 * 11 * This code is released under the libpng license. 12 * For conditions of distribution and use, see the disclaimer 13 * and license in png.h 14 */ 15 16#include "../../pngpriv.h" 17 18#ifdef PNG_READ_SUPPORTED 19 20#if PNG_INTEL_SSE_IMPLEMENTATION > 0 21 22#include <immintrin.h> 23 24/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). 25 * They're positioned like this: 26 * prev: c b 27 * row: a d 28 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be 29 * whichever of a, b, or c is closest to p=a+b-c. 30 */ 31 32static __m128i load4(const void* p) { 33 return _mm_cvtsi32_si128(*(const int*)p); 34} 35 36static void store4(void* p, __m128i v) { 37 *(int*)p = _mm_cvtsi128_si32(v); 38} 39 40static __m128i load3(const void* p) { 41 /* We'll load 2 bytes, then 1 byte, 42 * then mask them together, and finally load into SSE. 43 */ 44 const png_uint_16* p01 = p; 45 const png_byte* p2 = (const png_byte*)(p01+1); 46 47 png_uint_32 v012 = (png_uint_32)(*p01) 48 | (png_uint_32)(*p2) << 16; 49 return load4(&v012); 50} 51 52static void store3(void* p, __m128i v) { 53 /* We'll pull from SSE as a 32-bit int, then write 54 * its bottom two bytes, then its third byte. 55 */ 56 png_uint_32 v012; 57 store4(&v012, v); 58 59 png_uint_16* p01 = p; 60 png_byte* p2 = (png_byte*)(p01+1); 61 *p01 = v012; 62 *p2 = v012 >> 16; 63} 64 65void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, 66 png_const_bytep prev) 67{ 68 /* The Sub filter predicts each pixel as the previous pixel, a. 69 * There is no pixel to the left of the first pixel. It's encoded directly. 70 * That works with our main loop if we just say that left pixel was zero. 71 */ 72 png_debug(1, "in png_read_filter_row_sub3_sse2"); 73 __m128i a, d = _mm_setzero_si128(); 74 75 int rb = row_info->rowbytes; 76 while (rb >= 4) { 77 a = d; d = load4(row); 78 d = _mm_add_epi8(d, a); 79 store3(row, d); 80 81 row += 3; 82 rb -= 3; 83 } 84 if (rb > 0) { 85 a = d; d = load3(row); 86 d = _mm_add_epi8(d, a); 87 store3(row, d); 88 89 row += 3; 90 rb -= 3; 91 } 92} 93 94void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, 95 png_const_bytep prev) 96{ 97 /* The Sub filter predicts each pixel as the previous pixel, a. 98 * There is no pixel to the left of the first pixel. It's encoded directly. 99 * That works with our main loop if we just say that left pixel was zero. 100 */ 101 png_debug(1, "in png_read_filter_row_sub4_sse2"); 102 __m128i a, d = _mm_setzero_si128(); 103 104 int rb = row_info->rowbytes; 105 while (rb > 0) { 106 a = d; d = load4(row); 107 d = _mm_add_epi8(d, a); 108 store4(row, d); 109 110 row += 4; 111 rb -= 4; 112 } 113} 114 115void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, 116 png_const_bytep prev) 117{ 118 /* The Avg filter predicts each pixel as the (truncated) average of a and b. 119 * There's no pixel to the left of the first pixel. Luckily, it's 120 * predicted to be half of the pixel above it. So again, this works 121 * perfectly with our loop if we make sure a starts at zero. 122 */ 123 png_debug(1, "in png_read_filter_row_avg3_sse2"); 124 const __m128i zero = _mm_setzero_si128(); 125 __m128i b; 126 __m128i a, d = zero; 127 128 int rb = row_info->rowbytes; 129 while (rb >= 4) { 130 b = load4(prev); 131 a = d; d = load4(row ); 132 133 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ 134 __m128i avg = _mm_avg_epu8(a,b); 135 /* ...but we can fix it up by subtracting off 1 if it rounded up. */ 136 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), 137 _mm_set1_epi8(1))); 138 d = _mm_add_epi8(d, avg); 139 store3(row, d); 140 141 prev += 3; 142 row += 3; 143 rb -= 3; 144 } 145 if (rb > 0) { 146 b = load3(prev); 147 a = d; d = load3(row ); 148 149 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ 150 __m128i avg = _mm_avg_epu8(a,b); 151 /* ...but we can fix it up by subtracting off 1 if it rounded up. */ 152 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), 153 _mm_set1_epi8(1))); 154 155 d = _mm_add_epi8(d, avg); 156 store3(row, d); 157 158 prev += 3; 159 row += 3; 160 rb -= 3; 161 } 162} 163 164void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, 165 png_const_bytep prev) 166{ 167 /* The Avg filter predicts each pixel as the (truncated) average of a and b. 168 * There's no pixel to the left of the first pixel. Luckily, it's 169 * predicted to be half of the pixel above it. So again, this works 170 * perfectly with our loop if we make sure a starts at zero. 171 */ 172 png_debug(1, "in png_read_filter_row_avg4_sse2"); 173 const __m128i zero = _mm_setzero_si128(); 174 __m128i b; 175 __m128i a, d = zero; 176 177 int rb = row_info->rowbytes; 178 while (rb > 0) { 179 b = load4(prev); 180 a = d; d = load4(row ); 181 182 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ 183 __m128i avg = _mm_avg_epu8(a,b); 184 /* ...but we can fix it up by subtracting off 1 if it rounded up. */ 185 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), 186 _mm_set1_epi8(1))); 187 188 d = _mm_add_epi8(d, avg); 189 store4(row, d); 190 191 prev += 4; 192 row += 4; 193 rb -= 4; 194 } 195} 196 197/* Returns |x| for 16-bit lanes. */ 198static __m128i abs_i16(__m128i x) { 199#if PNG_INTEL_SSE_IMPLEMENTATION >= 2 200 return _mm_abs_epi16(x); 201#else 202 /* Read this all as, return x<0 ? -x : x. 203 * To negate two's complement, you flip all the bits then add 1. 204 */ 205 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); 206 207 /* Flip negative lanes. */ 208 x = _mm_xor_si128(x, is_negative); 209 210 /* +1 to negative lanes, else +0. */ 211 x = _mm_sub_epi16(x, is_negative); 212 return x; 213#endif 214} 215 216/* Bytewise c ? t : e. */ 217static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { 218#if PNG_INTEL_SSE_IMPLEMENTATION >= 3 219 return _mm_blendv_epi8(e,t,c); 220#else 221 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); 222#endif 223} 224 225void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, 226 png_const_bytep prev) 227{ 228 /* Paeth tries to predict pixel d using the pixel to the left of it, a, 229 * and two pixels from the previous row, b and c: 230 * prev: c b 231 * row: a d 232 * The Paeth function predicts d to be whichever of a, b, or c is nearest to 233 * p=a+b-c. 234 * 235 * The first pixel has no left context, and so uses an Up filter, p = b. 236 * This works naturally with our main loop's p = a+b-c if we force a and c 237 * to zero. 238 * Here we zero b and d, which become c and a respectively at the start of 239 * the loop. 240 */ 241 png_debug(1, "in png_read_filter_row_paeth3_sse2"); 242 const __m128i zero = _mm_setzero_si128(); 243 __m128i c, b = zero, 244 a, d = zero; 245 246 int rb = row_info->rowbytes; 247 while (rb >= 4) { 248 /* It's easiest to do this math (particularly, deal with pc) with 16-bit 249 * intermediates. 250 */ 251 c = b; b = _mm_unpacklo_epi8(load4(prev), zero); 252 a = d; d = _mm_unpacklo_epi8(load4(row ), zero); 253 254 /* (p-a) == (a+b-c - a) == (b-c) */ 255 __m128i pa = _mm_sub_epi16(b,c); 256 257 /* (p-b) == (a+b-c - b) == (a-c) */ 258 __m128i pb = _mm_sub_epi16(a,c); 259 260 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ 261 __m128i pc = _mm_add_epi16(pa,pb); 262 263 pa = abs_i16(pa); /* |p-a| */ 264 pb = abs_i16(pb); /* |p-b| */ 265 pc = abs_i16(pc); /* |p-c| */ 266 267 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); 268 269 /* Paeth breaks ties favoring a over b over c. */ 270 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, 271 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, 272 c)); 273 274 /* Note `_epi8`: we need addition to wrap modulo 255. */ 275 d = _mm_add_epi8(d, nearest); 276 store3(row, _mm_packus_epi16(d,d)); 277 278 prev += 3; 279 row += 3; 280 rb -= 3; 281 } 282 if (rb > 0) { 283 /* It's easiest to do this math (particularly, deal with pc) with 16-bit 284 * intermediates. 285 */ 286 c = b; b = _mm_unpacklo_epi8(load3(prev), zero); 287 a = d; d = _mm_unpacklo_epi8(load3(row ), zero); 288 289 /* (p-a) == (a+b-c - a) == (b-c) */ 290 __m128i pa = _mm_sub_epi16(b,c); 291 292 /* (p-b) == (a+b-c - b) == (a-c) */ 293 __m128i pb = _mm_sub_epi16(a,c); 294 295 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ 296 __m128i pc = _mm_add_epi16(pa,pb); 297 298 pa = abs_i16(pa); /* |p-a| */ 299 pb = abs_i16(pb); /* |p-b| */ 300 pc = abs_i16(pc); /* |p-c| */ 301 302 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); 303 304 /* Paeth breaks ties favoring a over b over c. */ 305 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, 306 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, 307 c)); 308 309 /* Note `_epi8`: we need addition to wrap modulo 255. */ 310 d = _mm_add_epi8(d, nearest); 311 store3(row, _mm_packus_epi16(d,d)); 312 313 prev += 3; 314 row += 3; 315 rb -= 3; 316 } 317} 318 319void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, 320 png_const_bytep prev) 321{ 322 /* Paeth tries to predict pixel d using the pixel to the left of it, a, 323 * and two pixels from the previous row, b and c: 324 * prev: c b 325 * row: a d 326 * The Paeth function predicts d to be whichever of a, b, or c is nearest to 327 * p=a+b-c. 328 * 329 * The first pixel has no left context, and so uses an Up filter, p = b. 330 * This works naturally with our main loop's p = a+b-c if we force a and c 331 * to zero. 332 * Here we zero b and d, which become c and a respectively at the start of 333 * the loop. 334 */ 335 png_debug(1, "in png_read_filter_row_paeth4_sse2"); 336 const __m128i zero = _mm_setzero_si128(); 337 __m128i c, b = zero, 338 a, d = zero; 339 340 int rb = row_info->rowbytes; 341 while (rb > 0) { 342 /* It's easiest to do this math (particularly, deal with pc) with 16-bit 343 * intermediates. 344 */ 345 c = b; b = _mm_unpacklo_epi8(load4(prev), zero); 346 a = d; d = _mm_unpacklo_epi8(load4(row ), zero); 347 348 /* (p-a) == (a+b-c - a) == (b-c) */ 349 __m128i pa = _mm_sub_epi16(b,c); 350 351 /* (p-b) == (a+b-c - b) == (a-c) */ 352 __m128i pb = _mm_sub_epi16(a,c); 353 354 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ 355 __m128i pc = _mm_add_epi16(pa,pb); 356 357 pa = abs_i16(pa); /* |p-a| */ 358 pb = abs_i16(pb); /* |p-b| */ 359 pc = abs_i16(pc); /* |p-c| */ 360 361 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); 362 363 /* Paeth breaks ties favoring a over b over c. */ 364 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, 365 if_then_else(_mm_cmpeq_epi16(smallest, pb), b, 366 c)); 367 368 /* Note `_epi8`: we need addition to wrap modulo 255. */ 369 d = _mm_add_epi8(d, nearest); 370 store4(row, _mm_packus_epi16(d,d)); 371 372 prev += 4; 373 row += 4; 374 rb -= 4; 375 } 376} 377 378#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ 379#endif /* READ */ 380