1 2/* filter_neon_intrinsics.c - NEON optimised filter functions 3 * 4 * Copyright (c) 2014,2016 Glenn Randers-Pehrson 5 * Written by James Yu <james.yu at linaro.org>, October 2013. 6 * Based on filter_neon.S, written by Mans Rullgard, 2011. 7 * 8 * Last changed in libpng 1.6.22 [May 26, 2016] 9 * 10 * This code is released under the libpng license. 11 * For conditions of distribution and use, see the disclaimer 12 * and license in png.h 13 */ 14 15#include "../pngpriv.h" 16 17#ifdef PNG_READ_SUPPORTED 18 19/* This code requires -mfpu=neon on the command line: */ 20#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ 21 22#include <arm_neon.h> 23 24/* libpng row pointers are not necessarily aligned to any particular boundary, 25 * however this code will only work with appropriate alignment. arm/arm_init.c 26 * checks for this (and will not compile unless it is done). This code uses 27 * variants of png_aligncast to avoid compiler warnings. 28 */ 29#define png_ptr(type,pointer) png_aligncast(type *,pointer) 30#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) 31 32/* The following relies on a variable 'temp_pointer' being declared with type 33 * 'type'. This is written this way just to hide the GCC strict aliasing 34 * warning; note that the code is safe because there never is an alias between 35 * the input and output pointers. 
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)

#if PNG_ARM_NEON_OPT > 0

/* Undo the PNG 'Up' filter: byte-wise add the previous row to the current
 * row, 16 bytes per iteration.
 *
 * NOTE(review): this loop (and all the filter loops below) advances in fixed
 * 16- or 12-byte steps with no scalar tail, so the final iteration may read
 * and/or write a few bytes beyond row_info->rowbytes.  libpng presumably
 * over-allocates its row buffers to permit this -- confirm against the row
 * buffer allocation in pngrutil.c before reusing this code elsewhere.
 */
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;                              /* current position */
   png_bytep rp_stop = row + row_info->rowbytes;    /* end of row */
   png_const_bytep pp = prev_row;

   png_debug(1, "in png_read_filter_row_up_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint8x16_t qrp, qpp;

      qrp = vld1q_u8(rp);
      qpp = vld1q_u8(pp);
      qrp = vaddq_u8(qrp, qpp);   /* row[i] += prev_row[i], mod 256 */
      vst1q_u8(rp, qrp);
   }
}

/* Undo the 'Sub' filter for 3-byte pixels (e.g. 8-bit RGB): each pixel has
 * the preceding reconstructed pixel added to it.  Four pixels (12 bytes) are
 * produced per iteration.
 *
 * vdest.val[3] carries the most recently reconstructed pixel across
 * iterations (zero before the first pixel, per the PNG spec).  vext_u8 is
 * used to pick out the raw pixels at byte offsets 3, 6 and 9 of the 16-byte
 * load.  Each store writes 4 bytes (one u32 lane) but rp advances only 3, so
 * the next store's low bytes overwrite the one byte of overshoot.
 */
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   /* Reinterpret the 16-byte vector as two 8-byte halves; png_ptr hides the
    * aligncast (see comment at the top of the file).
    */
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);   /* "previous pixel" is 0 at row start */

   png_debug(1, "in png_read_filter_row_sub3_neon");

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;   /* required by png_ldr */

      /* Serial prefix-sum over the four pixels of this group: */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);         /* raw pixel 1 */
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);   /* pixel 0 */
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);         /* raw pixel 2 */
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);        /* pixel 1 */

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);         /* raw pixel 3 */
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);        /* pixel 2 */
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);        /* pixel 3 */

      /* Load the next group before the stores below clobber rp[0..11]. */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}

/* Undo the 'Sub' filter for 4-byte pixels (e.g. 8-bit RGBA).  Four pixels
 * (16 bytes) are produced per iteration.
 *
 * vld4_u32 de-interleaves 8 words into 4 two-lane vectors; only lane 0 of
 * each (words 0..3, i.e. the first 16 bytes / 4 pixels) is meaningful --
 * lane 1 is computed and discarded, and vst4_lane_u32(..., 0) stores lane 0
 * back as 4 consecutive words.  Note the load therefore reads 32 bytes while
 * rp advances 16 (see the over-read note on the 'up' filter above).
 */
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);   /* "previous pixel" is 0 at row start */

   png_debug(1, "in png_read_filter_row_sub4_neon");

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer;   /* required by png_ldr */

      /* Serial prefix-sum: pixel k = pixel k-1 + raw pixel k. */
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}

/* Undo the 'Average' filter for 3-byte pixels: each pixel has
 * floor((left + above) / 2) added to it.  vhadd_u8 computes the unsigned
 * halving add (truncating, no overflow), matching the spec's average.
 * Four pixels (12 bytes) per iteration; layout tricks are as in sub3.
 */
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);   /* "left pixel" is 0 at row start */

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_avg3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer;   /* required by png_ldr */

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* pixel k = raw k + (left k-1 + above k) / 2, computed serially: */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);        /* raw pixel 1 */
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);  /* pixel 0 */

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);        /* above pixel 1 */
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);        /* raw pixel 2 */
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);       /* pixel 1 */

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);        /* above pixel 2 */
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);        /* raw pixel 3 */

      /* Load the next group before the stores below clobber rp[0..11]. */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);       /* pixel 2 */

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);        /* above pixel 3 */

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);       /* pixel 3 */

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

/* Undo the 'Average' filter for 4-byte pixels.  Four pixels (16 bytes) per
 * iteration; the lane-0-only vld4/vst4 layout is as in sub4.
 */
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);   /* "left pixel" is 0 at row start */

   png_debug(1, "in png_read_filter_row_avg4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;   /* required by png_ldr */

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      /* pixel k = raw k + (left k-1 + above k) / 2, computed serially: */
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

/* Byte-wise Paeth predictor (PNG spec 9.4): given left (a), above (b) and
 * upper-left (c), return whichever of a/b/c is closest to a + b - c, with
 * ties broken in the order a, b, c.  The arithmetic is widened to 16 bits
 * (vaddl/vabdl) so the intermediate a + b - 2c cannot wrap, then the result
 * is selected per byte with vbsl.
 */
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t d, e;
   uint16x8_t p1, pa, pb, pc;

   p1 = vaddl_u8(a, b); /* a + b */
   pc = vaddl_u8(c, c); /* c * 2 */
   pa = vabdl_u8(b, c); /* pa = |p - a| = |b - c| */
   pb = vabdl_u8(a, c); /* pb = |p - b| = |a - c| */
   pc = vabdq_u16(p1, pc); /* pc = |p - c| = |a + b - 2c| */

   p1 = vcleq_u16(pa, pb); /* pa <= pb */
   pa = vcleq_u16(pa, pc); /* pa <= pc */
   pb = vcleq_u16(pb, pc); /* pb <= pc */

   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */

   d = vmovn_u16(pb);   /* narrow the masks back to 8 bits per byte */
   e = vmovn_u16(p1);

   d = vbsl_u8(d, b, c);   /* d = pb <= pc ? b : c */
   e = vbsl_u8(e, a, d);   /* e = (pa <= pb && pa <= pc) ? a : d */

   return e;
}

/* Undo the 'Paeth' filter for 3-byte pixels: each pixel has
 * paeth(left, above, upper-left) added to it.  Four pixels (12 bytes) per
 * iteration; layout tricks are as in sub3/avg3.  vlast carries the last
 * prev-row pixel of the group so it can serve as the upper-left (c) for the
 * first pixel of the next iteration (zero at row start, per the spec).
 */
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);   /* upper-left (c), 0 at row start */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);      /* left (a), 0 at row start */

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_paeth3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer;   /* required by png_ldr */

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* pixel k = raw k + paeth(left, above k, upper-left), serially: */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);      /* pixel 0 */

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);   /* raw pixel 1 */
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);   /* above pixel 1 */
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);           /* pixel 1 */

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);   /* raw pixel 2 */
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);   /* above pixel 2 */
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);           /* pixel 2 */

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);   /* raw pixel 3 */
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);   /* above pixel 3 */

      /* Load the next group before the stores below clobber rp[0..11]. */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);           /* pixel 3 */

      vlast = vtmp2;   /* above pixel 3 becomes next group's upper-left */

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

/* Undo the 'Paeth' filter for 4-byte pixels.  Four pixels (16 bytes) per
 * iteration; the lane-0-only vld4/vst4 layout is as in sub4/avg4, and vlast
 * carries the upper-left pixel across iterations as in paeth3.
 */
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);   /* upper-left (c), 0 at row start */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);      /* left (a), 0 at row start */

   png_debug(1, "in png_read_filter_row_paeth4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;   /* required by png_ldr */

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      /* pixel k = raw k + paeth(left, above k, upper-left), serially: */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vlast = vpp.val[3];   /* becomes next group's upper-left */

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

#endif /* PNG_ARM_NEON_OPT > 0
*/ 386#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */ 387#endif /* READ */ 388