/* filter_neon_intrinsics.c - NEON optimised filter functions
 *
 * Copyright (c) 2013 Glenn Randers-Pehrson
 * Written by James Yu <james.yu at linaro.org>, October 2013.
 * Based on filter_neon.S, written by Mans Rullgard, 2011.
 *
 * Last changed in libpng 1.6.8 [December 19, 2013]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include "../pngpriv.h"

/* This code requires -mfpu=neon on the command line: */
#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <arm_neon.h>

/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment. arm/arm_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)

/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'. This is written this way just to hide the GCC strict aliasing
 * warning; note that the code is safe because there never is an alias between
 * the input and output pointers.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)

#ifdef PNG_READ_SUPPORTED
#if PNG_ARM_NEON_OPT > 0

/* Undo the PNG 'Up' filter: each output byte is row[i] + prev_row[i]
 * (mod 256).  Processes 16 bytes per iteration; when rowbytes is not a
 * multiple of 16 the final iteration reads and writes up to 15 bytes
 * beyond rowbytes.  NOTE(review): this assumes libpng's row buffers have
 * sufficient slack past the end of the row — presumably guaranteed by the
 * row allocation the arm_init.c checks rely on; confirm against pngrutil.c.
 */
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint8x16_t qrp, qpp;

      qrp = vld1q_u8(rp);
      qpp = vld1q_u8(pp);
      qrp = vaddq_u8(qrp, qpp);  /* byte-wise wrap-around add */
      vst1q_u8(rp, qrp);
   }
}

/* Undo the PNG 'Sub' filter for 3-byte (e.g. RGB8) pixels: each pixel is
 * the filtered pixel plus the reconstructed pixel to its left.  Four
 * pixels (12 bytes) are handled per iteration.  The previous iteration's
 * last reconstructed pixel is carried in vdest.val[3] (zero for the first
 * pixel of the row, matching the defined filter behaviour at x = 0).
 *
 * The vext calls slice the 16 loaded bytes into the four 3-byte pixels:
 * offsets 0, 3, 6 and 9 (the last via a 1-byte rotate of the high half,
 * since 9 = 8 + 1).  Stores write 4 bytes at 3-byte steps, so each store
 * overwrites one byte of the next pixel's slot which the following store
 * then rewrites; the final store may touch up to 3 bytes past the last
 * pixel of the group.  NOTE(review): as with the Up filter, this relies on
 * row-buffer slack past rowbytes — confirm.
 */
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* "pixel to the left" of the row start */

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;  /* required by the png_ldr macro */

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);           /* pixel 1 */
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);     /* pixel 0 + carry */
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);           /* pixel 2 */
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);           /* pixel 3 (byte 9) */
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      /* Prefetch the next 12-byte group before storing (the loads below
       * may read past rowbytes on the last iteration; see note above).
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}

/* Undo the PNG 'Sub' filter for 4-byte (e.g. RGBA8) pixels.  vld4_u32
 * de-interleaves four 32-bit pixels into vtmp.val[0..3] (one pixel per
 * lane-0 element), which are then accumulated serially; vdest.val[3]
 * carries the last reconstructed pixel into the next iteration.
 */
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* left neighbour of the first pixel is 0 */

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer;  /* required by the png_ldr macro */

      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}

/* Undo the PNG 'Average' filter for 3-byte pixels: each pixel is the
 * filtered byte plus floor((left + above) / 2).  vhadd_u8 computes that
 * halving add without overflow; vadd_u8 then adds the filtered value with
 * the usual mod-256 wrap.  Pixel slicing via vext is identical to the
 * sub3 case; vpp holds the four "above" pixels from prev_row.
 */
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* left neighbour of the first pixel is 0 */

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer;  /* required by the png_ldr macro */

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);  /* (left+above)/2 */
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      /* Load the next group before the stores below clobber vtmp. */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);  /* above-pixel 3 (byte 9) */

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

/* Undo the PNG 'Average' filter for 4-byte pixels; same scheme as
 * avg3_neon but with vld4 de-interleaving, as in sub4_neon.
 */
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* left neighbour of the first pixel is 0 */

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;  /* required by the png_ldr macro */

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

/* Branch-free Paeth predictor over 8 byte lanes.
 * a = left, b = above, c = upper-left.  With p = a + b - c:
 *   pa = |p - a| = |b - c|, pb = |p - b| = |a - c|, pc = |p - c| = |a+b-2c|
 * (computed in 16 bits so the intermediate sums cannot overflow).
 * Returns a where pa <= pb && pa <= pc, else b where pb <= pc, else c —
 * selected per lane with vbsl on the all-ones/all-zeros comparison masks,
 * matching the tie-breaking order of the scalar Paeth in the PNG spec.
 */
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t d, e;
   uint16x8_t p1, pa, pb, pc;

   p1 = vaddl_u8(a, b); /* a + b */
   pc = vaddl_u8(c, c); /* c * 2 */
   pa = vabdl_u8(b, c); /* pa */
   pb = vabdl_u8(a, c); /* pb */
   pc = vabdq_u16(p1, pc); /* pc */

   p1 = vcleq_u16(pa, pb); /* pa <= pb */
   pa = vcleq_u16(pa, pc); /* pa <= pc */
   pb = vcleq_u16(pb, pc); /* pb <= pc */

   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */

   d = vmovn_u16(pb);  /* narrow masks back to 8-bit lanes */
   e = vmovn_u16(p1);

   d = vbsl_u8(d, b, c);  /* choose b over c where pb <= pc */
   e = vbsl_u8(e, a, d);  /* choose a where pa is the minimum */

   return e;
}

/* Undo the PNG 'Paeth' filter for 3-byte pixels: each pixel is the
 * filtered byte plus paeth(left, above, upper-left).  vlast carries the
 * upper-left pixel across iterations (zero at the row start, as the spec
 * defines for x = 0); within an iteration the "above" pixel of one step
 * becomes the upper-left of the next.  Pixel slicing via vext matches
 * sub3/avg3.
 */
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);  /* upper-left of the first pixel is 0 */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);     /* left of the first pixel is 0 */

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer;  /* required by the png_ldr macro */

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      /* Load the next group before the stores below clobber vtmp. */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vlast = vtmp2;  /* last "above" pixel -> next iteration's upper-left */

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

/* Undo the PNG 'Paeth' filter for 4-byte pixels; same scheme as
 * paeth3_neon but with vld4 de-interleaving, as in sub4/avg4.
 */
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);  /* upper-left of the first pixel is 0 */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);     /* left of the first pixel is 0 */

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;  /* required by the png_ldr macro */

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vlast = vpp.val[3];  /* last "above" pixel -> next iteration's upper-left */

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_READ_SUPPORTED */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */