1b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 2b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari/* filter_neon_intrinsics.c - NEON optimised filter functions 3b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * 4b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * Copyright (c) 2013 Glenn Randers-Pehrson 5b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * Written by James Yu <james.yu at linaro.org>, October 2013. 6b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * Based on filter_neon.S, written by Mans Rullgard, 2011. 7b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * 8b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * Last changed in libpng 1.6.8 [December 19, 2013] 9b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * 10b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * This code is released under the libpng license. 11b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * For conditions of distribution and use, see the disclaimer 12b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * and license in png.h 13b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari */ 14b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 15b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#include "../pngpriv.h" 16b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 17b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari/* This code requires -mfpu=neon on the command line: */ 18b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ 19b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 20b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#include <arm_neon.h> 21b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 22b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari/* libpng row pointers are not necessarily aligned to any particular boundary, 
23b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * however this code will only work with appropriate alignment. arm/arm_init.c 24b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * checks for this (and will not compile unless it is done). This code uses 25b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * variants of png_aligncast to avoid compiler warnings. 26b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari */ 27b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#define png_ptr(type,pointer) png_aligncast(type *,pointer) 28b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) 29b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 30b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari/* The following relies on a variable 'temp_pointer' being declared with type 31b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * 'type'. This is written this way just to hide the GCC strict aliasing 32b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * warning; note that the code is safe because there never is an alias between 33b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari * the input and output pointers. 
34b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari */ 35b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#define png_ldr(type,pointer)\ 36b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari (temp_pointer = png_ptr(type,pointer), *temp_pointer) 37b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 38b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#ifdef PNG_READ_SUPPORTED 39b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#if PNG_ARM_NEON_OPT > 0 40b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 41b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid 42b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 43b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari png_const_bytep prev_row) 44b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{ 45b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari png_bytep rp = row; 46b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari png_bytep rp_stop = row + row_info->rowbytes; 47b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari png_const_bytep pp = prev_row; 48b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 49b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari for (; rp < rp_stop; rp += 16, pp += 16) 50b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari { 51b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari uint8x16_t qrp, qpp; 52b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari 53b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari qrp = vld1q_u8(rp); 54b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari qpp = vld1q_u8(pp); 55b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari qrp = vaddq_u8(qrp, qpp); 56b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari vst1q_u8(rp, qrp); 57b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari } 58b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari} 

/* Undo the PNG "Sub" filter in place for 3-byte pixels: each pixel has the
 * previously reconstructed pixel to its left added to it (modulo 256).
 *
 * row_info  only row_info->rowbytes is read; it bounds the loop.
 * row       filtered bytes on entry, reconstructed bytes on return.
 * prev_row  unused by this filter.
 *
 * Four pixels (12 bytes) are handled per iteration.  A 16-byte vector is
 * loaded and the pixels at byte offsets 3, 6 and 9 are shifted down to lane 0
 * with vext_u8.  Each result is written with a 4-byte store followed by a
 * 3-byte advance, so the surplus fourth byte is overwritten by the next store;
 * the last store of a group spills one byte past the group.  The following
 * group is therefore loaded (rp + 12) *before* the stores are issued.
 * Loads and the final store can touch memory beyond rowbytes.
 * NOTE(review): presumably covered by row-buffer padding - confirm.
 */
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   /* View the 16 loaded bytes as two 8-byte halves. */
   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   /* val[3] carries the last reconstructed pixel into the next group; the
    * pixel to the left of the first pixel in the row is zero.
    */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   while (rp < rp_stop)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer; /* required by png_ldr */

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);       /* pixel 1 */
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);       /* pixel 2 */
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);       /* pixel 3 (byte 9) */
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      /* Fetch the next group before the overlapping stores below clobber
       * its first byte.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}

/* Undo the PNG "Sub" filter in place for 4-byte pixels: each pixel has the
 * previously reconstructed pixel to its left added to it (modulo 256).
 *
 * row_info  only row_info->rowbytes is read; it bounds the loop.
 * row       filtered bytes on entry, reconstructed bytes on return.
 * prev_row  unused by this filter.
 *
 * Four pixels (16 bytes) are handled per iteration.  vld4_u32 de-interleaves
 * 32-bit words so that lane 0 of val[i] holds pixel i of the group; only
 * lane 0 is stored back (vst4_lane_u32, lane 0), the upper lanes are
 * scratch.  Note that vld4_u32 reads 32 bytes, i.e. 16 bytes beyond the
 * group being processed.
 */
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   /* val[3] carries the last reconstructed pixel into the next group; the
    * pixel to the left of the first pixel in the row is zero.
    */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer; /* required by png_ldr */

      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}

/* Undo the PNG "Average" filter in place for 3-byte pixels: each pixel has
 * the truncated mean of the reconstructed pixel to its left and the pixel
 * above it (from 'prev_row') added to it (modulo 256).
 *
 * row_info  only row_info->rowbytes is read; it bounds the loop.
 * row       filtered bytes on entry, reconstructed bytes on return.
 * prev_row  the row above (read only).
 *
 * Four pixels (12 bytes) are handled per iteration.  vhadd_u8 supplies the
 * halving add; vext_u8 shifts the pixels at byte offsets 3, 6 and 9 down to
 * lane 0.  Results are written with overlapping 4-byte stores advanced by 3
 * bytes, so the next group of 'row' is loaded (rp + 12) before the stores.
 * Loads and the final store can touch memory beyond rowbytes.
 * NOTE(review): presumably covered by row-buffer padding - confirm.
 */
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;

   /* val[3] carries the last reconstructed pixel into the next group; the
    * pixel to the left of the first pixel in the row is zero.
    */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer; /* required by png_ldr */

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* Pixel 0: average(left, above) + raw. */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      /* Pixel 1. */
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      /* All raw pixels of this group are now held in temporaries, so the
       * next group can be fetched before the overlapping stores below
       * clobber its first byte.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      /* Pixel 2. */
      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      /* Pixel 3. */
      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

/* Undo the PNG "Average" filter in place for 4-byte pixels: each pixel has
 * the truncated mean of the reconstructed pixel to its left and the pixel
 * above it (from 'prev_row') added to it (modulo 256).
 *
 * row_info  only row_info->rowbytes is read; it bounds the loop.
 * row       filtered bytes on entry, reconstructed bytes on return.
 * prev_row  the row above (read only).
 *
 * Four pixels (16 bytes) are handled per iteration.  vld4_u32 de-interleaves
 * 32-bit words so that lane 0 of val[i] holds pixel i of the group; only
 * lane 0 is stored back.  vld4_u32 reads 32 bytes from each buffer, i.e. 16
 * bytes beyond the group being processed.
 */
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   /* val[3] carries the last reconstructed pixel into the next group; the
    * pixel to the left of the first pixel in the row is zero.
    */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer; /* required by png_ldr */

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      /* For each pixel: average(left, above) then add the raw value. */
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

/* Byte-wise Paeth predictor.
 *
 * a  left neighbour, b  neighbour above, c  upper-left neighbour (this is how
 * the callers in this file pass them).
 *
 * Computes, in 16-bit precision so nothing overflows:
 *    pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2c|
 * and returns, per byte lane, a if (pa <= pb && pa <= pc), else b if
 * (pb <= pc), else c.
 */
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t d, e;
   uint16x8_t p1, pa, pb, pc;

   p1 = vaddl_u8(a, b); /* a + b */
   pc = vaddl_u8(c, c); /* c * 2 */
   pa = vabdl_u8(b, c); /* pa */
   pb = vabdl_u8(a, c); /* pb */
   pc = vabdq_u16(p1, pc); /* pc */

   p1 = vcleq_u16(pa, pb); /* pa <= pb */
   pa = vcleq_u16(pa, pc); /* pa <= pc */
   pb = vcleq_u16(pb, pc); /* pb <= pc */

   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */

   /* Narrow the all-ones/all-zeros 16-bit masks to 8-bit select masks. */
   d = vmovn_u16(pb);
   e = vmovn_u16(p1);

   d = vbsl_u8(d, b, c); /* pb <= pc ? b : c */
   e = vbsl_u8(e, a, d); /* (pa <= pb && pa <= pc) ? a : d */

   return e;
}

/* Undo the PNG "Paeth" filter in place for 3-byte pixels: each pixel has the
 * Paeth predictor of its left, above and upper-left neighbours added to it
 * (modulo 256).
 *
 * row_info  only row_info->rowbytes is read; it bounds the loop.
 * row       filtered bytes on entry, reconstructed bytes on return.
 * prev_row  the row above (read only).
 *
 * Four pixels (12 bytes) are handled per iteration.  vext_u8 shifts the
 * pixels at byte offsets 3, 6 and 9 down to lane 0.  Results are written with
 * overlapping 4-byte stores advanced by 3 bytes, so the next group of 'row'
 * is loaded (rp + 12) before the stores.  Loads and the final store can touch
 * memory beyond rowbytes.
 * NOTE(review): presumably covered by row-buffer padding - confirm.
 */
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   /* vlast: last pixel of the previous group in prev_row, i.e. the
    * upper-left neighbour of the first pixel in the current group.
    */
   uint8x8_t vlast = vdup_n_u8(0);
   /* val[3] carries the last reconstructed pixel into the next group. */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer; /* required by png_ldr */

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* Pixel 0. */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      /* Pixel 1. */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      /* Pixel 2. */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      /* All raw pixels of this group are now held in temporaries, so the
       * next group can be fetched before the overlapping stores below
       * clobber its first byte.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      /* Pixel 3. */
      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vlast = vtmp2;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

/* Undo the PNG "Paeth" filter in place for 4-byte pixels: each pixel has the
 * Paeth predictor of its left, above and upper-left neighbours added to it
 * (modulo 256).
 *
 * row_info  only row_info->rowbytes is read; it bounds the loop.
 * row       filtered bytes on entry, reconstructed bytes on return.
 * prev_row  the row above (read only).
 *
 * Four pixels (16 bytes) are handled per iteration.  vld4_u32 de-interleaves
 * 32-bit words so that lane 0 of val[i] holds pixel i of the group; only
 * lane 0 is stored back.  vld4_u32 reads 32 bytes from each buffer, i.e. 16
 * bytes beyond the group being processed.
 */
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   /* vlast: last pixel of the previous group in prev_row, i.e. the
    * upper-left neighbour of the first pixel in the current group.
    */
   uint8x8_t vlast = vdup_n_u8(0);
   /* val[3] carries the last reconstructed pixel into the next group. */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer; /* required by png_ldr */

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      /* For each pixel: paeth(left, above, upper-left) then add the raw
       * value.
       */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vlast = vpp.val[3];

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_READ_SUPPORTED */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */