1b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
/* filter_neon_intrinsics.c - NEON optimised filter functions
 *
 * Copyright (c) 2013 Glenn Randers-Pehrson
 * Written by James Yu <james.yu at linaro.org>, October 2013.
 * Based on filter_neon.S, written by Mans Rullgard, 2011.
 *
 * Last changed in libpng 1.6.8 [December 19, 2013]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
14b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
15b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#include "../pngpriv.h"
16b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
17b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari/* This code requires -mfpu=neon on the command line: */
18b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
19b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
20b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#include <arm_neon.h>
21b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
/* Row pointers handed to this code by libpng are not necessarily aligned to
 * any particular boundary, yet the code below only works with appropriate
 * alignment.  arm/arm_init.c checks for this (and will not compile unless it
 * is done).  The helpers below are variants of png_aligncast, used so that
 * the pointer conversions do not provoke compiler warnings:
 *
 *    png_ptr(T, p)   converts p to 'T *'
 *    png_ptrc(T, p)  converts p to 'const T *'
 */
#define png_ptr(elem_type, address) png_aligncast(elem_type *, address)
#define png_ptrc(elem_type, address) \
   png_aligncastconst(const elem_type *, address)
29b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
/* png_ldr(T, p) reads the object at p as if it had type T.  It expands to a
 * comma expression that first assigns the converted pointer to a variable
 * named 'temp_pointer' and then dereferences it, so every scope that uses
 * the macro must declare 'temp_pointer' with type 'T *'.  It is written this
 * way just to hide the GCC strict aliasing warning; note that the code is
 * safe because there never is an alias between the input and output
 * pointers.
 */
#define png_ldr(elem_type, address)\
   (temp_pointer = png_ptr(elem_type, address), *temp_pointer)
37b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
38b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#ifdef PNG_READ_SUPPORTED
39b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#if PNG_ARM_NEON_OPT > 0
40b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
41b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
42b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
43b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
44b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
45b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
46b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
47b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep pp = prev_row;
48b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
49b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop; rp += 16, pp += 16)
50b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
51b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x16_t qrp, qpp;
52b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
53b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      qrp = vld1q_u8(rp);
54b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      qpp = vld1q_u8(pp);
55b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      qrp = vaddq_u8(qrp, qpp);
56b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1q_u8(rp, qrp);
57b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
58b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
59b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
60b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
61b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
62b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
63b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
64b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
65b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
66b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
67b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x16_t vtmp = vld1q_u8(rp);
68b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
69b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x2_t vrp = *vrpt;
70b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
71b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x4_t vdest;
72b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vdest.val[3] = vdup_n_u8(0);
73b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
74b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop;)
75b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
76b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8_t vtmp1, vtmp2;
77b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2_t *temp_pointer;
78b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
79b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
80b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
81b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
82b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
83b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
84b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
85b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
86b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
87b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
88b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld1q_u8(rp + 12);
89b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrpt = png_ptr(uint8x8x2_t, &vtmp);
90b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrp = *vrpt;
91b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
92b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
93b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
94b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
95b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
96b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
97b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
98b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
99b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
100b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
101b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
102b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   PNG_UNUSED(prev_row)
103b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
104b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
105b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
106b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
107b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
108b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
109b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
110b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
111b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
112b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x4_t vdest;
113b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vdest.val[3] = vdup_n_u8(0);
114b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
115b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop; rp += 16)
116b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
117b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
118b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
119b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x4_t vrp = *vrpt;
120b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2x4_t *temp_pointer;
121b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
122b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
123b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
124b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
125b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
126b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
127b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
128b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
129b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   PNG_UNUSED(prev_row)
130b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
131b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
132b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
133b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
134b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
135b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
136b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
137b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep pp = prev_row;
138b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
139b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
140b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x16_t vtmp;
141b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x2_t *vrpt;
142b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x2_t vrp;
143b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x4_t vdest;
144b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vdest.val[3] = vdup_n_u8(0);
145b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
146b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vtmp = vld1q_u8(rp);
147b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vrpt = png_ptr(uint8x8x2_t,&vtmp);
148b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vrp = *vrpt;
149b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
150b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop; pp += 12)
151b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
152b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8_t vtmp1, vtmp2, vtmp3;
153b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
154b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x2_t *vppt;
155b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x2_t vpp;
156b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
157b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2_t *temp_pointer;
158b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
159b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld1q_u8(pp);
160b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vppt = png_ptr(uint8x8x2_t,&vtmp);
161b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vpp = *vppt;
162b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
163b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
164b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
165b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
166b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
167b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
168b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
169b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
170b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
171b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
172b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
173b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
174b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
175b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld1q_u8(rp + 12);
176b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrpt = png_ptr(uint8x8x2_t,&vtmp);
177b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrp = *vrpt;
178b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
179b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
180b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
181b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
182b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
183b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
184b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
185b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
186b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
187b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
188b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
189b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
190b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
191b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
192b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
193b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
194b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
195b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
196b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
197b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
198b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
199b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
200b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
201b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
202b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
203b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
204b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep pp = prev_row;
205b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
206b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x4_t vdest;
207b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vdest.val[3] = vdup_n_u8(0);
208b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
209b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop; rp += 16, pp += 16)
210b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
211b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2x4_t vtmp;
212b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x4_t *vrpt, *vppt;
213b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x4_t vrp, vpp;
214b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2x4_t *temp_pointer;
215b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
216b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld4_u32(png_ptr(uint32_t,rp));
217b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrpt = png_ptr(uint8x8x4_t,&vtmp);
218b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrp = *vrpt;
219b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
220b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vppt = png_ptr(uint8x8x4_t,&vtmp);
221b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vpp = *vppt;
222b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
223b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
224b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
225b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
226b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
227b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
228b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
229b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
230b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
231b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
232b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
233b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
234b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
235b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
236b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraristatic uint8x8_t
237b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripaeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
238b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
239b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8_t d, e;
240b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint16x8_t p1, pa, pb, pc;
241b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
242b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   p1 = vaddl_u8(a, b); /* a + b */
243b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   pc = vaddl_u8(c, c); /* c * 2 */
244b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   pa = vabdl_u8(b, c); /* pa */
245b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   pb = vabdl_u8(a, c); /* pb */
246b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   pc = vabdq_u16(p1, pc); /* pc */
247b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
248b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   p1 = vcleq_u16(pa, pb); /* pa <= pb */
249b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   pa = vcleq_u16(pa, pc); /* pa <= pc */
250b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   pb = vcleq_u16(pb, pc); /* pb <= pc */
251b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
252b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
253b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
254b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   d = vmovn_u16(pb);
255b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   e = vmovn_u16(p1);
256b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
257b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   d = vbsl_u8(d, b, c);
258b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   e = vbsl_u8(e, a, d);
259b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
260b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   return e;
261b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
262b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
263b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
264b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
265b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
266b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
267b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
268b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep pp = prev_row;
269b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
270b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
271b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x16_t vtmp;
272b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x2_t *vrpt;
273b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x2_t vrp;
274b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8_t vlast = vdup_n_u8(0);
275b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x4_t vdest;
276b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vdest.val[3] = vdup_n_u8(0);
277b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
278b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vtmp = vld1q_u8(rp);
279b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vrpt = png_ptr(uint8x8x2_t,&vtmp);
280b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vrp = *vrpt;
281b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
282b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop; pp += 12)
283b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
284b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x2_t *vppt;
285b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x2_t vpp;
286b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8_t vtmp1, vtmp2, vtmp3;
287b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2_t *temp_pointer;
288b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
289b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld1q_u8(pp);
290b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vppt = png_ptr(uint8x8x2_t,&vtmp);
291b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vpp = *vppt;
292b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
293b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
294b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
295b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
296b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
297b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
298b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
299b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
300b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
301b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
302b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
303b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
304b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
305b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
306b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
307b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
308b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
309b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld1q_u8(rp + 12);
310b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrpt = png_ptr(uint8x8x2_t,&vtmp);
311b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrp = *vrpt;
312b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
313b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
314b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
315b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
316b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vlast = vtmp2;
317b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
318b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
319b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
320b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
321b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
322b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
323b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
324b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
325b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      rp += 3;
326b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
327b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
328b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
329b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurarivoid
330b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripuraripng_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
331b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep prev_row)
332b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari{
333b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp = row;
334b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_bytep rp_stop = row + row_info->rowbytes;
335b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   png_const_bytep pp = prev_row;
336b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
337b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8_t vlast = vdup_n_u8(0);
338b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   uint8x8x4_t vdest;
339b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   vdest.val[3] = vdup_n_u8(0);
340b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
341b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   for (; rp < rp_stop; rp += 16, pp += 16)
342b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   {
343b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2x4_t vtmp;
344b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x4_t *vrpt, *vppt;
345b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint8x8x4_t vrp, vpp;
346b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      uint32x2x4_t *temp_pointer;
347b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
348b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld4_u32(png_ptr(uint32_t,rp));
349b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrpt = png_ptr(uint8x8x4_t,&vtmp);
350b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vrp = *vrpt;
351b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
352b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vppt = png_ptr(uint8x8x4_t,&vtmp);
353b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vpp = *vppt;
354b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
355b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
356b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
357b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
358b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
359b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
360b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
361b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
362b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
363b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
364b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vlast = vpp.val[3];
365b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
366b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
367b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari   }
368b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari}
369b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari
370b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#endif /* PNG_ARM_NEON_OPT > 0 */
371b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#endif /* PNG_READ_SUPPORTED */
372b478e66e7c2621eef5f465e4629ce642db00716bSireesh Tripurari#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
373