/* filter_neon_intrinsics.c - NEON optimised filter functions
 *
 * Copyright (c) 2013 Glenn Randers-Pehrson
 * Written by James Yu <james.yu at linaro.org>, October 2013.
 * Based on filter_neon.S, written by Mans Rullgard, 2011.
 *
 * Last changed in libpng 1.6.8 [December 19, 2013]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include "../pngpriv.h"

/* This code requires -mfpu=neon on the command line: */
#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <arm_neon.h>

/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment.  arm/arm_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)

/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'.  This is written this way just to hide the GCC strict aliasing
 * warning; note that the code is safe because there never is an alias between
 * the input and output pointers.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
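
/* For example, in a function that declares 'uint32x2_t *temp_pointer;', the
 * low 32 bits of a uint8x8_t result can be stored with
 *
 *    vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
 *
 * which is the pattern used by the 3-bpp filters below.
 */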

#ifdef PNG_READ_SUPPORTED
#if PNG_ARM_NEON_OPT > 0

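/* Up filter: out[x] = in[x] + prev[x] (mod 256).  Sixteen bytes are
 * processed per iteration using 128-bit loads, adds and stores.
 */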
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint8x16_t qrp, qpp;

      qrp = vld1q_u8(rp);
      qpp = vld1q_u8(pp);
      qrp = vaddq_u8(qrp, qpp);
      vst1q_u8(rp, qrp);
   }
}

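/* Sub filter, 3 bytes per pixel: out[x] = in[x] + out[x - 3] (mod 256).
 * Each iteration reconstructs four pixels (12 bytes); the previously
 * reconstructed pixel is carried between iterations in vdest.val[3].
 */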
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}

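/* Sub filter, 4 bytes per pixel: out[x] = in[x] + out[x - 4] (mod 256).
 * Each iteration reconstructs four pixels (16 bytes), chaining the running
 * pixel value through vdest.val[0..3].
 */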
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer;

      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}

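/* Average filter, 3 bytes per pixel:
 * out[x] = in[x] + floor((out[x - 3] + prev[x]) / 2) (mod 256).
 * vhadd_u8 performs the halving add without intermediate overflow.  As in
 * the Sub case, 16 bytes are loaded but only 12 (four pixels) are consumed
 * per iteration.
 */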
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

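/* Average filter, 4 bytes per pixel:
 * out[x] = in[x] + floor((out[x - 4] + prev[x]) / 2) (mod 256).
 * Four pixels (16 bytes) are reconstructed per iteration.
 */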
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

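/* Paeth predictor for eight byte lanes in parallel.  Per byte this computes
 * the predictor defined by the PNG specification (scalar sketch):
 *
 *    p = a + b - c;
 *    pa = abs(p - a);  pb = abs(p - b);  pc = abs(p - c);
 *    if (pa <= pb && pa <= pc) return a;
 *    else if (pb <= pc) return b;
 *    else return c;
 *
 * where a is the byte to the left, b the byte above and c the byte above and
 * to the left.  The intrinsics widen to 16 bits, using pa = |b - c|,
 * pb = |a - c| and pc = |(a + b) - 2c|, so the distances cannot overflow.
 */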
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t d, e;
   uint16x8_t p1, pa, pb, pc;

   p1 = vaddl_u8(a, b); /* a + b */
   pc = vaddl_u8(c, c); /* c * 2 */
   pa = vabdl_u8(b, c); /* pa */
   pb = vabdl_u8(a, c); /* pb */
   pc = vabdq_u16(p1, pc); /* pc */

   p1 = vcleq_u16(pa, pb); /* pa <= pb */
   pa = vcleq_u16(pa, pc); /* pa <= pc */
   pb = vcleq_u16(pb, pc); /* pb <= pc */

   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */

   d = vmovn_u16(pb);
   e = vmovn_u16(p1);

   d = vbsl_u8(d, b, c);
   e = vbsl_u8(e, a, d);

   return e;
}

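/* Paeth filter, 3 bytes per pixel:
 * out[x] = in[x] + paeth(out[x - 3], prev[x], prev[x - 3]) (mod 256).
 * Four pixels are reconstructed per iteration; vlast carries the previous
 * row's pixel above the last reconstructed pixel (the 'c' input) across
 * iterations.
 */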
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vlast = vtmp2;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

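/* Paeth filter, 4 bytes per pixel:
 * out[x] = in[x] + paeth(out[x - 4], prev[x], prev[x - 4]) (mod 256).
 * Four pixels (16 bytes) are reconstructed per iteration; vlast carries the
 * previous-row pixel above the last reconstructed pixel into the next
 * iteration as the 'c' input.
 */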
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vlast = vpp.val[3];

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_READ_SUPPORTED */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */