1
2/* filter_neon_intrinsics.c - NEON optimised filter functions
3 *
4 * Copyright (c) 2014,2016 Glenn Randers-Pehrson
5 * Written by James Yu <james.yu at linaro.org>, October 2013.
6 * Based on filter_neon.S, written by Mans Rullgard, 2011.
7 *
8 * Last changed in libpng 1.6.22 [May 26, 2016]
9 *
10 * This code is released under the libpng license.
11 * For conditions of distribution and use, see the disclaimer
12 * and license in png.h
13 */
14
15#include "../pngpriv.h"
16
17#ifdef PNG_READ_SUPPORTED
18
19/* This code requires -mfpu=neon on the command line: */
20#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
21
22#include <arm_neon.h>
23
/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment.  arm/arm_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 *
 * png_ptr(type,pointer)  - passes 'pointer' through png_aligncast with the
 *                          target type 'type *'.
 * png_ptrc(type,pointer) - the same for read-only data: uses
 *                          png_aligncastconst with the target type
 *                          'const type *'.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
31
/* png_ldr(type,pointer) yields the object of type 'type' found at 'pointer'.
 *
 * The macro relies on a variable named 'temp_pointer' being declared, in the
 * scope where the macro is used, with type 'type *' (a pointer to 'type', not
 * 'type' itself): 'pointer' is cast with png_ptr, assigned to temp_pointer and
 * the result is then dereferenced.
 *
 * This is written this way just to hide the GCC strict aliasing warning; the
 * original author's note is that the code is safe because there never is an
 * alias between the input and output pointers.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
39
40#if PNG_ARM_NEON_OPT > 0
41
/* Reverse the PNG "Up" filter in place.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: the row above; each of its bytes is added (wrapping modulo 256)
 *           to the byte at the same offset in 'row'.
 *
 * Sixteen bytes are loaded, added and stored per iteration.
 * NOTE(review): when rowbytes is not a multiple of 16 the last iteration reads
 * both rows and writes 'row' beyond rowbytes; this presumably relies on the
 * caller's buffers being padded - confirm.
 */
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep cur = row;
   png_const_bytep above = prev_row;
   png_bytep row_end = row + row_info->rowbytes;

   png_debug(1, "in png_read_filter_row_up_neon");

   while (cur < row_end)
   {
      vst1q_u8(cur, vaddq_u8(vld1q_u8(cur), vld1q_u8(above)));

      cur += 16;
      above += 16;
   }
}
62
/* Reverse the PNG "Sub" filter in place on a row of 3-byte pixels.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: not used.
 *
 * Every reconstructed pixel is the filtered pixel plus the reconstructed pixel
 * to its left (byte-wise, wrapping modulo 256); the pixel to the left of the
 * first one is taken as zero.
 *
 * Four pixels (12 bytes) are handled per iteration.  The filtered bytes come
 * from a 16-byte load and every pixel is written with a 4-byte lane store at a
 * 3-byte stride, the surplus byte being overwritten by the following store.
 *
 * NOTE(review): the last iteration therefore loads 16 bytes beyond the group
 * it processes and writes one byte beyond it, and a rowbytes value that is not
 * a multiple of 12 is rounded up to a whole group.  This presumably relies on
 * the caller's row buffer being padded - confirm against the allocation code.
 *
 * The 128-bit load is split with vget_low_u8/vget_high_u8 and the stores use
 * vreinterpret_u32_u8, instead of reading the vectors through pointers to an
 * unrelated type, so that no strict-aliasing rule is broken.
 */
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;

   vrp.val[0] = vget_low_u8(vtmp);
   vrp.val[1] = vget_high_u8(vtmp);

   /* vdest.val[3] carries the pixel to the left of the next group. */
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_sub3_neon");

   while (rp < rp_stop)
   {
      uint8x8_t vtmp1, vtmp2;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);  /* from byte 3: pixel 1 */
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);  /* from byte 6: pixel 2 */
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);  /* from byte 9: pixel 3 */
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      /* Fetch the next group before storing: the last store below spills one
       * byte onto rp + 12, which has to be read while it is still unmodified.
       */
      vtmp = vld1q_u8(rp + 12);
      vrp.val[0] = vget_low_u8(vtmp);
      vrp.val[1] = vget_high_u8(vtmp);

      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}
109
/* Reverse the PNG "Sub" filter in place on a row of 4-byte pixels.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: not used.
 *
 * Every reconstructed pixel is the filtered pixel plus the reconstructed pixel
 * to its left (byte-wise, wrapping modulo 256); the pixel to the left of the
 * first one is taken as zero.
 *
 * Four pixels (16 bytes) are handled per iteration.  vld4_u32 de-interleaves
 * 32-bit words so that lane 0 of val[k] holds pixel k of the group; only lane
 * 0 of each vector is stored back.
 *
 * NOTE(review): vld4_u32 fills two lanes per vector, i.e. it reads 32 bytes
 * although only 16 are used, and a rowbytes value that is not a multiple of
 * 16 is rounded up to a whole group.  This presumably relies on the caller's
 * row buffer being padded - confirm against the allocation code.
 *
 * Conversions between the 32-bit and 8-bit views of the vectors use
 * vreinterpret_u8_u32/vreinterpret_u32_u8, instead of reading the vectors
 * through pointers to an unrelated type, so that no strict-aliasing rule is
 * broken.
 */
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;

   /* vdest.val[3] carries the pixel to the left of the next group. */
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_sub4_neon");

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t vrp;

      vrp.val[0] = vreinterpret_u8_u32(vtmp.val[0]);
      vrp.val[1] = vreinterpret_u8_u32(vtmp.val[1]);
      vrp.val[2] = vreinterpret_u8_u32(vtmp.val[2]);
      vrp.val[3] = vreinterpret_u8_u32(vtmp.val[3]);

      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);

      vtmp.val[0] = vreinterpret_u32_u8(vdest.val[0]);
      vtmp.val[1] = vreinterpret_u32_u8(vdest.val[1]);
      vtmp.val[2] = vreinterpret_u32_u8(vdest.val[2]);
      vtmp.val[3] = vreinterpret_u32_u8(vdest.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), vtmp, 0);
   }

   PNG_UNUSED(prev_row)
}
138
/* Reverse the PNG "Average" filter in place on a row of 3-byte pixels.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: the row above.
 *
 * Every reconstructed pixel is the filtered pixel plus the halved sum
 * (vhadd_u8) of the reconstructed pixel to its left and the pixel above it,
 * byte-wise and wrapping modulo 256; the pixel to the left of the first one
 * is taken as zero.
 *
 * Four pixels (12 bytes) are handled per iteration.  Both rows are read with
 * 16-byte loads and every pixel is written with a 4-byte lane store at a
 * 3-byte stride, the surplus byte being overwritten by the following store.
 *
 * NOTE(review): the last iteration therefore loads beyond the group it
 * processes in both rows and writes one byte beyond it in 'row', and a
 * rowbytes value that is not a multiple of 12 is rounded up to a whole group.
 * This presumably relies on the caller's buffers being padded - confirm.
 *
 * The 128-bit loads are split with vget_low_u8/vget_high_u8 and the stores use
 * vreinterpret_u32_u8, instead of reading the vectors through pointers to an
 * unrelated type, so that no strict-aliasing rule is broken.
 */
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;

   /* vdest.val[3] carries the pixel to the left of the next group. */
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrp.val[0] = vget_low_u8(vtmp);
   vrp.val[1] = vget_high_u8(vtmp);

   png_debug(1, "in png_read_filter_row_avg3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint8x8x2_t vpp;

      vtmp = vld1q_u8(pp);
      vpp.val[0] = vget_low_u8(vtmp);
      vpp.val[1] = vget_high_u8(vtmp);

      /* Pixel 0 */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      /* Pixel 1 (starts at byte 3 of each row) */
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      /* Fetch the next group before storing: the last store below spills one
       * byte onto rp + 12, which has to be read while it is still unmodified.
       */
      vtmp = vld1q_u8(rp + 12);
      vrp.val[0] = vget_low_u8(vtmp);
      vrp.val[1] = vget_high_u8(vtmp);

      /* Pixel 2 (starts at byte 6 of each row) */
      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      /* Pixel 3 (starts at byte 9 of each row) */
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[3]), 0);
      rp += 3;
   }
}
206
/* Reverse the PNG "Average" filter in place on a row of 4-byte pixels.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: the row above.
 *
 * Every reconstructed pixel is the filtered pixel plus the halved sum
 * (vhadd_u8) of the reconstructed pixel to its left and the pixel above it,
 * byte-wise and wrapping modulo 256; the pixel to the left of the first one
 * is taken as zero.
 *
 * Four pixels (16 bytes) are handled per iteration.  vld4_u32 de-interleaves
 * 32-bit words so that lane 0 of val[k] holds pixel k of the group; only lane
 * 0 of each vector is stored back.
 *
 * NOTE(review): vld4_u32 fills two lanes per vector, i.e. it reads 32 bytes
 * from each row although only 16 are used, and a rowbytes value that is not a
 * multiple of 16 is rounded up to a whole group.  This presumably relies on
 * the caller's buffers being padded - confirm.
 *
 * Conversions between the 32-bit and 8-bit views of the vectors use
 * vreinterpret_u8_u32/vreinterpret_u32_u8, instead of reading the vectors
 * through pointers to an unrelated type, so that no strict-aliasing rule is
 * broken.
 */
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;

   /* vdest.val[3] carries the pixel to the left of the next group. */
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_avg4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t vrp, vpp;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrp.val[0] = vreinterpret_u8_u32(vtmp.val[0]);
      vrp.val[1] = vreinterpret_u8_u32(vtmp.val[1]);
      vrp.val[2] = vreinterpret_u8_u32(vtmp.val[2]);
      vrp.val[3] = vreinterpret_u8_u32(vtmp.val[3]);

      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vpp.val[0] = vreinterpret_u8_u32(vtmp.val[0]);
      vpp.val[1] = vreinterpret_u8_u32(vtmp.val[1]);
      vpp.val[2] = vreinterpret_u8_u32(vtmp.val[2]);
      vpp.val[3] = vreinterpret_u8_u32(vtmp.val[3]);

      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vtmp.val[0] = vreinterpret_u32_u8(vdest.val[0]);
      vtmp.val[1] = vreinterpret_u32_u8(vdest.val[1]);
      vtmp.val[2] = vreinterpret_u32_u8(vdest.val[2]);
      vtmp.val[3] = vreinterpret_u32_u8(vdest.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), vtmp, 0);
   }
}
246
/* Vectorised Paeth predictor, evaluated independently on each of the eight
 * byte lanes.
 *
 * a, b, c: the three candidate bytes; the callers in this file pass the pixel
 *          to the left, the pixel above and the pixel above-left.
 *
 * Returns, per lane, 'a' when pa <= pb and pa <= pc, otherwise 'b' when
 * pb <= pc, otherwise 'c', where pa = |b - c|, pb = |a - c| and
 * pc = |a + b - 2c|.  The distances are computed in 16-bit lanes so that
 * a + b and 2c cannot wrap.
 */
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint16x8_t sum_ab = vaddl_u8(a, b);
   uint16x8_t twice_c = vaddl_u8(c, c);
   uint16x8_t dist_a = vabdl_u8(b, c);               /* pa */
   uint16x8_t dist_b = vabdl_u8(a, c);               /* pb */
   uint16x8_t dist_c = vabdq_u16(sum_ab, twice_c);   /* pc */

   /* Each comparison yields an all-ones or all-zero mask per 16-bit lane. */
   uint16x8_t a_le_b = vcleq_u16(dist_a, dist_b);
   uint16x8_t a_le_c = vcleq_u16(dist_a, dist_c);
   uint16x8_t b_le_c = vcleq_u16(dist_b, dist_c);
   uint16x8_t pick_a = vandq_u16(a_le_b, a_le_c);

   /* Narrow the masks to bytes and select: first b versus c, then a versus
    * the winner of that choice.
    */
   uint8x8_t b_or_c = vbsl_u8(vmovn_u16(b_le_c), b, c);

   return vbsl_u8(vmovn_u16(pick_a), a, b_or_c);
}
273
/* Reverse the PNG "Paeth" filter in place on a row of 3-byte pixels.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: the row above.
 *
 * Every reconstructed pixel is the filtered pixel plus paeth(left, above,
 * above-left), byte-wise and wrapping modulo 256; the left and above-left
 * neighbours of the first pixel are taken as zero.
 *
 * Four pixels (12 bytes) are handled per iteration.  Both rows are read with
 * 16-byte loads and every pixel is written with a 4-byte lane store at a
 * 3-byte stride, the surplus byte being overwritten by the following store.
 *
 * NOTE(review): the last iteration therefore loads beyond the group it
 * processes in both rows and writes one byte beyond it in 'row', and a
 * rowbytes value that is not a multiple of 12 is rounded up to a whole group.
 * This presumably relies on the caller's buffers being padded - confirm.
 *
 * The 128-bit loads are split with vget_low_u8/vget_high_u8 and the stores use
 * vreinterpret_u32_u8, instead of reading the vectors through pointers to an
 * unrelated type, so that no strict-aliasing rule is broken.
 */
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);   /* above-left of the next group */
   uint8x8x4_t vdest;

   /* vdest.val[3] carries the pixel to the left of the next group. */
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrp.val[0] = vget_low_u8(vtmp);
   vrp.val[1] = vget_high_u8(vtmp);

   png_debug(1, "in png_read_filter_row_paeth3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;

      vtmp = vld1q_u8(pp);
      vpp.val[0] = vget_low_u8(vtmp);
      vpp.val[1] = vget_high_u8(vtmp);

      /* Pixel 0 */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      /* Pixel 1 (starts at byte 3 of each row) */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      /* Pixel 2 (starts at byte 6 of each row) */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      /* Pixel 3 (starts at byte 9 of each row) */
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      /* Fetch the next group before storing: the last store below spills one
       * byte onto rp + 12, which has to be read while it is still unmodified.
       */
      vtmp = vld1q_u8(rp + 12);
      vrp.val[0] = vget_low_u8(vtmp);
      vrp.val[1] = vget_high_u8(vtmp);

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      /* The last pixel above this group is above-left of the next group. */
      vlast = vtmp2;

      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp),
         vreinterpret_u32_u8(vdest.val[3]), 0);
      rp += 3;
   }
}
341
/* Reverse the PNG "Paeth" filter in place on a row of 4-byte pixels.
 *
 * row_info: only 'rowbytes' is read; it bounds the loop.
 * row:      filtered bytes on entry, reconstructed bytes on return.
 * prev_row: the row above.
 *
 * Every reconstructed pixel is the filtered pixel plus paeth(left, above,
 * above-left), byte-wise and wrapping modulo 256; the left and above-left
 * neighbours of the first pixel are taken as zero.
 *
 * Four pixels (16 bytes) are handled per iteration.  vld4_u32 de-interleaves
 * 32-bit words so that lane 0 of val[k] holds pixel k of the group; only lane
 * 0 of each vector is stored back.
 *
 * NOTE(review): vld4_u32 fills two lanes per vector, i.e. it reads 32 bytes
 * from each row although only 16 are used, and a rowbytes value that is not a
 * multiple of 16 is rounded up to a whole group.  This presumably relies on
 * the caller's buffers being padded - confirm.
 *
 * Conversions between the 32-bit and 8-bit views of the vectors use
 * vreinterpret_u8_u32/vreinterpret_u32_u8, instead of reading the vectors
 * through pointers to an unrelated type, so that no strict-aliasing rule is
 * broken.
 */
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);   /* above-left of the next group */
   uint8x8x4_t vdest;

   /* vdest.val[3] carries the pixel to the left of the next group. */
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_paeth4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t vrp, vpp;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrp.val[0] = vreinterpret_u8_u32(vtmp.val[0]);
      vrp.val[1] = vreinterpret_u8_u32(vtmp.val[1]);
      vrp.val[2] = vreinterpret_u8_u32(vtmp.val[2]);
      vrp.val[3] = vreinterpret_u8_u32(vtmp.val[3]);

      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vpp.val[0] = vreinterpret_u8_u32(vtmp.val[0]);
      vpp.val[1] = vreinterpret_u8_u32(vtmp.val[1]);
      vpp.val[2] = vreinterpret_u8_u32(vtmp.val[2]);
      vpp.val[3] = vreinterpret_u8_u32(vtmp.val[3]);

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      /* The last pixel above this group is above-left of the next group. */
      vlast = vpp.val[3];

      vtmp.val[0] = vreinterpret_u32_u8(vdest.val[0]);
      vtmp.val[1] = vreinterpret_u32_u8(vdest.val[1]);
      vtmp.val[2] = vreinterpret_u32_u8(vdest.val[2]);
      vtmp.val[3] = vreinterpret_u32_u8(vdest.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), vtmp, 0);
   }
}
384
385#endif /* PNG_ARM_NEON_OPT > 0 */
386#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
387#endif /* READ */
388