// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "./neon.h"
#include "../enc/vp8enci.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2 (35468 / 2). See dec_neon.c.
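// Note (for reference): 20091 and 35468 are the usual VP8 fixed-point
// constants, roughly sqrt(2) * cos(pi / 8) * 2^16 - 2^16 and
// sqrt(2) * sin(pi / 8) * 2^16, respectively.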

// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                            const int16x8_t dst01,
                                            const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
                               const uint8_t* const ref, uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the reference pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4(dst, out01, out23);
  }
}

static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2(E0, E1, rows);
}

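// The inverse transform is applied as two identical passes: the
// Transpose8x2() at the end of TransformPass() switches between column and
// row order, so the second call processes the other direction.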
static void ITransformOne(const uint8_t* ref,
                          const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass(&rows);
  TransformPass(&rows);
  Add4x4(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne(const uint8_t* ref,
                          const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    //  q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // ((val) + 4) >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

#endif    // USE_INTRINSICS

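// 'do_two' means two transforms are applied on two horizontally adjacent
// 4x4 blocks: the second block's coefficients start at in + 16 and its
// pixels at ref + 4 / dst + 4.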
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4(const uint8_t* src) {
  uint32x4_t out = { 0, 0, 0, 0 };
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.
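// Note (for reference): the 2217 and 5352 constants used below are,
// roughly, sqrt(2) * sin(pi / 8) * 4096 and sqrt(2) * cos(pi / 8) * 4096,
// i.e. the same trigonometric constants as above in 12-bit fixed point.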

#if defined(USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                                         const int16x4_t C, const int16x4_t D,
                                         int16x8_t* const out01,
                                         int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
                                         const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4(src);
    const uint8x16_t R0 = Load4x4(ref);
    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1st pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
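    // With the extra (1 << 16), vaddhn_s32() below returns
    // ((a2_p_a3 + 12000) >> 16) + 1; adding the all-ones 'a3_eq_0' mask
    // (-1 when a3 == 0) then yields ((a2_p_a3 + 12000) >> 16) + (a3 != 0).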
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values into q4 and q6
383    "vtrn.32     q4, q5                       \n"
384    "vtrn.32     q6, q7                       \n"
385
386    // d[0-3] = src - ref
387    "vsubl.u8    q0, d8, d12                  \n"
388    "vsubl.u8    q1, d9, d13                  \n"
389
390    // load coeff16 into q8(d16=5352, d17=2217)
391    "vld1.16     {q8}, [%[coeff16]]           \n"
392
393    // load coeff32 high half into q9 = 1812, q10 = 937
394    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
395
396    // load coeff32 low half into q11=12000, q12=51000
397    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
398
399    // part 1
400    // Transpose. Register dN is the same as dN in C
401    "vtrn.32         d0, d2                   \n"
402    "vtrn.32         d1, d3                   \n"
403    "vtrn.16         d0, d1                   \n"
404    "vtrn.16         d2, d3                   \n"
405
406    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
407    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
408    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
409    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
410
411    "vadd.s16        d0, d4, d5               \n" // a0 + a1
412    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
413    "vsub.s16        d2, d4, d5               \n" // a0 - a1
414    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}

#endif

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

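// The input 'src' holds the DC coefficient of each of the 16 luma 4x4 blocks
// (one value every 'stride' = 16 coefficients).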
static void FTransformWHT(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the USE_INTRINSICS define.
// With gcc-4.8, it's only slightly slower than the inlined assembly.
#if defined(USE_INTRINSICS)

// Zero-extends a uint16x4_t 'v' to an int32x4_t.
static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
  return vreinterpretq_s32_u32(vmovl_u16(v));
}

// Does a regular 4x4 transpose followed by an adjustment of the upper columns
// in the inner rows to restore the source order of differences,
// i.e., a0 - a1 | a3 - a2.
static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
  int32x4x4_t out = Transpose4x4(rows);
  // restore source order in the columns containing differences.
  const int32x2_t r1h = vget_high_s32(out.val[1]);
  const int32x2_t r2h = vget_high_s32(out.val[2]);
  out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
  out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
  return out;
}

static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
                                                   const uint8x8_t r2r3) {
  // a0 = in[0] + in[2] | a1 = in[1] + in[3]
  const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
  // a3 = in[0] - in[2] | a2 = in[1] - in[3]
  const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
  const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
  const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
  // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
  // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
  // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
  const int16x8x2_t transpose =
      vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
  // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
  const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
                                      vget_low_s16(transpose.val[1]));
  const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
                                      vget_high_s16(transpose.val[1]));
  // [0]: tmp[3] [1]: tmp[2]
  const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
  const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
  return res;
}

static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
  // a0 = tmp[0 + i] + tmp[8 + i];
  const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
  // a1 = tmp[4 + i] + tmp[12 + i];
  const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
  // a2 = tmp[4 + i] - tmp[12 + i];
  const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
  // a3 = tmp[0 + i] - tmp[8 + i];
  const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
  const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
  const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
  const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
  const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
  const int32x4x4_t res = { { b0, b1, b2, b3 } };
  return res;
}

// Calculate the weighted sum of the rows in 'b'.
static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
                                      const int32x4_t w0, const int32x4_t w1,
                                      const int32x4_t w2, const int32x4_t w3) {
  const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
  const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
  const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
  const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
  const int64x2_t sum1 = vpaddlq_s32(s3);
  const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
  return sum2;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
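// The result is the absolute difference of the weighted sums computed for
// 'a' and 'b', scaled down by 5 bits.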
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  uint32x4_t d0d1 = { 0, 0, 0, 0 };
  uint32x4_t d2d3 = { 0, 0, 0, 0 };
  LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
  LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
  LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
  LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
  LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
  LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
  LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
  LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33

  {
    // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
    // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
    const uint16x8x2_t tmp =
        vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
    const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
    const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
    const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
                                                    vget_low_u8(d2d3u8));
    const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
                                                    vget_high_u8(d2d3u8));
    const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
    const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
    const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
    const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
    const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
    const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
    const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
    const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
    const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
    const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
    const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
                                    vreinterpret_s32_s64(sum2));
    const int32x2_t res = vshr_n_s32(diff, 5);
    return vget_lane_s32(res, 0);
  }
}

#undef LOAD_LANE_32b

#else

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int kBPS = BPS;
  const uint8_t* A = a;
  const uint8_t* B = b;
  const uint16_t* W = w;
  int sum;
  __asm__ volatile (
    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
    "vld1.32         d2[1], [%[a]]            \n"

    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
    "vld1.32         d3[1], [%[b]]            \n"

    // a d0/d2, b d1/d3
    // d0/d1: 01 01 01 01
    // d2/d3: 23 23 23 23
    // But: it goes 01 45 23 67
    // Notice the middle values are transposed
    "vtrn.16         q0, q1                   \n"

    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
    "vaddl.u8        q2, d0, d2               \n"
    "vaddl.u8        q10, d1, d3              \n"
    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
    "vsubl.u8        q3, d0, d2               \n"
    "vsubl.u8        q11, d1, d3              \n"

    // tmp[0] = a0 + a1
    "vpaddl.s16      q0, q2                   \n"
    "vpaddl.s16      q8, q10                  \n"

    // tmp[1] = a3 + a2
    "vpaddl.s16      q1, q3                   \n"
    "vpaddl.s16      q9, q11                  \n"

    // No pair subtract
    // q2 = {a0, a3}
    // q3 = {a1, a2}
    "vtrn.16         q2, q3                   \n"
    "vtrn.16         q10, q11                 \n"

    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
    "vsubl.s16       q12, d4, d6              \n"
    "vsubl.s16       q13, d5, d7              \n"
    "vsubl.s16       q14, d20, d22            \n"
    "vsubl.s16       q15, d21, d23            \n"

    // separate tmp[3] and tmp[2]
    // q12 = tmp[3]
    // q13 = tmp[2]
    "vtrn.32         q12, q13                 \n"
    "vtrn.32         q14, q15                 \n"

    // Transpose tmp for a
    "vswp            d1, d26                  \n" // vtrn.64
    "vswp            d3, d24                  \n" // vtrn.64
    "vtrn.32         q0, q1                   \n"
    "vtrn.32         q13, q12                 \n"

    // Transpose tmp for b
    "vswp            d17, d30                 \n" // vtrn.64
    "vswp            d19, d28                 \n" // vtrn.64
    "vtrn.32         q8, q9                   \n"
    "vtrn.32         q15, q14                 \n"

    // The first Q register is a, the second b.
    // q0/8 tmp[0-3]
    // q13/15 tmp[4-7]
    // q1/9 tmp[8-11]
    // q12/14 tmp[12-15]

    // These are still in 01 45 23 67 order. We fix it easily in the addition
    // case but the subtraction propagates them.
    "vswp            d3, d27                  \n"
    "vswp            d19, d31                 \n"

    // a0 = tmp[0] + tmp[8]
    "vadd.s32        q2, q0, q1               \n"
    "vadd.s32        q3, q8, q9               \n"

    // a1 = tmp[4] + tmp[12]
    "vadd.s32        q10, q13, q12            \n"
    "vadd.s32        q11, q15, q14            \n"

    // a2 = tmp[4] - tmp[12]
    "vsub.s32        q13, q13, q12            \n"
    "vsub.s32        q15, q15, q14            \n"

    // a3 = tmp[0] - tmp[8]
    "vsub.s32        q0, q0, q1               \n"
    "vsub.s32        q8, q8, q9               \n"

    // b0 = a0 + a1
    "vadd.s32        q1, q2, q10              \n"
    "vadd.s32        q9, q3, q11              \n"

    // b1 = a3 + a2
    "vadd.s32        q12, q0, q13             \n"
    "vadd.s32        q14, q8, q15             \n"

    // b2 = a3 - a2
    "vsub.s32        q0, q0, q13              \n"
    "vsub.s32        q8, q8, q15              \n"

    // b3 = a0 - a1
    "vsub.s32        q2, q2, q10              \n"
    "vsub.s32        q3, q3, q11              \n"

    "vld1.64         {q10, q11}, [%[w]]       \n"

    // abs(b0)
    "vabs.s32        q1, q1                   \n"
    "vabs.s32        q9, q9                   \n"
    // abs(b1)
    "vabs.s32        q12, q12                 \n"
    "vabs.s32        q14, q14                 \n"
    // abs(b2)
    "vabs.s32        q0, q0                   \n"
    "vabs.s32        q8, q8                   \n"
    // abs(b3)
    "vabs.s32        q2, q2                   \n"
    "vabs.s32        q3, q3                   \n"

    // expand w before using.
    "vmovl.u16       q13, d20                 \n"
    "vmovl.u16       q15, d21                 \n"

    // w[0] * abs(b0)
    "vmul.u32        q1, q1, q13              \n"
    "vmul.u32        q9, q9, q13              \n"

    // w[4] * abs(b1)
    "vmla.u32        q1, q12, q15             \n"
    "vmla.u32        q9, q14, q15             \n"

    // expand w before using.
    "vmovl.u16       q13, d22                 \n"
    "vmovl.u16       q15, d23                 \n"

    // w[8] * abs(b2)
    "vmla.u32        q1, q0, q13              \n"
    "vmla.u32        q9, q8, q13              \n"

    // w[12] * abs(b3)
    "vmla.u32        q1, q2, q15              \n"
    "vmla.u32        q9, q3, q15              \n"

    // Sum the arrays
    "vpaddl.u32      q1, q1                   \n"
    "vpaddl.u32      q9, q9                   \n"
    "vadd.u64        d2, d3                   \n"
    "vadd.u64        d18, d19                 \n"

    // Hadamard transform needs 4 bits of extra precision (2 bits in each
    // direction) for dynamic range. Weights w[] are 16 bits at max, so the
    // maximum precision for coeff is 8 bits of input + 4 bits of Hadamard
    // transform + 16 bits for w[] + 2 bits of abs() summation.
    //
    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
    // A-OK.

    // sum2 - sum1
    "vsub.u32        d0, d2, d18              \n"
    // abs(sum2 - sum1)
    "vabs.s32        d0, d0                   \n"
    // abs(sum2 - sum1) >> 5
    "vshr.u32        d0, #5                   \n"

    // It would be better to move the value straight into r0 but I'm not
    // entirely sure how this works with inline assembly.
    "vmov.32         %[sum], d0[0]            \n"

    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
    : [kBPS] "r"(kBPS)
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
  );

  return sum;
}

#endif  // USE_INTRINSICS

static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        histo->distribution[out[k]]++;
      }
    }
  }
}

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
                                        const uint8_t* const b,
                                        uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
  *sum = vpadalq_u16(*sum, prod);      // pair-wise add and accumulate
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt(uint32x4_t sum) {
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
  return (int)sum3;
}

static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = { 0, 0, 0, 0 };
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt(sum);
}

static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = { 0, 0, 0, 0 };
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt(sum);
}

static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = { 0, 0, 0, 0 };
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt(sum);
}

static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t a0 = Load4x4(a);
  const uint8x16_t b0 = Load4x4(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
  return SumToInt(vpaddlq_u16(prod));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now, and the vtbl?
// instructions are unavailable in iOS/arm64 builds. Disable these functions
// in those cases.
#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))

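// Quantizes 8 coefficients starting at 'in + offset', stores the
// corresponding dequantized values back into 'in' and returns the
// sign-restored quantized levels.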
static int16x8_t Quantize(int16_t* const in,
                          const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}

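// Byte-pair indices into the 32 bytes of {out0, out1}; together they
// implement the usual VP8 zigzag order
// {0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}.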
static const uint8_t kShuffles[4][8] = {
 { 0,   1,  2,  3,  8,  9, 16, 17 },
 { 10, 11,  4,  5,  6,  7, 12, 13 },
 { 18, 19, 24, 25, 26, 27, 20, 21 },
 { 14, 15, 22, 23, 28, 29, 30, 31 }
};

static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize(in, mtx, 0);
  const int16x8_t out1 = Quantize(in, mtx, 8);
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
  vst1_u8((uint8_t*)(out +  4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
  vst1_u8((uint8_t*)(out +  8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));
  vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

#endif   // !WORK_AROUND_GCC && !__aarch64__

#endif   // WEBP_USE_NEON

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

void VP8EncDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;

  VP8FTransformWHT = FTransformWHT;

  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8CollectHistogram = CollectHistogram;
  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;
#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
  VP8EncQuantizeBlock = QuantizeBlock;
#endif
#endif   // WEBP_USE_NEON
}
