// enc_neon.c revision 0406ce1417f76f2034833414dcecc9f56253640c
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)

#include "./dsp.h"

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

#if defined(WEBP_USE_NEON)

#include "../enc/vp8enci.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOneNEON in the decoder, except
// that the result is added to *ref and written to *dst. See the comments there
// for algorithmic explanations.
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void ITransformOne(const uint8_t* ref, 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int16_t* in, uint8_t* dst) { 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int kBPS = BPS; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int16_t kC1C2[] = { 20091, 17734, 0, 0 }; // kC1 / (kC2 >> 1) / 0 / 0 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 35d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) __asm__ volatile ( 36d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vld1.16 {q1, q2}, [%[in]] \n" 37d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vld1.16 {d0}, [%[kC1C2]] \n" 38d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 39d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d2: in[0] 40d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d3: in[8] 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d4: in[4] 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d5: in[12] 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vswp d3, d4 \n" 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // q8 = {in[4], in[12]} * kC1 * 2 >> 16 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // q9 = {in[4], in[12]} * kC2 >> 16 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q8, q2, d0[0] \n" 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d22 = a = in[0] + in[8] 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d23 = b = in[0] - in[8] 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
"vqadd.s16 d22, d2, d3 \n" 534e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 544e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) 554e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) // q8 = in[4]/[12] * kC1 >> 16 564e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 574e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add {in[4], in[12]} back after the multiplication. 59d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 60d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 61d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d20 = c = in[4]*kC2 - in[12]*kC1 62d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d21 = d = in[4]*kC1 + in[12]*kC2 63d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqsub.s16 d20, d18, d17 \n" 64d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqadd.s16 d21, d19, d16 \n" 65d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 66d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d2 = tmp[0] = a + d 67d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d3 = tmp[1] = b + c 68d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d4 = tmp[2] = b - c 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d5 = tmp[3] = a - d 70d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 71d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqadd.s16 d3, d23, d20 \n" 72d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 73d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 74d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 75d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 
"vzip.16 q1, q2 \n" 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vswp d3, d4 \n" 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // q9 = {tmp[4], tmp[12]} * kC2 >> 16 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q8, q2, d0[0] \n" 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 84a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 85a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // d22 = a = tmp[0] + tmp[8] 86a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // d23 = b = tmp[0] - tmp[8] 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d22, d2, d3 \n" 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d20 = c = in[4]*kC2 - in[12]*kC1 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d21 = d = in[4]*kC1 + in[12]*kC2 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d20, d18, d17 \n" 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d21, d19, d16 \n" 97a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 98a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // d2 = tmp[0] = a + d 
99a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // d3 = tmp[1] = b + c 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d4 = tmp[2] = b - c 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // d5 = tmp[3] = a - d 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d3, d23, d20 \n" 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d6[0], [%[ref]], %[kBPS] \n" 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d6[1], [%[ref]], %[kBPS] \n" 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d7[0], [%[ref]], %[kBPS] \n" 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d7[1], [%[ref]], %[kBPS] \n" 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub %[ref], %[ref], %[kBPS], lsl #2 \n" 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (val) + 4 >> 3 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d2, d2, #3 \n" 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d3, d3, #3 \n" 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d4, d4, #3 \n" 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d5, d5, #3 \n" 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1201320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci "vzip.16 q1, q2 \n" 1211320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 
"vzip.16 q1, q2 \n" 1221320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 1231320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // Must accumulate before saturating 1241320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci "vmovl.u8 q8, d6 \n" 1251320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci "vmovl.u8 q9, d7 \n" 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q1, q1, q8 \n" 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q2, q2, q9 \n" 1291320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 1301320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci "vqmovun.s16 d0, q1 \n" 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqmovun.s16 d1, q2 \n" 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 1361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci "vst1.32 d1[1], [%[dst]] \n" 1371320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 1381320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci : [in] "+r"(in), [dst] "+r"(dst) // modified registers 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ); 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void ITransform(const 
uint8_t* ref, 1451320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci const int16_t* in, uint8_t* dst, int do_two) { 1461320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci ITransformOne(ref, in, dst); 1471320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci if (do_two) { 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ITransformOne(ref + 4, in + 16, dst + 4); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Same code as dec_neon.c 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void ITransformWHT(const int16_t* in, int16_t* out) { 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int kStep = 32; // The store is only incrementing the pointer as if we 1551320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // had stored a single byte. 
1561320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci __asm__ volatile ( 1571320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // part 1 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // load data into q0, q1 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 {q0, q1}, [%[in]] \n" 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 16858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 17058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 17158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // Transpose 17258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] 17358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] 174a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vswp d1, d4 \n" // vtrn.64 q0, q2 17558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vswp d3, d6 \n" // vtrn.64 q1, q3 176a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vtrn.32 q0, q1 \n" 17758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben 
Murdoch "vtrn.32 q2, q3 \n" 17858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 17958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vmov.s32 q4, #3 \n" // dc = 3 18058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 18158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] 18258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] 18358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] 18458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] 18558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 18658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s32 q0, q6, q7 \n" 18758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 18858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s32 q1, q9, q8 \n" 18958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 19058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s32 q2, q6, q7 \n" 19158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 19258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s32 q3, q9, q8 \n" 19358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 19458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 19558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // set the results to output 19658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vst1.16 d0[0], [%[out]], %[kStep] \n" 19758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vst1.16 d1[0], [%[out]], %[kStep] \n" 19858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vst1.16 d2[0], [%[out]], %[kStep] \n" 199d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d3[0], [%[out]], %[kStep] \n" 
200d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d0[1], [%[out]], %[kStep] \n" 201d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d1[1], [%[out]], %[kStep] \n" 202d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d2[1], [%[out]], %[kStep] \n" 203d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d3[1], [%[out]], %[kStep] \n" 204d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d0[2], [%[out]], %[kStep] \n" 205d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d1[2], [%[out]], %[kStep] \n" 206d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d2[2], [%[out]], %[kStep] \n" 207d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d3[2], [%[out]], %[kStep] \n" 208d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d0[3], [%[out]], %[kStep] \n" 209d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d1[3], [%[out]], %[kStep] \n" 210d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d2[3], [%[out]], %[kStep] \n" 211d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vst1.16 d3[3], [%[out]], %[kStep] \n" 212d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 213d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) : [out] "+r"(out) // modified registers 214d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) : [in] "r"(in), [kStep] "r"(kStep) // constants 215d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) : "memory", "q0", "q1", "q2", "q3", "q4", 216d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "q5", "q6", "q7", "q8", "q9" // clobbered 217d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) ); 218d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)} 219d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 220d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne 
(Richard Coles)// Forward transform. 221d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 222d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm 223d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)static const int16_t kCoeff16[] = { 224d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 225d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)}; 226d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)static const int32_t kCoeff32[] = { 227d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 1812, 1812, 1812, 1812, 228a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 937, 937, 937, 937, 22958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 12000, 12000, 12000, 12000, 23058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 51000, 51000, 51000, 51000 23158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch}; 23258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 23358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdochstatic void FTransform(const uint8_t* src, const uint8_t* ref, 23458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch int16_t* out) { 23558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch const int kBPS = BPS; 23658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch const uint8_t* src_ptr = src; 237a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) const uint8_t* ref_ptr = ref; 23858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch const int16_t* coeff16 = kCoeff16; 239a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) const int32_t* coeff32 = kCoeff32; 24058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 24158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch __asm__ volatile ( 24258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // load src into q4, q5 in high half 24358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d8}, [%[src_ptr]], %[kBPS] 
\n" 24458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n" 24558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n" 24658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d11}, [%[src_ptr]] \n" 24758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 24858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // load ref into q6, q7 in high half 24958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n" 25058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n" 25158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n" 25258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.8 {d15}, [%[ref_ptr]] \n" 253d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 254d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // Pack the high values in to q4 and q6 255d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vtrn.32 q4, q5 \n" 256d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vtrn.32 q6, q7 \n" 257d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 258d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) // d[0-3] = src - ref 259d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vsubl.u8 q0, d8, d12 \n" 260a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vsubl.u8 q1, d9, d13 \n" 26158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 26258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // load coeff16 into q8(d16=5352, d17=2217) 26358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 {q8}, [%[coeff16]] \n" 26458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 26558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // load coeff32 high half into q9 = 1812, q10 = 937 26658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.32 {q9, q10}, [%[coeff32]]! 
\n" 267a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 26858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // load coeff32 low half into q11=12000, q12=51000 269a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vld1.32 {q11,q12}, [%[coeff32]] \n" 27058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 27158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // part 1 27258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // Transpose. Register dN is the same as dN in C 27358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.32 d0, d2 \n" 27458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.32 d1, d3 \n" 27558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.16 d0, d1 \n" 27658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.16 d2, d3 \n" 27758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 27858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 27958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 28058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 28158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 28258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 283d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vadd.s16 d0, d4, d5 \n" // a0 + a1 284d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 285d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vsub.s16 d2, d4, d5 \n" // a0 - a1 286d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3 287d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 288d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812 289d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vmlal.s16 q10, d7, 
d17 \n" // a3*2217 + 937 290a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 29158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 29258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 29358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9 29458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9 29558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d1, q9, #9 \n" 29658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d3, q10, #9 \n" 297a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 29858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // part 2 299a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] 30058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.32 d0, d2 \n" 30158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.32 d1, d3 \n" 30258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.16 d0, d1 \n" 30358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vtrn.16 d2, d3 \n" 30458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 30558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vmov.s16 d26, #7 \n" 30658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 30758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12] 30858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8] 30958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8] 31058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vadd.s16 d4, d4, d26 \n" // a1 + 7 31158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12] 31258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 
313d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7 314d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7 315d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 316d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000 317d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 318d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) 319d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles) "vceq.s16 d4, d7, #0 \n" 320a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 32158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshr.s16 d0, d0, #4 \n" 32258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshr.s16 d2, d2, #4 \n" 32358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 32458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 32558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 32658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 32758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vmvn d4, d4 \n" // !(d1 == 0) 32858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // op[4] = (c1*2217 + d1*5352 + 12000)>>16 32958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d1, q11, #16 \n" 33058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // op[4] += (d1!=0) 33158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vsub.s16 d1, d1, d4 \n" 33258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // op[12]= (d1*2217 - c1*5352 + 51000)>>16 33358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vshrn.s32 d3, q12, #16 \n" 33458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 33558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // set result to out array 
33658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vst1.16 {q0, q1}, [%[out]] \n" 33758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), 33858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch [coeff32] "+r"(coeff32) // modified registers 33958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16), 34058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch [out] "r"(out) // constants 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "q10", "q11", "q12", "q13" // clobbered 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ); 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 34558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch 34658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdochstatic void FTransformWHT(const int16_t* in, int16_t* out) { 34758e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch const int kStep = 32; 34858e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch __asm__ volatile ( 34958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // d0 = in[0 * 16] , d1 = in[1 * 16] 35058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch // d2 = in[2 * 16] , d3 = in[3 * 16] 35158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d0[0], [%[in]], %[kStep] \n" 35258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d1[0], [%[in]], %[kStep] \n" 35358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d2[0], [%[in]], %[kStep] \n" 35458e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d3[0], [%[in]], %[kStep] \n" 35558e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d0[1], [%[in]], %[kStep] \n" 35658e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d1[1], [%[in]], %[kStep] \n" 357a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vld1.16 d2[1], 
[%[in]], %[kStep] \n" 358a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "vld1.16 d3[1], [%[in]], %[kStep] \n" 35958e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d0[2], [%[in]], %[kStep] \n" 36058e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d1[2], [%[in]], %[kStep] \n" 36158e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d2[2], [%[in]], %[kStep] \n" 36258e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d3[2], [%[in]], %[kStep] \n" 36358e6fbe4ee35d65e14b626c557d37565bf8ad179Ben Murdoch "vld1.16 d0[3], [%[in]], %[kStep] \n" 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 d1[3], [%[in]], %[kStep] \n" 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 d2[3], [%[in]], %[kStep] \n" 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 d3[3], [%[in]], %[kStep] \n" 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vaddl.s16 q2, d0, d2 \n" // a0=(in[0*16]+in[2*16]) 369 "vaddl.s16 q3, d1, d3 \n" // a1=(in[1*16]+in[3*16]) 370 "vsubl.s16 q4, d1, d3 \n" // a2=(in[1*16]-in[3*16]) 371 "vsubl.s16 q5, d0, d2 \n" // a3=(in[0*16]-in[2*16]) 372 373 "vqadd.s32 q6, q2, q3 \n" // a0 + a1 374 "vqadd.s32 q7, q5, q4 \n" // a3 + a2 375 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 376 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 377 378 // Transpose 379 // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7] 380 // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15] 381 "vswp d13, d16 \n" // vtrn.64 q0, q2 382 "vswp d15, d18 \n" // vtrn.64 q1, q3 383 "vtrn.32 q6, q7 \n" 384 "vtrn.32 q8, q9 \n" 385 386 "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8] 387 "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12] 388 "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12] 389 "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8] 390 391 "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1 392 "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2 393 
"vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 394 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 395 396 "vshrn.s32 d18, q4, #1 \n" // b0 >> 1 397 "vshrn.s32 d19, q5, #1 \n" // b1 >> 1 398 "vshrn.s32 d20, q6, #1 \n" // b2 >> 1 399 "vshrn.s32 d21, q7, #1 \n" // b3 >> 1 400 401 "vst1.16 {q9, q10}, [%[out]] \n" 402 403 : [in] "+r"(in) 404 : [kStep] "r"(kStep), [out] "r"(out) 405 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", 406 "q6", "q7", "q8", "q9", "q10" // clobbered 407 ) ; 408} 409 410//------------------------------------------------------------------------------ 411// Texture distortion 412// 413// We try to match the spectral content (weighted) between source and 414// reconstructed samples. 415 416// Hadamard transform 417// Returns the weighted sum of the absolute value of transformed coefficients. 418// This uses a TTransform helper function in C 419static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 420 const uint16_t* const w) { 421 const int kBPS = BPS; 422 const uint8_t* A = a; 423 const uint8_t* B = b; 424 const uint16_t* W = w; 425 int sum; 426 __asm__ volatile ( 427 "vld1.32 d0[0], [%[a]], %[kBPS] \n" 428 "vld1.32 d0[1], [%[a]], %[kBPS] \n" 429 "vld1.32 d2[0], [%[a]], %[kBPS] \n" 430 "vld1.32 d2[1], [%[a]] \n" 431 432 "vld1.32 d1[0], [%[b]], %[kBPS] \n" 433 "vld1.32 d1[1], [%[b]], %[kBPS] \n" 434 "vld1.32 d3[0], [%[b]], %[kBPS] \n" 435 "vld1.32 d3[1], [%[b]] \n" 436 437 // a d0/d2, b d1/d3 438 // d0/d1: 01 01 01 01 439 // d2/d3: 23 23 23 23 440 // But: it goes 01 45 23 67 441 // Notice the middle values are transposed 442 "vtrn.16 q0, q1 \n" 443 444 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} 445 "vaddl.u8 q2, d0, d2 \n" 446 "vaddl.u8 q10, d1, d3 \n" 447 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} 448 "vsubl.u8 q3, d0, d2 \n" 449 "vsubl.u8 q11, d1, d3 \n" 450 451 // tmp[0] = a0 + a1 452 "vpaddl.s16 q0, q2 \n" 453 "vpaddl.s16 q8, q10 \n" 454 455 // tmp[1] = a3 + a2 456 "vpaddl.s16 q1, q3 \n" 457 "vpaddl.s16 q9, q11 \n" 458 459 // No pair 
subtract 460 // q2 = {a0, a3} 461 // q3 = {a1, a2} 462 "vtrn.16 q2, q3 \n" 463 "vtrn.16 q10, q11 \n" 464 465 // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} 466 "vsubl.s16 q12, d4, d6 \n" 467 "vsubl.s16 q13, d5, d7 \n" 468 "vsubl.s16 q14, d20, d22 \n" 469 "vsubl.s16 q15, d21, d23 \n" 470 471 // separate tmp[3] and tmp[2] 472 // q12 = tmp[3] 473 // q13 = tmp[2] 474 "vtrn.32 q12, q13 \n" 475 "vtrn.32 q14, q15 \n" 476 477 // Transpose tmp for a 478 "vswp d1, d26 \n" // vtrn.64 479 "vswp d3, d24 \n" // vtrn.64 480 "vtrn.32 q0, q1 \n" 481 "vtrn.32 q13, q12 \n" 482 483 // Transpose tmp for b 484 "vswp d17, d30 \n" // vtrn.64 485 "vswp d19, d28 \n" // vtrn.64 486 "vtrn.32 q8, q9 \n" 487 "vtrn.32 q15, q14 \n" 488 489 // The first Q register is a, the second b. 490 // q0/8 tmp[0-3] 491 // q13/15 tmp[4-7] 492 // q1/9 tmp[8-11] 493 // q12/14 tmp[12-15] 494 495 // These are still in 01 45 23 67 order. We fix it easily in the addition 496 // case but the subtraction propegates them. 497 "vswp d3, d27 \n" 498 "vswp d19, d31 \n" 499 500 // a0 = tmp[0] + tmp[8] 501 "vadd.s32 q2, q0, q1 \n" 502 "vadd.s32 q3, q8, q9 \n" 503 504 // a1 = tmp[4] + tmp[12] 505 "vadd.s32 q10, q13, q12 \n" 506 "vadd.s32 q11, q15, q14 \n" 507 508 // a2 = tmp[4] - tmp[12] 509 "vsub.s32 q13, q13, q12 \n" 510 "vsub.s32 q15, q15, q14 \n" 511 512 // a3 = tmp[0] - tmp[8] 513 "vsub.s32 q0, q0, q1 \n" 514 "vsub.s32 q8, q8, q9 \n" 515 516 // b0 = a0 + a1 517 "vadd.s32 q1, q2, q10 \n" 518 "vadd.s32 q9, q3, q11 \n" 519 520 // b1 = a3 + a2 521 "vadd.s32 q12, q0, q13 \n" 522 "vadd.s32 q14, q8, q15 \n" 523 524 // b2 = a3 - a2 525 "vsub.s32 q0, q0, q13 \n" 526 "vsub.s32 q8, q8, q15 \n" 527 528 // b3 = a0 - a1 529 "vsub.s32 q2, q2, q10 \n" 530 "vsub.s32 q3, q3, q11 \n" 531 532 "vld1.64 {q10, q11}, [%[w]] \n" 533 534 // abs(b0) 535 "vabs.s32 q1, q1 \n" 536 "vabs.s32 q9, q9 \n" 537 // abs(b1) 538 "vabs.s32 q12, q12 \n" 539 "vabs.s32 q14, q14 \n" 540 // abs(b2) 541 "vabs.s32 q0, q0 \n" 542 "vabs.s32 q8, q8 \n" 543 // abs(b3) 544 
"vabs.s32 q2, q2 \n" 545 "vabs.s32 q3, q3 \n" 546 547 // expand w before using. 548 "vmovl.u16 q13, d20 \n" 549 "vmovl.u16 q15, d21 \n" 550 551 // w[0] * abs(b0) 552 "vmul.u32 q1, q1, q13 \n" 553 "vmul.u32 q9, q9, q13 \n" 554 555 // w[4] * abs(b1) 556 "vmla.u32 q1, q12, q15 \n" 557 "vmla.u32 q9, q14, q15 \n" 558 559 // expand w before using. 560 "vmovl.u16 q13, d22 \n" 561 "vmovl.u16 q15, d23 \n" 562 563 // w[8] * abs(b1) 564 "vmla.u32 q1, q0, q13 \n" 565 "vmla.u32 q9, q8, q13 \n" 566 567 // w[12] * abs(b1) 568 "vmla.u32 q1, q2, q15 \n" 569 "vmla.u32 q9, q3, q15 \n" 570 571 // Sum the arrays 572 "vpaddl.u32 q1, q1 \n" 573 "vpaddl.u32 q9, q9 \n" 574 "vadd.u64 d2, d3 \n" 575 "vadd.u64 d18, d19 \n" 576 577 // Hadamard transform needs 4 bits of extra precision (2 bits in each 578 // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum 579 // precision for coeff is 8bit of input + 4bits of Hadamard transform + 580 // 16bits for w[] + 2 bits of abs() summation. 581 // 582 // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is 583 // A-OK. 584 585 // sum2 - sum1 586 "vsub.u32 d0, d2, d18 \n" 587 // abs(sum2 - sum1) 588 "vabs.s32 d0, d0 \n" 589 // abs(sum2 - sum1) >> 5 590 "vshr.u32 d0, #5 \n" 591 592 // It would be better to move the value straight into r0 but I'm not 593 // entirely sure how this works with inline assembly. 
594 "vmov.32 %[sum], d0[0] \n" 595 596 : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) 597 : [kBPS] "r"(kBPS) 598 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 599 "q10", "q11", "q12", "q13", "q14", "q15" // clobbered 600 ) ; 601 602 return sum; 603} 604 605static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 606 const uint16_t* const w) { 607 int D = 0; 608 int x, y; 609 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 610 for (x = 0; x < 16; x += 4) { 611 D += Disto4x4(a + x + y, b + x + y, w); 612 } 613 } 614 return D; 615} 616 617#endif // WEBP_USE_NEON 618 619//------------------------------------------------------------------------------ 620// Entry point 621 622extern void VP8EncDspInitNEON(void); 623 624void VP8EncDspInitNEON(void) { 625#if defined(WEBP_USE_NEON) 626 VP8ITransform = ITransform; 627 VP8FTransform = FTransform; 628 629 VP8ITransformWHT = ITransformWHT; 630 VP8FTransformWHT = FTransformWHT; 631 632 VP8TDisto4x4 = Disto4x4; 633 VP8TDisto16x16 = Disto16x16; 634#endif // WEBP_USE_NEON 635} 636 637#if defined(__cplusplus) || defined(c_plusplus) 638} // extern "C" 639#endif 640