// enc_neon.c — revision 1e7bf8805bd030c19924a5306837ecd72c295751
1a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Copyright 2012 Google Inc. All Rights Reserved. 2a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 3a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// This code is licensed under the same terms as WebM: 4a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Software License Agreement: http://www.webmproject.org/license/software/ 5a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 6a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// ----------------------------------------------------------------------------- 7a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 8010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// ARM NEON version of speed-critical encoding functions. 9a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch// 100529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// adapted from libvpx (http://www.webmproject.org/code/) 11a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 12010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "./dsp.h" 13a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 14010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#if defined(__cplusplus) || defined(c_plusplus) 15a02191e04bc25c4935f804f2c080ae28663d096dBen Murdochextern "C" { 16a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#endif 17a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 18a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch#if defined(WEBP_USE_NEON) 19a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 20010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)#include "../enc/vp8enci.h" 21010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 22010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)//------------------------------------------------------------------------------ 23010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Transforms (Paragraph 
14.4) 24010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 25010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Inverse transform. 26010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// This code is pretty much the same as TransformOneNEON in the decoder, except 27010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// for subtraction to *ref. See the comments there for algorithmic explanations. 28010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)static void ITransformOne(const uint8_t* ref, 29010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) const int16_t* in, uint8_t* dst) { 30010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) const int kBPS = BPS; 31010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) const int16_t kC1C2[] = { 20091, 17734, 0, 0 }; // kC1 / (kC2 >> 1) / 0 / 0 32010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 33010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) __asm__ volatile ( 34010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vld1.16 {q1, q2}, [%[in]] \n" 35010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vld1.16 {d0}, [%[kC1C2]] \n" 36010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 37010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d2: in[0] 38010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d3: in[8] 39010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d4: in[4] 40010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d5: in[12] 41010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vswp d3, d4 \n" 42010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 43010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // q8 = {in[4], in[12]} * kC1 * 2 >> 16 44010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // q9 = {in[4], in[12]} * kC2 >> 16 45010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 
"vqdmulh.s16 q8, q2, d0[0] \n" 46010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 47010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 48010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d22 = a = in[0] + in[8] 49010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d23 = b = in[0] - in[8] 50010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqadd.s16 d22, d2, d3 \n" 51010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 52010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 53010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // q8 = in[4]/[12] * kC1 >> 16 54010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 55010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 56010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // Add {in[4], in[12]} back after the multiplication. 57010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 58010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 59010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d20 = c = in[4]*kC2 - in[12]*kC1 60010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d21 = d = in[4]*kC1 + in[12]*kC2 61a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqsub.s16 d20, d18, d17 \n" 62a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqadd.s16 d21, d19, d16 \n" 63010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 64010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d2 = tmp[0] = a + d 65a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d3 = tmp[1] = b + c 66a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d4 = tmp[2] = b - c 67010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // d5 = tmp[3] = a - d 68010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 
69a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqadd.s16 d3, d23, d20 \n" 70a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqsub.s16 d4, d23, d20 \n" 71010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 72010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 73a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vzip.16 q1, q2 \n" 74f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) "vzip.16 q1, q2 \n" 755f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 76a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vswp d3, d4 \n" 77a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 78a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 79a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // q9 = {tmp[4], tmp[12]} * kC2 >> 16 80a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqdmulh.s16 q8, q2, d0[0] \n" 81a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqdmulh.s16 q9, q2, d0[1] \n" 82a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 83a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d22 = a = tmp[0] + tmp[8] 84a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d23 = b = tmp[0] - tmp[8] 85a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqadd.s16 d22, d2, d3 \n" 86a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqsub.s16 d23, d2, d3 \n" 87a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 88a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vshr.s16 q8, q8, #1 \n" 89a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqadd.s16 q8, q2, q8 \n" 90a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 91a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d20 = c = in[4]*kC2 - in[12]*kC1 92a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d21 = d = in[4]*kC1 + in[12]*kC2 93a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqsub.s16 d20, d18, d17 \n" 94a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 
"vqadd.s16 d21, d19, d16 \n" 95a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 96a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d2 = tmp[0] = a + d 97a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d3 = tmp[1] = b + c 98a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d4 = tmp[2] = b - c 99a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // d5 = tmp[3] = a - d 100a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqadd.s16 d2, d22, d21 \n" 101a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vqadd.s16 d3, d23, d20 \n" 102010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 103010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 104010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 105010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vld1.32 d6[0], [%[ref]], %[kBPS] \n" 106010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vld1.32 d6[1], [%[ref]], %[kBPS] \n" 107010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vld1.32 d7[0], [%[ref]], %[kBPS] \n" 108010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vld1.32 d7[1], [%[ref]], %[kBPS] \n" 1091320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 110010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "sub %[ref], %[ref], %[kBPS], lsl #2 \n" 111a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 112010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // (val) + 4 >> 3 113a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vrshr.s16 d2, d2, #3 \n" 114010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vrshr.s16 d3, d3, #3 \n" 115010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vrshr.s16 d4, d4, #3 \n" 116010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vrshr.s16 d5, d5, #3 \n" 117010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 118010d83a9304c5a91596085d917d248abff47903aTorne (Richard 
Coles) "vzip.16 q1, q2 \n" 119010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vzip.16 q1, q2 \n" 120010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 121010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // Must accumulate before saturating 122010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vmovl.u8 q8, d6 \n" 12346d4c2bc3267f3f028f39e7e311b0f89aba2e4fdTorne (Richard Coles) "vmovl.u8 q9, d7 \n" 12446d4c2bc3267f3f028f39e7e311b0f89aba2e4fdTorne (Richard Coles) 125010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqadd.s16 q1, q1, q8 \n" 126010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqadd.s16 q2, q2, q9 \n" 127010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 128010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqmovun.s16 d0, q1 \n" 129010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vqmovun.s16 d1, q2 \n" 130010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 131010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 132010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 133010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 134010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vst1.32 d1[1], [%[dst]] \n" 135010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 136010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) : [in] "+r"(in), [dst] "+r"(dst) // modified registers 137010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants 138010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered 139010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) ); 140010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)} 
1415f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 142010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)static void ITransform(const uint8_t* ref, 143010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) const int16_t* in, uint8_t* dst, int do_two) { 144010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) ITransformOne(ref, in, dst); 145010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) if (do_two) { 146010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) ITransformOne(ref + 4, in + 16, dst + 4); 147010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) } 148010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)} 149010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 150010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)// Same code as dec_neon.c 1511320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tuccistatic void ITransformWHT(const int16_t* in, int16_t* out) { 152010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) const int kStep = 32; // The store is only incrementing the pointer as if we 153010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // had stored a single byte. 
1541320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci __asm__ volatile ( 155010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) // part 1 1561320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // load data into q0, q1 1576e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) "vld1.16 {q0, q1}, [%[in]] \n" 158010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 159010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] 160010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] 161010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] 162010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] 1631320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 164010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 165010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 166010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 167010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 1681320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 169a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // Transpose 170a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] 171a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] 172010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) "vswp d1, d4 \n" // vtrn.64 q0, q2 173a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vswp d3, d6 \n" // vtrn.64 q1, q3 174a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vtrn.32 q0, q1 \n" 175a02191e04bc25c4935f804f2c080ae28663d096dBen 
Murdoch "vtrn.32 q2, q3 \n" 176a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 177a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vmov.s32 q4, #3 \n" // dc = 3 178a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 179a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] 180a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] 181a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] 182a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] 183a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 184a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vadd.s32 q0, q6, q7 \n" 185a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 186a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vadd.s32 q1, q9, q8 \n" 187a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 188a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vsub.s32 q2, q6, q7 \n" 189a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 190a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vsub.s32 q3, q9, q8 \n" 191a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 192a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch 193a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch // set the results to output 194a02191e04bc25c4935f804f2c080ae28663d096dBen Murdoch "vst1.16 d0[0], [%[out]], %[kStep] \n" 195 "vst1.16 d1[0], [%[out]], %[kStep] \n" 196 "vst1.16 d2[0], [%[out]], %[kStep] \n" 197 "vst1.16 d3[0], [%[out]], %[kStep] \n" 198 "vst1.16 d0[1], [%[out]], %[kStep] \n" 199 "vst1.16 d1[1], [%[out]], %[kStep] \n" 200 "vst1.16 d2[1], [%[out]], %[kStep] \n" 201 "vst1.16 d3[1], [%[out]], %[kStep] \n" 202 "vst1.16 
d0[2], [%[out]], %[kStep] \n" 203 "vst1.16 d1[2], [%[out]], %[kStep] \n" 204 "vst1.16 d2[2], [%[out]], %[kStep] \n" 205 "vst1.16 d3[2], [%[out]], %[kStep] \n" 206 "vst1.16 d0[3], [%[out]], %[kStep] \n" 207 "vst1.16 d1[3], [%[out]], %[kStep] \n" 208 "vst1.16 d2[3], [%[out]], %[kStep] \n" 209 "vst1.16 d3[3], [%[out]], %[kStep] \n" 210 211 : [out] "+r"(out) // modified registers 212 : [in] "r"(in), [kStep] "r"(kStep) // constants 213 : "memory", "q0", "q1", "q2", "q3", "q4", 214 "q5", "q6", "q7", "q8", "q9" // clobbered 215 ); 216} 217 218// Forward transform. 219 220// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm 221static const int16_t kCoeff16[] = { 222 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 223}; 224static const int32_t kCoeff32[] = { 225 1812, 1812, 1812, 1812, 226 937, 937, 937, 937, 227 12000, 12000, 12000, 12000, 228 51000, 51000, 51000, 51000 229}; 230 231static void FTransform(const uint8_t* src, const uint8_t* ref, 232 int16_t* out) { 233 const int kBPS = BPS; 234 const uint8_t* src_ptr = src; 235 const uint8_t* ref_ptr = ref; 236 const int16_t* coeff16 = kCoeff16; 237 const int32_t* coeff32 = kCoeff32; 238 239 __asm__ volatile ( 240 // load src into q4, q5 in high half 241 "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n" 242 "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n" 243 "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n" 244 "vld1.8 {d11}, [%[src_ptr]] \n" 245 246 // load ref into q6, q7 in high half 247 "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n" 248 "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n" 249 "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n" 250 "vld1.8 {d15}, [%[ref_ptr]] \n" 251 252 // Pack the high values in to q4 and q6 253 "vtrn.32 q4, q5 \n" 254 "vtrn.32 q6, q7 \n" 255 256 // d[0-3] = src - ref 257 "vsubl.u8 q0, d8, d12 \n" 258 "vsubl.u8 q1, d9, d13 \n" 259 260 // load coeff16 into q8(d16=5352, d17=2217) 261 "vld1.16 {q8}, [%[coeff16]] \n" 262 263 // load coeff32 high half into q9 = 1812, q10 = 937 264 "vld1.32 {q9, q10}, [%[coeff32]]! 
\n" 265 266 // load coeff32 low half into q11=12000, q12=51000 267 "vld1.32 {q11,q12}, [%[coeff32]] \n" 268 269 // part 1 270 // Transpose. Register dN is the same as dN in C 271 "vtrn.32 d0, d2 \n" 272 "vtrn.32 d1, d3 \n" 273 "vtrn.16 d0, d1 \n" 274 "vtrn.16 d2, d3 \n" 275 276 "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 277 "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 278 "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 279 "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 280 281 "vadd.s16 d0, d4, d5 \n" // a0 + a1 282 "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 283 "vsub.s16 d2, d4, d5 \n" // a0 - a1 284 "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3 285 286 "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812 287 "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937 288 "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 289 "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 290 291 // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9 292 // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9 293 "vshrn.s32 d1, q9, #9 \n" 294 "vshrn.s32 d3, q10, #9 \n" 295 296 // part 2 297 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] 298 "vtrn.32 d0, d2 \n" 299 "vtrn.32 d1, d3 \n" 300 "vtrn.16 d0, d1 \n" 301 "vtrn.16 d2, d3 \n" 302 303 "vmov.s16 d26, #7 \n" 304 305 "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12] 306 "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8] 307 "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8] 308 "vadd.s16 d4, d4, d26 \n" // a1 + 7 309 "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12] 310 311 "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7 312 "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7 313 314 "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000 315 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 316 317 "vceq.s16 d4, d7, #0 \n" 318 319 "vshr.s16 d0, d0, #4 \n" 320 "vshr.s16 d2, d2, #4 \n" 321 322 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 323 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 324 325 "vmvn.s16 d4, d4 \n" 326 // op[4] = (c1*2217 + d1*5352 + 
12000)>>16 327 "vshrn.s32 d1, q11, #16 \n" 328 // op[4] += (d1!=0) 329 "vsub.s16 d1, d1, d4 \n" 330 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 331 "vshrn.s32 d3, q12, #16 \n" 332 333 // set result to out array 334 "vst1.16 {q0, q1}, [%[out]] \n" 335 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), 336 [coeff32] "+r"(coeff32) // modified registers 337 : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16), 338 [out] "r"(out) // constants 339 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 340 "q10", "q11", "q12", "q13" // clobbered 341 ); 342} 343 344static void FTransformWHT(const int16_t* in, int16_t* out) { 345 const int kStep = 32; 346 __asm__ volatile ( 347 // d0 = in[0 * 16] , d1 = in[1 * 16] 348 // d2 = in[2 * 16] , d3 = in[3 * 16] 349 "vld1.16 d0[0], [%[in]], %[kStep] \n" 350 "vld1.16 d1[0], [%[in]], %[kStep] \n" 351 "vld1.16 d2[0], [%[in]], %[kStep] \n" 352 "vld1.16 d3[0], [%[in]], %[kStep] \n" 353 "vld1.16 d0[1], [%[in]], %[kStep] \n" 354 "vld1.16 d1[1], [%[in]], %[kStep] \n" 355 "vld1.16 d2[1], [%[in]], %[kStep] \n" 356 "vld1.16 d3[1], [%[in]], %[kStep] \n" 357 "vld1.16 d0[2], [%[in]], %[kStep] \n" 358 "vld1.16 d1[2], [%[in]], %[kStep] \n" 359 "vld1.16 d2[2], [%[in]], %[kStep] \n" 360 "vld1.16 d3[2], [%[in]], %[kStep] \n" 361 "vld1.16 d0[3], [%[in]], %[kStep] \n" 362 "vld1.16 d1[3], [%[in]], %[kStep] \n" 363 "vld1.16 d2[3], [%[in]], %[kStep] \n" 364 "vld1.16 d3[3], [%[in]], %[kStep] \n" 365 366 "vaddl.s16 q2, d0, d2 \n" 367 "vshl.s32 q2, q2, #2 \n" // a0=(in[0*16]+in[2*16])<<2 368 "vaddl.s16 q3, d1, d3 \n" 369 "vshl.s32 q3, q3, #2 \n" // a1=(in[1*16]+in[3*16])<<2 370 "vsubl.s16 q4, d1, d3 \n" 371 "vshl.s32 q4, q4, #2 \n" // a2=(in[1*16]-in[3*16])<<2 372 "vsubl.s16 q5, d0, d2 \n" 373 "vshl.s32 q5, q5, #2 \n" // a3=(in[0*16]-in[2*16])<<2 374 375 "vceq.s32 q10, q2, #0 \n" 376 "vmvn.s32 q10, q10 \n" // (a0 != 0) 377 "vqadd.s32 q6, q2, q3 \n" // (a0 + a1) 378 "vqsub.s32 q6, q6, q10 \n" // (a0 + a1) + (a0 != 0) 379 "vqadd.s32 q7, q5, q4 \n" // a3 
+ a2 380 "vqsub.s32 q8, q5, q4 \n" // a3 - a2 381 "vqsub.s32 q9, q2, q3 \n" // a0 - a1 382 383 // Transpose 384 // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7] 385 // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15] 386 "vswp d13, d16 \n" // vtrn.64 q0, q2 387 "vswp d15, d18 \n" // vtrn.64 q1, q3 388 "vtrn.32 q6, q7 \n" 389 "vtrn.32 q8, q9 \n" 390 391 "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8] 392 "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12] 393 "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12] 394 "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8] 395 396 "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1 397 "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2 398 "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2 399 "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1 400 401 "vmov.s32 q0, #3 \n" // q0 = 3 402 403 "vcgt.s32 q1, q4, #0 \n" // (b0>0) 404 "vqsub.s32 q2, q4, q1 \n" // (b0+(b0>0)) 405 "vqadd.s32 q3, q2, q0 \n" // (b0+(b0>0)+3) 406 "vshrn.s32 d18, q3, #3 \n" // (b0+(b0>0)+3) >> 3 407 408 "vcgt.s32 q1, q5, #0 \n" // (b1>0) 409 "vqsub.s32 q2, q5, q1 \n" // (b1+(b1>0)) 410 "vqadd.s32 q3, q2, q0 \n" // (b1+(b1>0)+3) 411 "vshrn.s32 d19, q3, #3 \n" // (b1+(b1>0)+3) >> 3 412 413 "vcgt.s32 q1, q6, #0 \n" // (b2>0) 414 "vqsub.s32 q2, q6, q1 \n" // (b2+(b2>0)) 415 "vqadd.s32 q3, q2, q0 \n" // (b2+(b2>0)+3) 416 "vshrn.s32 d20, q3, #3 \n" // (b2+(b2>0)+3) >> 3 417 418 "vcgt.s32 q1, q7, #0 \n" // (b3>0) 419 "vqsub.s32 q2, q7, q1 \n" // (b3+(b3>0)) 420 "vqadd.s32 q3, q2, q0 \n" // (b3+(b3>0)+3) 421 "vshrn.s32 d21, q3, #3 \n" // (b3+(b3>0)+3) >> 3 422 423 "vst1.16 {q9, q10}, [%[out]] \n" 424 425 : [in] "+r"(in) 426 : [kStep] "r"(kStep), [out] "r"(out) 427 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", 428 "q6", "q7", "q8", "q9", "q10" // clobbered 429 ) ; 430} 431 432//------------------------------------------------------------------------------ 433// Texture distortion 434// 435// We try to match the spectral content (weighted) between source and 436// reconstructed samples. 
437 438// Hadamard transform 439// Returns the weighted sum of the absolute value of transformed coefficients. 440// This uses a TTransform helper function in C 441static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 442 const uint16_t* const w) { 443 const int kBPS = BPS; 444 const uint8_t* A = a; 445 const uint8_t* B = b; 446 const uint16_t* W = w; 447 int sum; 448 __asm__ volatile ( 449 "vld1.32 d0[0], [%[a]], %[kBPS] \n" 450 "vld1.32 d0[1], [%[a]], %[kBPS] \n" 451 "vld1.32 d2[0], [%[a]], %[kBPS] \n" 452 "vld1.32 d2[1], [%[a]] \n" 453 454 "vld1.32 d1[0], [%[b]], %[kBPS] \n" 455 "vld1.32 d1[1], [%[b]], %[kBPS] \n" 456 "vld1.32 d3[0], [%[b]], %[kBPS] \n" 457 "vld1.32 d3[1], [%[b]] \n" 458 459 // a d0/d2, b d1/d3 460 // d0/d1: 01 01 01 01 461 // d2/d3: 23 23 23 23 462 // But: it goes 01 45 23 67 463 // Notice the middle values are transposed 464 "vtrn.16 q0, q1 \n" 465 466 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} 467 "vaddl.u8 q2, d0, d2 \n" 468 "vaddl.u8 q10, d1, d3 \n" 469 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} 470 "vsubl.u8 q3, d0, d2 \n" 471 "vsubl.u8 q11, d1, d3 \n" 472 473 // tmp[0] = a0 + a1 474 "vpaddl.s16 q0, q2 \n" 475 "vpaddl.s16 q8, q10 \n" 476 477 // tmp[1] = a3 + a2 478 "vpaddl.s16 q1, q3 \n" 479 "vpaddl.s16 q9, q11 \n" 480 481 // No pair subtract 482 // q2 = {a0, a3} 483 // q3 = {a1, a2} 484 "vtrn.16 q2, q3 \n" 485 "vtrn.16 q10, q11 \n" 486 487 // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} 488 "vsubl.s16 q12, d4, d6 \n" 489 "vsubl.s16 q13, d5, d7 \n" 490 "vsubl.s16 q14, d20, d22 \n" 491 "vsubl.s16 q15, d21, d23 \n" 492 493 // separate tmp[3] and tmp[2] 494 // q12 = tmp[3] 495 // q13 = tmp[2] 496 "vtrn.32 q12, q13 \n" 497 "vtrn.32 q14, q15 \n" 498 499 // Transpose tmp for a 500 "vswp d1, d26 \n" // vtrn.64 501 "vswp d3, d24 \n" // vtrn.64 502 "vtrn.32 q0, q1 \n" 503 "vtrn.32 q13, q12 \n" 504 505 // Transpose tmp for b 506 "vswp d17, d30 \n" // vtrn.64 507 "vswp d19, d28 \n" // vtrn.64 508 "vtrn.32 q8, q9 \n" 509 "vtrn.32 q15, q14 
\n" 510 511 // The first Q register is a, the second b. 512 // q0/8 tmp[0-3] 513 // q13/15 tmp[4-7] 514 // q1/9 tmp[8-11] 515 // q12/14 tmp[12-15] 516 517 // These are still in 01 45 23 67 order. We fix it easily in the addition 518 // case but the subtraction propegates them. 519 "vswp d3, d27 \n" 520 "vswp d19, d31 \n" 521 522 // a0 = tmp[0] + tmp[8] 523 "vadd.s32 q2, q0, q1 \n" 524 "vadd.s32 q3, q8, q9 \n" 525 526 // a1 = tmp[4] + tmp[12] 527 "vadd.s32 q10, q13, q12 \n" 528 "vadd.s32 q11, q15, q14 \n" 529 530 // a2 = tmp[4] - tmp[12] 531 "vsub.s32 q13, q13, q12 \n" 532 "vsub.s32 q15, q15, q14 \n" 533 534 // a3 = tmp[0] - tmp[8] 535 "vsub.s32 q0, q0, q1 \n" 536 "vsub.s32 q8, q8, q9 \n" 537 538 // b0 = a0 + a1 539 "vadd.s32 q1, q2, q10 \n" 540 "vadd.s32 q9, q3, q11 \n" 541 542 // b1 = a3 + a2 543 "vadd.s32 q12, q0, q13 \n" 544 "vadd.s32 q14, q8, q15 \n" 545 546 // b2 = a3 - a2 547 "vsub.s32 q0, q0, q13 \n" 548 "vsub.s32 q8, q8, q15 \n" 549 550 // b3 = a0 - a1 551 "vsub.s32 q2, q2, q10 \n" 552 "vsub.s32 q3, q3, q11 \n" 553 554 "vld1.64 {q10, q11}, [%[w]] \n" 555 556 // abs(b0) 557 "vabs.s32 q1, q1 \n" 558 "vabs.s32 q9, q9 \n" 559 // abs(b1) 560 "vabs.s32 q12, q12 \n" 561 "vabs.s32 q14, q14 \n" 562 // abs(b2) 563 "vabs.s32 q0, q0 \n" 564 "vabs.s32 q8, q8 \n" 565 // abs(b3) 566 "vabs.s32 q2, q2 \n" 567 "vabs.s32 q3, q3 \n" 568 569 // expand w before using. 570 "vmovl.u16 q13, d20 \n" 571 "vmovl.u16 q15, d21 \n" 572 573 // w[0] * abs(b0) 574 "vmul.u32 q1, q1, q13 \n" 575 "vmul.u32 q9, q9, q13 \n" 576 577 // w[4] * abs(b1) 578 "vmla.u32 q1, q12, q15 \n" 579 "vmla.u32 q9, q14, q15 \n" 580 581 // expand w before using. 
582 "vmovl.u16 q13, d22 \n" 583 "vmovl.u16 q15, d23 \n" 584 585 // w[8] * abs(b1) 586 "vmla.u32 q1, q0, q13 \n" 587 "vmla.u32 q9, q8, q13 \n" 588 589 // w[12] * abs(b1) 590 "vmla.u32 q1, q2, q15 \n" 591 "vmla.u32 q9, q3, q15 \n" 592 593 // Sum the arrays 594 "vpaddl.u32 q1, q1 \n" 595 "vpaddl.u32 q9, q9 \n" 596 "vadd.u64 d2, d3 \n" 597 "vadd.u64 d18, d19 \n" 598 599 // Hadamard transform needs 4 bits of extra precision (2 bits in each 600 // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum 601 // precision for coeff is 8bit of input + 4bits of Hadamard transform + 602 // 16bits for w[] + 2 bits of abs() summation. 603 // 604 // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is 605 // A-OK. 606 607 // sum2 - sum1 608 "vsub.u32 d0, d2, d18 \n" 609 // abs(sum2 - sum1) 610 "vabs.s32 d0, d0 \n" 611 // abs(sum2 - sum1) >> 5 612 "vshr.u32 d0, #5 \n" 613 614 // It would be better to move the value straight into r0 but I'm not 615 // entirely sure how this works with inline assembly. 
616 "vmov.32 %[sum], d0[0] \n" 617 618 : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) 619 : [kBPS] "r"(kBPS) 620 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 621 "q10", "q11", "q12", "q13", "q14", "q15" // clobbered 622 ) ; 623 624 return sum; 625} 626 627static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 628 const uint16_t* const w) { 629 int D = 0; 630 int x, y; 631 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 632 for (x = 0; x < 16; x += 4) { 633 D += Disto4x4(a + x + y, b + x + y, w); 634 } 635 } 636 return D; 637} 638 639#endif // WEBP_USE_NEON 640 641//------------------------------------------------------------------------------ 642// Entry point 643 644extern void VP8EncDspInitNEON(void); 645 646void VP8EncDspInitNEON(void) { 647#if defined(WEBP_USE_NEON) 648 VP8ITransform = ITransform; 649 VP8FTransform = FTransform; 650 651 VP8ITransformWHT = ITransformWHT; 652 VP8FTransformWHT = FTransformWHT; 653 654 VP8TDisto4x4 = Disto4x4; 655 VP8TDisto16x16 = Disto16x16; 656#endif // WEBP_USE_NEON 657} 658 659#if defined(__cplusplus) || defined(c_plusplus) 660} // extern "C" 661#endif 662