15a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Copyright 2012 Google Inc. All Rights Reserved. 25a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// 35a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// This code is licensed under the same terms as WebM: 45a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Software License Agreement: http://www.webmproject.org/license/software/ 55a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 65a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// ----------------------------------------------------------------------------- 75a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// 85a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// ARM NEON version of dsp functions and loop filtering. 95a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// 105a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Authors: Somnath Banerjee (somnath@google.com) 115a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Johann Koenig (johannkoenig@google.com) 125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#include "./dsp.h" 145a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#if defined(WEBP_USE_NEON) 165a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#include "../dec/vp8i.h" 185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#if defined(__cplusplus) || defined(c_plusplus) 205a50414796e9a458925c7a13a15055d02406bf43Vikas Aroraextern "C" { 215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#endif 225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ 245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define FLIP_SIGN_BIT2(a, b, s) \ 275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "veor " #a "," #a "," #s " \n" \ 285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "veor " #b "," #b "," #s " \n" \ 295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 305a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define FLIP_SIGN_BIT4(a, b, c, d, s) \ 315a50414796e9a458925c7a13a15055d02406bf43Vikas Arora FLIP_SIGN_BIT2(a, b, s) \ 325a50414796e9a458925c7a13a15055d02406bf43Vikas Arora FLIP_SIGN_BIT2(c, d, s) \ 335a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 345a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \ 355a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \ 365a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \ 375a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \ 385a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \ 395a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ 405a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vdup.8 q14, " #thresh " \n" \ 415a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */ 425a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 435a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define GET_BASE_DELTA(p1, p0, q0, q1, o) \ 445a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \ 455a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \ 465a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \ 475a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \ 485a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */ 495a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 505a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define DO_SIMPLE_FILTER(p0, q0, fl) \ 515a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vmov.i8 q15, #0x03 \n" \ 525a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \ 535a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \ 545a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \ 555a50414796e9a458925c7a13a15055d02406bf43Vikas Arora \ 565a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vmov.i8 q15, #0x04 \n" \ 575a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \ 585a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \ 595a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */ 605a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 615a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Applies filter on 2 pixels (p0 and q0) 625a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define DO_FILTER2(p1, p0, q0, q1, thresh) \ 635a50414796e9a458925c7a13a15055d02406bf43Vikas Arora NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \ 645a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vmov.i8 q10, #0x80 \n" /* sign bit */ \ 655a50414796e9a458925c7a13a15055d02406bf43Vikas Arora FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \ 665a50414796e9a458925c7a13a15055d02406bf43Vikas Arora GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \ 675a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vand q9, q9, q11 \n" /* apply filter mask */ \ 685a50414796e9a458925c7a13a15055d02406bf43Vikas Arora DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ 695a50414796e9a458925c7a13a15055d02406bf43Vikas Arora FLIP_SIGN_BIT2(p0, q0, q10) 705a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 715a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Load/Store vertical edge 725a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ 735a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \ 745a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \ 755a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \ 765a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \ 775a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \ 785a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \ 795a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \ 805a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n" 815a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 825a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#define STORE8x2(c1, c2, p,stride) \ 835a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \ 845a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \ 855a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \ 865a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \ 875a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \ 885a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \ 895a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \ 905a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n" 915a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 925a50414796e9a458925c7a13a15055d02406bf43Vikas Arora//----------------------------------------------------------------------------- 935a50414796e9a458925c7a13a15055d02406bf43Vikas Arora// Simple In-loop filtering (Paragraph 15.2) 945a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 955a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { 965a50414796e9a458925c7a13a15055d02406bf43Vikas Arora __asm__ volatile ( 975a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 985a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 995a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 1005a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 1015a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0 1025a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.u8 {q4}, [%[p]] \n" // q1 1035a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1045a50414796e9a458925c7a13a15055d02406bf43Vikas Arora DO_FILTER2(q1, q2, q3, q4, %[thresh]) 1055a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1065a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 1075a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1085a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0 1095a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst1.u8 {q3}, [%[p]] \n" // store oq0 1105a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : [p] "+r"(p) 1115a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : [stride] "r"(stride), [thresh] "r"(thresh) 1125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : "memory", QRegs 1135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora ); 1145a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 1155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1165a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) { 1175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora __asm__ volatile ( 1185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "sub r4, %[p], #2 \n" // base1 = p - 2 1195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "lsl r6, %[stride], #1 \n" // r6 = 2 * stride 1205a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "add r5, r4, %[stride] \n" // base2 = base1 + stride 1215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) 1235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6) 1245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vswp d3, d6 \n" // p1:q1 p0:q3 1255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vswp d5, d8 \n" // q0:q2 q1:q4 1265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4 1275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora DO_FILTER2(q1, q2, q3, q4, %[thresh]) 1295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1305a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "sub %[p], %[p], #1 \n" // p - 1 1315a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1325a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vswp d5, d6 \n" 1335a50414796e9a458925c7a13a15055d02406bf43Vikas Arora STORE8x2(d4, d5, [%[p]], %[stride]) 1345a50414796e9a458925c7a13a15055d02406bf43Vikas Arora STORE8x2(d6, d7, [%[p]], %[stride]) 1355a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1365a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : [p] "+r"(p) 1375a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : [stride] "r"(stride), [thresh] "r"(thresh) 1385a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : "memory", "r4", "r5", "r6", QRegs 1395a50414796e9a458925c7a13a15055d02406bf43Vikas Arora ); 1405a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 1415a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1425a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) { 1435a50414796e9a458925c7a13a15055d02406bf43Vikas Arora int k; 1445a50414796e9a458925c7a13a15055d02406bf43Vikas Arora for (k = 3; k > 0; --k) { 1455a50414796e9a458925c7a13a15055d02406bf43Vikas Arora p += 4 * stride; 1465a50414796e9a458925c7a13a15055d02406bf43Vikas Arora SimpleVFilter16NEON(p, stride, thresh); 1475a50414796e9a458925c7a13a15055d02406bf43Vikas Arora } 1485a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 1495a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1505a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { 1515a50414796e9a458925c7a13a15055d02406bf43Vikas Arora int k; 1525a50414796e9a458925c7a13a15055d02406bf43Vikas Arora for (k = 3; k > 0; --k) { 1535a50414796e9a458925c7a13a15055d02406bf43Vikas Arora p += 4; 1545a50414796e9a458925c7a13a15055d02406bf43Vikas Arora SimpleHFilter16NEON(p, stride, thresh); 1555a50414796e9a458925c7a13a15055d02406bf43Vikas Arora } 1565a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 1575a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1585a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void TransformOneNEON(const int16_t *in, uint8_t *dst) { 1595a50414796e9a458925c7a13a15055d02406bf43Vikas Arora const int kBPS = BPS; 1605a50414796e9a458925c7a13a15055d02406bf43Vikas Arora const int16_t constants[] = {20091, 17734, 0, 0}; 1615a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* kC1, kC2. Padded because vld1.16 loads 8 bytes 1625a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * Technically these are unsigned but vqdmulh is only available in signed. 1635a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * vqdmulh returns high half (effectively >> 16) but also doubles the value, 1645a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * changing the >> 16 to >> 15 and requiring an additional >> 1. 1655a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * We use this to our advantage with kC2. The canonical value is 35468. 1665a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * However, the high bit is set so treating it as signed will give incorrect 1675a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * results. We avoid this by down shifting by 1 here to clear the highest bit. 1685a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * Combined with the doubling effect of vqdmulh we get >> 16. 1695a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * This can not be applied to kC1 because the lowest bit is set. Down shifting 1705a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * the constant would reduce precision. 1715a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 1725a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1735a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* libwebp uses a trick to avoid some extra addition that libvpx does. 1745a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * Instead of: 1755a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 1765a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the 1775a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * same issue with kC1 and vqdmulh that we work around by down shifting kC2 1785a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 1795a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1805a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ 1815a50414796e9a458925c7a13a15055d02406bf43Vikas Arora __asm__ volatile ( 1825a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.16 {q1, q2}, [%[in]] \n" 1835a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.16 {d0}, [%[constants]] \n" 1845a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1855a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d2: in[0] 1865a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d3: in[8] 1875a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d4: in[4] 1885a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d5: in[12] 1895a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 1905a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vswp d3, d4 \n" 1915a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1925a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 1935a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * q9 = {in[4], in[12]} * kC2 >> 16 1945a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 1955a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqdmulh.s16 q8, q2, d0[0] \n" 1965a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqdmulh.s16 q9, q2, d0[1] \n" 1975a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 1985a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d22 = a = in[0] + in[8] 1995a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d23 = b = in[0] - in[8] 2005a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2015a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d22, d2, d3 \n" 2025a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d23, d2, d3 \n" 2035a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2045a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* The multiplication should be x * kC1 >> 16 2055a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * However, with vqdmulh we get x * kC1 * 2 >> 16 2065a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * (multiply, double, return high half) 2075a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * We avoided this in kC2 by pre-shifting the constant. 2085a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * q8 = in[4]/[12] * kC1 >> 16 2095a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2105a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vshr.s16 q8, q8, #1 \n" 2115a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* Add {in[4], in[12]} back after the multiplication. This is handled by 2135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * adding 1 << 16 to kC1 in the libwebp C code. 2145a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 q8, q2, q8 \n" 2165a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d20 = c = in[4]*kC2 - in[12]*kC1 2185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d21 = d = in[4]*kC1 + in[12]*kC2 2195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2205a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d20, d18, d17 \n" 2215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d21, d19, d16 \n" 2225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d2 = tmp[0] = a + d 2245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d3 = tmp[1] = b + c 2255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d4 = tmp[2] = b - c 2265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d5 = tmp[3] = a - d 2275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d2, d22, d21 \n" 2295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d3, d23, d20 \n" 2305a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d4, d23, d20 \n" 2315a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d5, d22, d21 \n" 2325a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2335a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vzip.16 q1, q2 \n" 2345a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vzip.16 q1, q2 \n" 2355a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2365a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vswp d3, d4 \n" 2375a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2385a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 2395a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * q9 = {tmp[4], tmp[12]} * kC2 >> 16 2405a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2415a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqdmulh.s16 q8, q2, d0[0] \n" 2425a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqdmulh.s16 q9, q2, d0[1] \n" 2435a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2445a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d22 = a = tmp[0] + tmp[8] 2455a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d23 = b = tmp[0] - tmp[8] 2465a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2475a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d22, d2, d3 \n" 2485a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d23, d2, d3 \n" 2495a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2505a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* See long winded explanations prior */ 2515a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vshr.s16 q8, q8, #1 \n" 2525a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 q8, q2, q8 \n" 2535a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2545a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d20 = c = in[4]*kC2 - in[12]*kC1 2555a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d21 = d = in[4]*kC1 + in[12]*kC2 2565a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2575a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d20, d18, d17 \n" 2585a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d21, d19, d16 \n" 2595a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2605a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* d2 = tmp[0] = a + d 2615a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d3 = tmp[1] = b + c 2625a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d4 = tmp[2] = b - c 2635a50414796e9a458925c7a13a15055d02406bf43Vikas Arora * d5 = tmp[3] = a - d 2645a50414796e9a458925c7a13a15055d02406bf43Vikas Arora */ 2655a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d2, d22, d21 \n" 2665a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 d3, d23, d20 \n" 2675a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d4, d23, d20 \n" 2685a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqsub.s16 d5, d22, d21 \n" 2695a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2705a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.32 d6[0], [%[dst]], %[kBPS] \n" 2715a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.32 d6[1], [%[dst]], %[kBPS] \n" 2725a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.32 d7[0], [%[dst]], %[kBPS] \n" 2735a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vld1.32 d7[1], [%[dst]], %[kBPS] \n" 2745a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2755a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 2765a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2775a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* (val) + 4 >> 3 */ 2785a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vrshr.s16 d2, d2, #3 \n" 2795a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vrshr.s16 d3, d3, #3 \n" 2805a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vrshr.s16 d4, d4, #3 \n" 2815a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vrshr.s16 d5, d5, #3 \n" 2825a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2835a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vzip.16 q1, q2 \n" 2845a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vzip.16 q1, q2 \n" 2855a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2865a50414796e9a458925c7a13a15055d02406bf43Vikas Arora /* Must accumulate before saturating */ 2875a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vmovl.u8 q8, d6 \n" 2885a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vmovl.u8 q9, d7 \n" 2895a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2905a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 q1, q1, q8 \n" 2915a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqadd.s16 q2, q2, q9 \n" 2925a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2935a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqmovun.s16 d0, q1 \n" 2945a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vqmovun.s16 d1, q2 \n" 2955a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 2965a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 2975a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 2985a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 2995a50414796e9a458925c7a13a15055d02406bf43Vikas Arora "vst1.32 d1[1], [%[dst]] \n" 3005a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3015a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 3025a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 3035a50414796e9a458925c7a13a15055d02406bf43Vikas Arora : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 3045a50414796e9a458925c7a13a15055d02406bf43Vikas Arora ); 3055a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 3065a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3075a50414796e9a458925c7a13a15055d02406bf43Vikas Arorastatic void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { 3085a50414796e9a458925c7a13a15055d02406bf43Vikas Arora TransformOneNEON(in, dst); 3095a50414796e9a458925c7a13a15055d02406bf43Vikas Arora if (do_two) { 3105a50414796e9a458925c7a13a15055d02406bf43Vikas Arora TransformOneNEON(in + 16, dst + 4); 3115a50414796e9a458925c7a13a15055d02406bf43Vikas Arora } 3125a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 3135a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3145a50414796e9a458925c7a13a15055d02406bf43Vikas Aroraextern void VP8DspInitNEON(void); 3155a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3165a50414796e9a458925c7a13a15055d02406bf43Vikas Aroravoid VP8DspInitNEON(void) { 3175a50414796e9a458925c7a13a15055d02406bf43Vikas Arora VP8Transform = TransformTwoNEON; 3185a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3195a50414796e9a458925c7a13a15055d02406bf43Vikas Arora VP8SimpleVFilter16 = SimpleVFilter16NEON; 3205a50414796e9a458925c7a13a15055d02406bf43Vikas Arora VP8SimpleHFilter16 = SimpleHFilter16NEON; 3215a50414796e9a458925c7a13a15055d02406bf43Vikas Arora VP8SimpleVFilter16i = SimpleVFilter16iNEON; 3225a50414796e9a458925c7a13a15055d02406bf43Vikas Arora VP8SimpleHFilter16i = SimpleHFilter16iNEON; 3235a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} 3245a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3255a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#if defined(__cplusplus) || defined(c_plusplus) 3265a50414796e9a458925c7a13a15055d02406bf43Vikas Arora} // extern "C" 3275a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#endif 3285a50414796e9a458925c7a13a15055d02406bf43Vikas Arora 3295a50414796e9a458925c7a13a15055d02406bf43Vikas Arora#endif // WEBP_USE_NEON 330