// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

#if defined(WEBP_USE_NEON)

#include "../dec/vp8i.h"

// Clobber-list fragment naming all sixteen NEON quad registers (q0..q15).
// Intended to be pasted into the clobber section of an __asm__ statement.
#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

// Emits two 'veor' instructions: a ^= s and b ^= s.
// NOTE: the last line of this macro ends with a backslash, so the macro is
// continued onto the following (empty) line. That line must stay empty.
#define FLIP_SIGN_BIT2(a, b, s) \
  "veor " #a "," #a "," #s " \n" \
  "veor " #b "," #b "," #s " \n" \

// Same as FLIP_SIGN_BIT2, applied to four registers: a, b, c and d.
// NOTE: as above, the trailing backslash continues onto the next line, which
// must stay empty.
#define FLIP_SIGN_BIT4(a, b, c, d, s) \
  FLIP_SIGN_BIT2(a, b, s) \
  FLIP_SIGN_BIT2(c, d, s) \

// Builds a per-byte mask that is set where
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// using unsigned saturating additions. 'thresh' is a core register operand
// that gets duplicated into every byte lane.
// Scratch registers overwritten: q14, q15.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
  "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
  "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
  "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
  "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
  "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n" /* thresh duplicated into every byte lane */ \
  "vcge.u8 " #mask ", q14, q15 \n" /* mask = (thresh >= sum) */

// Computes o = (p1 - q1) + 3 * (q0 - p0) on signed bytes. Every subtraction
// and addition saturates (vqsub/vqadd), so the result is clamped step by step.
// Scratch register overwritten: q15.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
  "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */

// Applies the filter value 'fl' to the two pixels next to the edge, on signed
// bytes with saturation:
//   p0 += (fl + 3) >> 3
//   q0 -= (fl + 4) >> 3
// Scratch register overwritten: q15.
#define DO_SIMPLE_FILTER(p0, q0, fl) \
  "vmov.i8 q15, #0x03 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
  "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
  \
  "vmov.i8 q15, #0x04 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter2 = filter + 4 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
  "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
// The four inputs are XORed with 0x80 for the signed arithmetic; only p0 and
// q0 are XORed back afterwards, so p1 and q1 are left in their flipped form.
// Registers overwritten besides the arguments: q9, q10, q11, q14, q15.
#define DO_FILTER2(p1, p0, q0, q1, thresh) \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
  "vmov.i8 q10, #0x80 \n" /* sign bit */ \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
  GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
  "vand q9, q9, q11 \n" /* apply filter mask */ \
  DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
  FLIP_SIGN_BIT2(p0, q0, q10)

// Load/Store vertical edge
// LOAD8x4 reads 8 rows of 4 consecutive bytes. Byte j of row i goes into lane
// i of register c<j+1>. Even rows are addressed through b1 and odd rows
// through b2; each base is post-incremented by 'stride' after every load.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
  "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"

// STORE8x2 writes 8 rows of 2 adjacent bytes: lane i of c1 and lane i of c2
// go to row i. The single base 'p' is post-incremented by 'stride' after
// every store.
#define STORE8x2(c1, c2, p, stride) \
  "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

// Filters the horizontal edge lying between rows p - stride and p, across 16
// columns. Reads the four rows p - 2 * stride .. p + stride (16 bytes each)
// and writes back the two rows p - stride and p.
//   p      : address of the first row below the edge
//   stride : distance in bytes between consecutive rows
//   thresh : filter threshold (duplicated into every byte lane)
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

    "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
    "vld1.u8 {q4}, [%[p]] \n" // q1

    DO_FILTER2(q1, q2, q3, q4, %[thresh])

    // p is on the q1 row here; step back two rows to reach the p0 row.
    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

    "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
    "vst1.u8 {q3}, [%[p]] \n" // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

// Filters the vertical edge lying between columns p - 1 and p, across 16
// rows. Reads the four columns p - 2 .. p + 1 of each row and writes back
// the two columns p - 1 and p.
//   p      : address of the first pixel to the right of the edge, on row 0
//   stride : distance in bytes between consecutive rows
//   thresh : filter threshold (duplicated into every byte lane)
// Uses r4, r5 and r6 as hard-coded scratch registers (declared as clobbers).
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n" // base1 = p - 2
    "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
    "add r5, r4, %[stride] \n" // base2 = base1 + stride

    // First call loads rows 0..7, second call rows 8..15 (r4/r5 keep
    // advancing by 2 * stride through both calls).
    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
    "vswp d3, d6 \n" // p1:q1 p0:q3
    "vswp d5, d8 \n" // q0:q2 q1:q4
    "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q3, q4, %[thresh])

    "sub %[p], %[p], #1 \n" // p - 1

    // Regroup so that (d4, d5) hold p0/q0 of rows 0..7 and (d6, d7) hold
    // p0/q0 of rows 8..15, matching the two interleaved stores below.
    "vswp d5, d6 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d6, d7, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

// Runs SimpleVFilter16NEON on three horizontal edges, located 4, 8 and 12
// rows below p.
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16NEON(p, stride, thresh);
  }
}
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int k; 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (k = 3; k > 0; --k) { 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p += 4; 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SimpleHFilter16NEON(p, stride, thresh); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//----------------------------------------------------------------------------- 1612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Inverse transforms (Paragraph 14.4) 1622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void TransformOneNEON(const int16_t *in, uint8_t *dst) { 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int kBPS = BPS; 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int16_t constants[] = {20091, 17734, 0, 0}; 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* kC1, kC2. Padded because vld1.16 loads 8 bytes 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Technically these are unsigned but vqdmulh is only available in signed. 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * vqdmulh returns high half (effectively >> 16) but also doubles the value, 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * changing the >> 16 to >> 15 and requiring an additional >> 1. 
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * We use this to our advantage with kC2. The canonical value is 35468. 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * However, the high bit is set so treating it as signed will give incorrect 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * results. We avoid this by down shifting by 1 here to clear the highest bit. 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Combined with the doubling effect of vqdmulh we get >> 16. 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * This can not be applied to kC1 because the lowest bit is set. Down shifting 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * the constant would reduce precision. 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* libwebp uses a trick to avoid some extra addition that libvpx does. 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Instead of: 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). 
However, this causes the 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * same issue with kC1 and vqdmulh that we work around by down shifting kC2 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) __asm__ volatile ( 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 {q1, q2}, [%[in]] \n" 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.16 {d0}, [%[constants]] \n" 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d2: in[0] 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d3: in[8] 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d4: in[4] 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d5: in[12] 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vswp d3, d4 \n" 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * q9 = {in[4], in[12]} * kC2 >> 16 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q8, q2, d0[0] \n" 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d22 = a = in[0] + in[8] 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d23 = 
b = in[0] - in[8] 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d22, d2, d3 \n" 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* The multiplication should be x * kC1 >> 16 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * However, with vqdmulh we get x * kC1 * 2 >> 16 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * (multiply, double, return high half) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * We avoided this in kC2 by pre-shifting the constant. 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * q8 = in[4]/[12] * kC1 >> 16 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Add {in[4], in[12]} back after the multiplication. This is handled by 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * adding 1 << 16 to kC1 in the libwebp C code. 
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d20 = c = in[4]*kC2 - in[12]*kC1 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d21 = d = in[4]*kC1 + in[12]*kC2 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d20, d18, d17 \n" 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d21, d19, d16 \n" 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d2 = tmp[0] = a + d 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d3 = tmp[1] = b + c 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d4 = tmp[2] = b - c 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d5 = tmp[3] = a - d 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d3, d23, d20 \n" 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vswp d3, d4 \n" 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * q9 = {tmp[4], tmp[12]} * kC2 >> 16 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q8, q2, d0[0] \n" 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqdmulh.s16 q9, q2, d0[1] \n" 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d22 = a = tmp[0] + tmp[8] 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d23 = b = tmp[0] - tmp[8] 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d22, d2, d3 \n" 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d23, d2, d3 \n" 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* See long winded explanations prior */ 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vshr.s16 q8, q8, #1 \n" 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q8, q2, q8 \n" 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d20 = c = in[4]*kC2 - in[12]*kC1 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d21 = d = in[4]*kC1 + in[12]*kC2 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d20, d18, d17 \n" 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d21, d19, d16 \n" 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* d2 = tmp[0] = a + d 
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d3 = tmp[1] = b + c 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d4 = tmp[2] = b - c 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * d5 = tmp[3] = a - d 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */ 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d2, d22, d21 \n" 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 d3, d23, d20 \n" 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d4, d23, d20 \n" 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqsub.s16 d5, d22, d21 \n" 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d6[0], [%[dst]], %[kBPS] \n" 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d6[1], [%[dst]], %[kBPS] \n" 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d7[0], [%[dst]], %[kBPS] \n" 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vld1.32 d7[1], [%[dst]], %[kBPS] \n" 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* (val) + 4 >> 3 */ 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d2, d2, #3 \n" 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d3, d3, #3 \n" 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d4, d4, #3 \n" 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vrshr.s16 d5, d5, #3 \n" 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
"vzip.16 q1, q2 \n" 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vzip.16 q1, q2 \n" 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) /* Must accumulate before saturating */ 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmovl.u8 q8, d6 \n" 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vmovl.u8 q9, d7 \n" 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q1, q1, q8 \n" 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqadd.s16 q2, q2, q9 \n" 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqmovun.s16 d0, q1 \n" 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vqmovun.s16 d1, q2 \n" 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "vst1.32 d1[1], [%[dst]] \n" 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ); 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TransformOneNEON(in, dst); 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (do_two) { 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TransformOneNEON(in + 16, dst + 4); 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)static void TransformWHT(const int16_t* in, int16_t* out) { 320c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const int kStep = 32; // The store is only incrementing the pointer as if we 321c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // had stored a single byte. 
3222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) __asm__ volatile ( 3232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // part 1 3242a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // load data into q0, q1 3252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vld1.16 {q0, q1}, [%[in]] \n" 3262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3272a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] 3282a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] 3292a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] 3302a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] 3312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3322a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 3332a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 3342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 3352a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 3362a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3372a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Transpose 3382a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] 3392a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] 3402a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vswp d1, d4 \n" // vtrn.64 q0, q2 3412a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vswp d3, d6 \n" // vtrn.64 q1, q3 3422a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard 
Coles) "vtrn.32 q0, q1 \n" 3432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vtrn.32 q2, q3 \n" 3442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3452a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vmov.s32 q4, #3 \n" // dc = 3 3462a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 3472a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] 3482a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] 3492a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] 3502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] 3512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q0, q6, q7 \n" 3532a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 3542a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vadd.s32 q1, q9, q8 \n" 3552a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 3562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsub.s32 q2, q6, q7 \n" 3572a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 3582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vsub.s32 q3, q9, q8 \n" 3592a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 3602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // set the results to output 3622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d0[0], [%[out]], %[kStep] \n" 
3632a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d1[0], [%[out]], %[kStep] \n" 3642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d2[0], [%[out]], %[kStep] \n" 3652a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d3[0], [%[out]], %[kStep] \n" 3662a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d0[1], [%[out]], %[kStep] \n" 3672a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d1[1], [%[out]], %[kStep] \n" 3682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d2[1], [%[out]], %[kStep] \n" 3692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d3[1], [%[out]], %[kStep] \n" 3702a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d0[2], [%[out]], %[kStep] \n" 3712a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d1[2], [%[out]], %[kStep] \n" 3722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d2[2], [%[out]], %[kStep] \n" 3732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d3[2], [%[out]], %[kStep] \n" 3742a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d0[3], [%[out]], %[kStep] \n" 3752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d1[3], [%[out]], %[kStep] \n" 3762a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d2[3], [%[out]], %[kStep] \n" 3772a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) "vst1.16 d3[3], [%[out]], %[kStep] \n" 3782a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3792a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) : [out] "+r"(out) // modified registers 3802a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) : [in] "r"(in), [kStep] "r"(kStep) // constants 3812a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) : "memory", "q0", "q1", "q2", "q3", "q4", 3822a99a7e74a7f215066514fe81d2bfa6639d9edddTorne 
(Richard Coles) "q5", "q6", "q7", "q8", "q9" // clobbered 3832a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) ); 3842a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 3852a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // WEBP_USE_NEON 3872a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3882a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//------------------------------------------------------------------------------ 3892a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Entry point 3902a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)extern void VP8DspInitNEON(void); 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void VP8DspInitNEON(void) { 3942a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if defined(WEBP_USE_NEON) 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VP8Transform = TransformTwoNEON; 3962a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) VP8TransformWHT = TransformWHT; 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VP8SimpleVFilter16 = SimpleVFilter16NEON; 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VP8SimpleHFilter16 = SimpleHFilter16NEON; 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VP8SimpleVFilter16i = SimpleVFilter16iNEON; 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VP8SimpleHFilter16i = SimpleHFilter16iNEON; 4022a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // WEBP_USE_NEON 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(__cplusplus) || defined(c_plusplus) 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // extern "C" 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 408