// dec_neon.c, revision 8b720228d581a84fd173b6dcb2fa295b59db489a
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include "../dec/vp8i.h"

// Every NEON q-register the filter asm below may touch.  q4-q7 (d8-d15) are
// deliberately avoided: they are callee-saved under the ARM AAPCS.
#define QRegs "q0", "q1", "q2", "q3", \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

// XOR the sign bit of two (resp. four) q-registers; 's' must hold 0x80 in
// every lane.  Converts pixels between unsigned and the signed domain in
// which the filter arithmetic is done (and back).
#define FLIP_SIGN_BIT2(a, b, s) \
  "veor " #a "," #a "," #s " \n" \
  "veor " #b "," #b "," #s " \n" \

#define FLIP_SIGN_BIT4(a, b, c, d, s) \
  FLIP_SIGN_BIT2(a, b, s) \
  FLIP_SIGN_BIT2(c, d, s) \

// Per-pixel simple-filter condition:
// mask = 0xff in lanes where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= thresh.
// Clobbers q14 and q15.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
  "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
  "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
  "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
  "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
  "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n" \
  "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */

// o = (p1 - q1) + 3 * (q0 - p0) in saturating signed-8 arithmetic: the
// unclamped filter value of the simple filter.  Clobbers q15.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
  "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \
  "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */

// Applies the (already masked) filter value 'fl' to the two edge pixels:
//   p0 += (fl + 3) >> 3 and q0 -= (fl + 4) >> 3, saturating.  Clobbers q15.
#define DO_SIMPLE_FILTER(p0, q0, fl) \
  "vmov.i8 q15, #0x03 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
  "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
  \
  "vmov.i8 q15, #0x04 \n" \
  "vqadd.s8 q15, q15, " #fl " \n" /* filter2 = filter + 4 */ \
  "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
  "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
#define DO_FILTER2(p1, p0, q0, q1, thresh) \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
  "vmov.i8 q10, #0x80 \n" /* sign bit */ \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
  GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
  "vand q9, q9, q11 \n" /* apply filter mask */ \
  DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
  FLIP_SIGN_BIT2(p0, q0, q10)

// Load/Store vertical edge
// LOAD8x4 gathers an 8-row x 4-column tile, de-interleaving columns into
// c1..c4; b1/b2 are two row pointers alternately post-incremented by
// 'stride' (= 2 * row stride), so eight rows are consumed in total.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
  "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"

// Scatters an 8-row x 2-column tile (lanes of c1/c2) back to memory, one
// row per store, advancing p by 'stride' each time.
#define STORE8x2(c1, c2, p, stride) \
  "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

// Filters a horizontal edge across a full 16-pixel-wide row.  On entry 'p'
// points at the q0 row; rows p - 2*stride .. p + stride are read and the
// two rows adjacent to the edge (p0, q0) are rewritten.
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

    "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
    "vld1.u8 {q12}, [%[p]] \n" // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

    "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
    "vst1.u8 {q3}, [%[p]] \n" // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}

// Filters a vertical edge across 16 rows.  On entry 'p' points at column
// q0; columns p-2 .. p+1 are gathered (transposed) into registers, filtered,
// and the two columns adjacent to the edge are scattered back.
// Uses r4/r5 as alternating row bases and r6 = 2 * stride; all three are
// declared clobbered below.
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n" // base1 = p - 2
    "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
    "add r5, r4, %[stride] \n" // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp d3, d24 \n" // p1:q1 p0:q3
    "vswp d5, d26 \n" // q0:q2 q1:q4
    "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub %[p], %[p], #1 \n" // p - 1

    "vswp d5, d24 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}

// Filters the three inner horizontal edges of a macroblock (rows 4, 8, 12).
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16NEON(p, stride, thresh);
  }
}

// Filters the three inner vertical edges of a macroblock (columns 4, 8, 12).
static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16NEON(p, stride, thresh);
  }
}

156//----------------------------------------------------------------------------- 157// Inverse transforms (Paragraph 14.4) 158 159static void TransformOne(const int16_t* in, uint8_t* dst) { 160 const int kBPS = BPS; 161 const int16_t constants[] = {20091, 17734, 0, 0}; 162 /* kC1, kC2. Padded because vld1.16 loads 8 bytes 163 * Technically these are unsigned but vqdmulh is only available in signed. 164 * vqdmulh returns high half (effectively >> 16) but also doubles the value, 165 * changing the >> 16 to >> 15 and requiring an additional >> 1. 166 * We use this to our advantage with kC2. The canonical value is 35468. 167 * However, the high bit is set so treating it as signed will give incorrect 168 * results. We avoid this by down shifting by 1 here to clear the highest bit. 169 * Combined with the doubling effect of vqdmulh we get >> 16. 170 * This can not be applied to kC1 because the lowest bit is set. Down shifting 171 * the constant would reduce precision. 172 */ 173 174 /* libwebp uses a trick to avoid some extra addition that libvpx does. 175 * Instead of: 176 * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); 177 * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). 
However, this causes the 178 * same issue with kC1 and vqdmulh that we work around by down shifting kC2 179 */ 180 181 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ 182 __asm__ volatile ( 183 "vld1.16 {q1, q2}, [%[in]] \n" 184 "vld1.16 {d0}, [%[constants]] \n" 185 186 /* d2: in[0] 187 * d3: in[8] 188 * d4: in[4] 189 * d5: in[12] 190 */ 191 "vswp d3, d4 \n" 192 193 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 194 * q9 = {in[4], in[12]} * kC2 >> 16 195 */ 196 "vqdmulh.s16 q8, q2, d0[0] \n" 197 "vqdmulh.s16 q9, q2, d0[1] \n" 198 199 /* d22 = a = in[0] + in[8] 200 * d23 = b = in[0] - in[8] 201 */ 202 "vqadd.s16 d22, d2, d3 \n" 203 "vqsub.s16 d23, d2, d3 \n" 204 205 /* The multiplication should be x * kC1 >> 16 206 * However, with vqdmulh we get x * kC1 * 2 >> 16 207 * (multiply, double, return high half) 208 * We avoided this in kC2 by pre-shifting the constant. 209 * q8 = in[4]/[12] * kC1 >> 16 210 */ 211 "vshr.s16 q8, q8, #1 \n" 212 213 /* Add {in[4], in[12]} back after the multiplication. This is handled by 214 * adding 1 << 16 to kC1 in the libwebp C code. 
215 */ 216 "vqadd.s16 q8, q2, q8 \n" 217 218 /* d20 = c = in[4]*kC2 - in[12]*kC1 219 * d21 = d = in[4]*kC1 + in[12]*kC2 220 */ 221 "vqsub.s16 d20, d18, d17 \n" 222 "vqadd.s16 d21, d19, d16 \n" 223 224 /* d2 = tmp[0] = a + d 225 * d3 = tmp[1] = b + c 226 * d4 = tmp[2] = b - c 227 * d5 = tmp[3] = a - d 228 */ 229 "vqadd.s16 d2, d22, d21 \n" 230 "vqadd.s16 d3, d23, d20 \n" 231 "vqsub.s16 d4, d23, d20 \n" 232 "vqsub.s16 d5, d22, d21 \n" 233 234 "vzip.16 q1, q2 \n" 235 "vzip.16 q1, q2 \n" 236 237 "vswp d3, d4 \n" 238 239 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 240 * q9 = {tmp[4], tmp[12]} * kC2 >> 16 241 */ 242 "vqdmulh.s16 q8, q2, d0[0] \n" 243 "vqdmulh.s16 q9, q2, d0[1] \n" 244 245 /* d22 = a = tmp[0] + tmp[8] 246 * d23 = b = tmp[0] - tmp[8] 247 */ 248 "vqadd.s16 d22, d2, d3 \n" 249 "vqsub.s16 d23, d2, d3 \n" 250 251 /* See long winded explanations prior */ 252 "vshr.s16 q8, q8, #1 \n" 253 "vqadd.s16 q8, q2, q8 \n" 254 255 /* d20 = c = in[4]*kC2 - in[12]*kC1 256 * d21 = d = in[4]*kC1 + in[12]*kC2 257 */ 258 "vqsub.s16 d20, d18, d17 \n" 259 "vqadd.s16 d21, d19, d16 \n" 260 261 /* d2 = tmp[0] = a + d 262 * d3 = tmp[1] = b + c 263 * d4 = tmp[2] = b - c 264 * d5 = tmp[3] = a - d 265 */ 266 "vqadd.s16 d2, d22, d21 \n" 267 "vqadd.s16 d3, d23, d20 \n" 268 "vqsub.s16 d4, d23, d20 \n" 269 "vqsub.s16 d5, d22, d21 \n" 270 271 "vld1.32 d6[0], [%[dst]], %[kBPS] \n" 272 "vld1.32 d6[1], [%[dst]], %[kBPS] \n" 273 "vld1.32 d7[0], [%[dst]], %[kBPS] \n" 274 "vld1.32 d7[1], [%[dst]], %[kBPS] \n" 275 276 "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 277 278 /* (val) + 4 >> 3 */ 279 "vrshr.s16 d2, d2, #3 \n" 280 "vrshr.s16 d3, d3, #3 \n" 281 "vrshr.s16 d4, d4, #3 \n" 282 "vrshr.s16 d5, d5, #3 \n" 283 284 "vzip.16 q1, q2 \n" 285 "vzip.16 q1, q2 \n" 286 287 /* Must accumulate before saturating */ 288 "vmovl.u8 q8, d6 \n" 289 "vmovl.u8 q9, d7 \n" 290 291 "vqadd.s16 q1, q1, q8 \n" 292 "vqadd.s16 q2, q2, q9 \n" 293 294 "vqmovun.s16 d0, q1 \n" 295 "vqmovun.s16 d1, q2 \n" 296 297 "vst1.32 d0[0], 
[%[dst]], %[kBPS] \n" 298 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 299 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 300 "vst1.32 d1[1], [%[dst]] \n" 301 302 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 303 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 304 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 305 ); 306} 307 308static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { 309 TransformOne(in, dst); 310 if (do_two) { 311 TransformOne(in + 16, dst + 4); 312 } 313} 314 315static void TransformDC(const int16_t* in, uint8_t* dst) { 316 const int DC = (in[0] + 4) >> 3; 317 const int kBPS = BPS; 318 __asm__ volatile ( 319 "vdup.16 q1, %[DC] \n" 320 321 "vld1.32 d0[0], [%[dst]], %[kBPS] \n" 322 "vld1.32 d1[0], [%[dst]], %[kBPS] \n" 323 "vld1.32 d0[1], [%[dst]], %[kBPS] \n" 324 "vld1.32 d1[1], [%[dst]], %[kBPS] \n" 325 326 "sub %[dst], %[dst], %[kBPS], lsl #2 \n" 327 328 // add DC and convert to s16. 329 "vaddw.u8 q2, q1, d0 \n" 330 "vaddw.u8 q3, q1, d1 \n" 331 // convert back to u8 with saturation 332 "vqmovun.s16 d0, q2 \n" 333 "vqmovun.s16 d1, q3 \n" 334 335 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 336 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 337 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 338 "vst1.32 d1[1], [%[dst]] \n" 339 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 340 : [kBPS] "r"(kBPS), /* constants */ 341 [DC] "r"(DC) 342 : "memory", "q0", "q1", "q2", "q3" /* clobbered */ 343 ); 344} 345 346static void TransformWHT(const int16_t* in, int16_t* out) { 347 const int kStep = 32; // The store is only incrementing the pointer as if we 348 // had stored a single byte. 
349 __asm__ volatile ( 350 // part 1 351 // load data into q0, q1 352 "vld1.16 {q0, q1}, [%[in]] \n" 353 354 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] 355 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] 356 "vsubl.s16 q10, d1, d2 \n" // a2 = in[4] - in[8] 357 "vsubl.s16 q11, d0, d3 \n" // a3 = in[0] - in[12] 358 359 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 360 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 361 "vadd.s32 q1, q11, q10 \n" // tmp[4] = a3 + a2 362 "vsub.s32 q3, q11, q10 \n" // tmp[12] = a3 - a2 363 364 // Transpose 365 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] 366 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] 367 "vswp d1, d4 \n" // vtrn.64 q0, q2 368 "vswp d3, d6 \n" // vtrn.64 q1, q3 369 "vtrn.32 q0, q1 \n" 370 "vtrn.32 q2, q3 \n" 371 372 "vmov.s32 q10, #3 \n" // dc = 3 373 "vadd.s32 q0, q0, q10 \n" // dc = tmp[0] + 3 374 "vadd.s32 q12, q0, q3 \n" // a0 = dc + tmp[3] 375 "vadd.s32 q13, q1, q2 \n" // a1 = tmp[1] + tmp[2] 376 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] 377 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] 378 379 "vadd.s32 q0, q12, q13 \n" 380 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 381 "vadd.s32 q1, q9, q8 \n" 382 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 383 "vsub.s32 q2, q12, q13 \n" 384 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 385 "vsub.s32 q3, q9, q8 \n" 386 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 387 388 // set the results to output 389 "vst1.16 d0[0], [%[out]], %[kStep] \n" 390 "vst1.16 d1[0], [%[out]], %[kStep] \n" 391 "vst1.16 d2[0], [%[out]], %[kStep] \n" 392 "vst1.16 d3[0], [%[out]], %[kStep] \n" 393 "vst1.16 d0[1], [%[out]], %[kStep] \n" 394 "vst1.16 d1[1], [%[out]], %[kStep] \n" 395 "vst1.16 d2[1], [%[out]], %[kStep] \n" 396 "vst1.16 d3[1], [%[out]], %[kStep] \n" 397 "vst1.16 d0[2], [%[out]], %[kStep] \n" 398 "vst1.16 d1[2], [%[out]], %[kStep] \n" 399 "vst1.16 d2[2], [%[out]], %[kStep] \n" 400 "vst1.16 d3[2], [%[out]], %[kStep] \n" 401 "vst1.16 d0[3], [%[out]], %[kStep] \n" 402 "vst1.16 
d1[3], [%[out]], %[kStep] \n" 403 "vst1.16 d2[3], [%[out]], %[kStep] \n" 404 "vst1.16 d3[3], [%[out]], %[kStep] \n" 405 406 : [out] "+r"(out) // modified registers 407 : [in] "r"(in), [kStep] "r"(kStep) // constants 408 : "memory", "q0", "q1", "q2", "q3", 409 "q8", "q9", "q10", "q11", "q12", "q13" // clobbered 410 ); 411} 412 413#endif // WEBP_USE_NEON 414 415//------------------------------------------------------------------------------ 416// Entry point 417 418extern void VP8DspInitNEON(void); 419 420void VP8DspInitNEON(void) { 421#if defined(WEBP_USE_NEON) 422 VP8Transform = TransformTwo; 423 VP8TransformAC3 = TransformOne; // no special code here 424 VP8TransformDC = TransformDC; 425 VP8TransformWHT = TransformWHT; 426 427 VP8SimpleVFilter16 = SimpleVFilter16NEON; 428 VP8SimpleHFilter16 = SimpleHFilter16NEON; 429 VP8SimpleVFilter16i = SimpleVFilter16iNEON; 430 VP8SimpleHFilter16i = SimpleHFilter16iNEON; 431#endif // WEBP_USE_NEON 432} 433 434