shortidct4x4llm_neon.asm revision 90d3ed91ae9228e1c8bab561b6138d4cb8c1e4fd
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


    EXPORT  |vp8_short_idct4x4llm_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;*************************************************************
;void vp8_short_idct4x4llm_neon(short *input, short *output, int pitch)
;r0    short * input    16 coefficients, read as four rows of four s16
;r1    short * output   destination of the first 4-sample output row
;r2    int pitch        byte offset added to r1 to reach each next row
;
;Scratch registers written: r0, r3, r12, d0, q1-q3 (d2-d7), q8-q10 (d16-d21).
;Only caller-saved NEON registers are used as temporaries; d8-d15 are
;callee-saved under the AAPCS and are deliberately left untouched.
;*************************************************************
;static const int cospi8sqrt2minus1=20091;
;static const int sinpi8sqrt2      =35468;
;static const int rounding = 0;
;Optimization note: The data resulting from dequantization are signed 13-bit
;values in the range [-4096, 4095]. This allows the use of the "vqdmulh" (neon)
;instruction since it won't go out of range (13+16+1=30bits<32bits). This
;instruction gives the high half of the multiplication result that is needed
;in the IDCT.

|vp8_short_idct4x4llm_neon| PROC
    ldr             r12, _idct_coeff_       ;r12 = address of idct_coeff
    vld1.16         {q1, q2}, [r0]          ;d2..d5 = input rows 0..3
    vld1.16         {d0}, [r12]             ;d0 = {20091, 20091, 35468, 35468}

    ;---- first pass ----
    vswp            d3, d4                  ;d2=row0 d3=row2, q2=(row1,row3) i.e. ip[4], ip[12]

    ;vqdmulh yields (2*x*c)>>16; the vshr #1 below turns that into (x*c)>>16.
    vqdmulh.s16     q3, q2, d0[2]           ;c = 35468, seen as s16 = 35468-65536
    vqdmulh.s16     q8, q2, d0[0]           ;c = 20091

    vqadd.s16       d20, d2, d3             ;a1 = ip[0] + ip[8]
    vqsub.s16       d21, d2, d3             ;b1 = ip[0] - ip[8]

    vshr.s16        q3, q3, #1
    vshr.s16        q8, q8, #1

    ;sinpi8sqrt2 > 65536/2, so as an s16 scalar it is 35468-65536 (negative);
    ;adding x back restores (x*35468)>>16.
    vqadd.s16       q3, q3, q2
    ;x + (x*cospi8sqrt2minus1)>>16
    vqadd.s16       q8, q8, q2

    ;d6  - c1:temp1
    ;d7  - d1:temp2
    ;d16 - d1:temp1
    ;d17 - c1:temp2

    vqsub.s16       d18, d6, d17            ;c1
    vqadd.s16       d19, d7, d16            ;d1

    vqadd.s16       d2, d20, d19            ;a1 + d1
    vqadd.s16       d3, d21, d18            ;b1 + c1
    vqsub.s16       d4, d21, d18            ;b1 - c1
    vqsub.s16       d5, d20, d19            ;a1 - d1

    ;transpose the 4x4 block held in d2..d5
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    ;---- second pass: same butterfly on the transposed data ----
    vswp            d3, d4

    vqdmulh.s16     q3, q2, d0[2]
    vqdmulh.s16     q8, q2, d0[0]

    vqadd.s16       d20, d2, d3             ;a1
    vqsub.s16       d21, d2, d3             ;b1

    vshr.s16        q3, q3, #1
    vshr.s16        q8, q8, #1

    vqadd.s16       q3, q3, q2              ;undo the s16 wrap of sinpi8sqrt2 (see first pass)
    vqadd.s16       q8, q8, q2

    vqsub.s16       d18, d6, d17            ;c1
    vqadd.s16       d19, d7, d16            ;d1

    vqadd.s16       d2, d20, d19
    vqadd.s16       d3, d21, d18
    vqsub.s16       d4, d21, d18
    vqsub.s16       d5, d20, d19

    ;rounding shift right by 3 on every result
    vrshr.s16       d2, d2, #3
    vrshr.s16       d3, d3, #3
    vrshr.s16       d4, d4, #3
    vrshr.s16       d5, d5, #3

    ;row pointers: r1, r3 = r1+pitch, r12 = r1+2*pitch, r0 = r1+3*pitch
    add             r3, r1, r2
    add             r12, r3, r2
    add             r0, r12, r2

    ;transpose back before storing
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    ;each store writes one row of four s16 values (8 bytes)
    vst1.16         {d2}, [r1]
    vst1.16         {d3}, [r3]
    vst1.16         {d4}, [r12]
    vst1.16         {d5}, [r0]

    bx             lr

    ENDP

;-----------------
    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
;Data section named idct4x4_dat. _idct_coeff_ holds the address of idct_coeff so
;the code can fetch it with a single "ldr"; idct_coeff holds the two IDCT
;constants, each replicated in both halfwords of one word.
_idct_coeff_
    DCD     idct_coeff
idct_coeff
    DCD     0x4e7b4e7b, 0x8a8c8a8c

;20091, 20091, 35468, 35468

    END