shortidct4x4llm_neon.asm revision 90d3ed91ae9228e1c8bab561b6138d4cb8c1e4fd
190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;  Use of this source code is governed by a BSD-style license and patent
590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;  grant that can be found in the LICENSE file in the root of the source
690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;  tree. All contributing project authors may be found in the AUTHORS
790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;  file in the root of the source tree.
890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    EXPORT  |vp8_short_idct4x4llm_neon|      ;make the routine's symbol visible to the linker
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ARM                                      ;assemble the following code as ARM (not Thumb) instructions
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    REQUIRE8                                 ;declares that this file requires 8-byte stack alignment
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    PRESERVE8                                ;declares that this file preserves 8-byte stack alignment
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    AREA ||.text||, CODE, READONLY, ALIGN=2  ;read-only code section, aligned to 2^2 = 4 bytes
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;*************************************************************
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_short_idct4x4llm_neon(short *input, short *output, int pitch)
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;r0 short * input
2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;r1 short * output
2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;r2 int pitch
2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;*************************************************************
2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;static const int cospi8sqrt2minus1=20091;
2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;static const int sinpi8sqrt2      =35468;
2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;static const int rounding = 0;
2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Optimization note: The data resulting from dequantization are signed 13-bit values
2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;in the range [-4096, 4095]. This allows the NEON "vqdmulh" instruction to be used, since
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;the product cannot go out of range (13+16+1=30 bits < 32 bits). This instruction gives the
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;high half of the multiplication result, which is what the IDCT needs.
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;Two-pass 4x4 inverse DCT on 16 signed 16-bit coefficients.
;  r0 = input  (16 contiguous shorts, read with a single 32-byte load)
;  r1 = output (four rows of 4 shorts; consecutive rows are r2 bytes apart)
;  r2 = pitch  (added unscaled to the output pointer, i.e. a byte stride)
;Each pass computes, per 16-bit lane (x0..x3 = the four values of one column/row):
;  a1 = x0 + x2,  b1 = x0 - x2
;  c1 = ((x1*sinpi8sqrt2) >> 16) - (x3 + ((x3*cospi8sqrt2minus1) >> 16))
;  d1 = (x1 + ((x1*cospi8sqrt2minus1) >> 16)) + ((x3*sinpi8sqrt2) >> 16)
;  out0 = a1 + d1, out1 = b1 + c1, out2 = b1 - c1, out3 = a1 - d1
;and is followed by a 4x4 transpose. The second pass also applies (x + 4) >> 3.
;All adds/subtracts are the saturating NEON forms (vqadd/vqsub).
;Clobbers r0, r3, r12, d0 and q1-q3. d8-d13 are used as scratch and are saved and
;restored here, because the AAPCS makes d8-d15 callee-saved.
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber|vp8_short_idct4x4llm_neon| PROC
    vpush           {d8-d13}                ;save callee-saved d8-d13 (48 bytes, keeps sp 8-byte aligned)
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r12, _idct_coeff_       ;r12 = word stored at _idct_coeff_ = address of idct_coeff
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.16         {q1, q2}, [r0]          ;d2..d5 = in[0..3], in[4..7], in[8..11], in[12..15]
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.16         {d0}, [r12]             ;d0 = halfwords 0x4e7b, 0x4e7b, 0x8a8c, 0x8a8c
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d3, d4                  ;q1 = in[0..3],in[8..11]; q2 = in[4..7],in[12..15]
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqdmulh.s16     q3, q2, d0[2]           ;q3 = (2 * q2 * 0x8a8c as signed) >> 16   (sinpi8sqrt2 term)
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqdmulh.s16     q4, q2, d0[0]           ;q4 = (2 * q2 * 0x4e7b) >> 16             (cospi8sqrt2minus1 term)
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d12, d2, d3             ;a1 = x0 + x2
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d13, d2, d3             ;b1 = x0 - x2
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.s16        q3, q3, #1              ;undo the doubling done by vqdmulh
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.s16        q4, q4, #1              ;undo the doubling done by vqdmulh
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q3, q3, q2              ;correction: sinpi8sqrt2 > 65536/2, so as s16 it is 35468-65536 (negative); adding x restores x*35468>>16
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q4, q4, q2              ;x + ((x * cospi8sqrt2minus1) >> 16)
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;d6 - c1:temp1 = (x1 * sinpi8sqrt2) >> 16
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;d7 - d1:temp2 = (x3 * sinpi8sqrt2) >> 16
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;d8 - d1:temp1 = x1 + ((x1 * cospi8sqrt2minus1) >> 16)
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;d9 - c1:temp2 = x3 + ((x3 * cospi8sqrt2minus1) >> 16)
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d10, d6, d9             ;c1
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d11, d7, d8             ;d1
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d2, d12, d11            ;out0 = a1 + d1
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d3, d13, d10            ;out1 = b1 + c1
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d4, d13, d10            ;out2 = b1 - c1
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d5, d12, d11            ;out3 = a1 - d1
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;transpose the 4x4 block of halfwords held in d2..d5
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.32         d2, d4
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.32         d3, d5
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.16         d2, d3
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.16         d4, d5
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;second pass: same butterfly applied to the transposed data
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d3, d4                  ;q1 = vectors 0 and 2, q2 = vectors 1 and 3
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqdmulh.s16     q3, q2, d0[2]           ;sinpi8sqrt2 term (doubled high half)
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqdmulh.s16     q4, q2, d0[0]           ;cospi8sqrt2minus1 term (doubled high half)
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d12, d2, d3             ;a1
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d13, d2, d3             ;b1
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.s16        q3, q3, #1              ;undo the doubling done by vqdmulh
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.s16        q4, q4, #1              ;undo the doubling done by vqdmulh
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q3, q3, q2              ;correction: sinpi8sqrt2 > 65536/2, so as s16 it is negative; adding x compensates
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q4, q4, q2              ;x + ((x * cospi8sqrt2minus1) >> 16)
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d10, d6, d9             ;c1
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d11, d7, d8             ;d1
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d2, d12, d11            ;a1 + d1
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       d3, d13, d10            ;b1 + c1
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d4, d13, d10            ;b1 - c1
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqsub.s16       d5, d12, d11            ;a1 - d1
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;rounding shift right by 3: (x + 4) >> 3
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vrshr.s16       d2, d2, #3
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vrshr.s16       d3, d3, #3
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vrshr.s16       d4, d4, #3
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vrshr.s16       d5, d5, #3
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r3, r1, r2              ;r3  = output + 1*pitch
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r12, r3, r2             ;r12 = output + 2*pitch
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r0, r12, r2             ;r0  = output + 3*pitch (input pointer no longer needed)
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;transpose again so that each d register holds one output row
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.32         d2, d4
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.32         d3, d5
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.16         d2, d3
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vtrn.16         d4, d5
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.16         {d2}, [r1]              ;row 0 (4 shorts)
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.16         {d3}, [r3]              ;row 1
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.16         {d4}, [r12]             ;row 2
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.16         {d5}, [r0]              ;row 3
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    vpop            {d8-d13}                ;restore callee-saved d8-d13
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bx             lr
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ENDP
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;-----------------
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Data section named idct4x4_dat. DCD reserves one 32-bit word per listed value.
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;_idct_coeff_ holds a single word: the address of idct_coeff (loaded into r12 by the routine).
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;idct_coeff holds two words, i.e. four 16-bit multipliers that the routine loads into d0.
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber_idct_coeff_
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     idct_coeff
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberidct_coeff
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     0x4e7b4e7b, 0x8a8c8a8c
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;As unsigned halfwords: 20091, 20091, 35468, 35468 (0x4e7b = cospi8sqrt2minus1, 0x8a8c = sinpi8sqrt2)
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    END
127