fastfdct4x4_neon.asm revision 90d3ed91ae9228e1c8bab561b6138d4cb8c1e4fd
1;
2;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license and patent
5;  grant that can be found in the LICENSE file in the root of the source
6;  tree. All contributing project authors may be found in the AUTHORS
7;  file in the root of the source tree.
8;
9
10
11    EXPORT  |vp8_fast_fdct4x4_neon|
12
13    ARM
14    REQUIRE8
15    PRESERVE8
16
17    AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; vp8_fast_fdct4x4_neon
; C reference: void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;
; In:    r0 = input   4 rows, each 4 signed 16-bit values
;        r1 = output  receives 16 signed 16-bit values (32 bytes)
;        r2 = pitch   byte step applied to r0 after each input row
; Out:   no register result; the coefficients are stored at [r1]
; Clobb: r0, r12, q0-q3 (d0-d7), q8-q12 (d16-d25)
;        d8-d15 are callee-saved under the AAPCS, so every temporary is
;        kept in d0-d7 / d16-d25 and no vpush/vpop is required.
;
; NOTE:
; The input is *src_diff, which is calculated as:
;   diff_ptr[c] = src_ptr[c] - pred_ptr[c];   (in the Subtract* functions)
; where *src_ptr and *pred_ptr are both unsigned char.
; Therefore *src_diff is in the range [-255, 255].
; CAUTION:
; The input values of the 25th block are set in vp8_build_dcblock and fall
; outside [-255, 255]. The VP8 encoder only uses vp8_short_fdct4x4_c for the
; 25th block, never vp8_fast_fdct4x4_c. Assuming *input in [-255, 255] is
; therefore fine for vp8_fast_fdct4x4_c, but not for vp8_short_fdct4x4_c.
;
; Multiplies: vqdmulh.s16 yields the high half of 2*a*b; the vshr #1 that
; follows turns that into (a*b)>>16. The coefficients are read as signed
; 16-bit lanes of d0 (d0[0]=x_c1, d0[1]=x_c2, d0[2]=x_c3).
;-----------------------------------------------------------------------

|vp8_fast_fdct4x4_neon| PROC
    vld1.16         {d2}, [r0], r2              ;load input row 0, r0 += pitch
    ldr             r12, _ffdct_coeff_          ;r12 = address of coefficient table
    vld1.16         {d3}, [r0], r2              ;row 1
    vld1.16         {d4}, [r0], r2              ;row 2
    vld1.16         {d0}, [r12]                 ;d0 = {x_c1, x_c2, x_c3, 0}
    vld1.16         {d5}, [r0], r2              ;row 3

    ;First for-loop
    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
    vsub.s16        d18, d3, d4             ;ip[1]-ip[2]
    vsub.s16        d19, d2, d5             ;ip[0]-ip[3]
    vshl.i16        q3, q3, #1              ; a1, b1  (first pass works on 2x the sums)
    vshl.i16        q9, q9, #1              ; c1, d1

    vadd.s16        d20, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d21, d6, d7             ;temp2 = a1 - b1

    vqdmulh.s16     q11, q10, d0[1]         ;q10 = {temp1, temp2}, scaled by x_c2
    vqdmulh.s16     q8, q9, d0[0]           ;q9  = {c1, d1},       scaled by x_c1
    vqdmulh.s16     q12, q9, d0[2]          ;q9  = {c1, d1},       scaled by x_c3

    vshr.s16        q11, q11, #1
    vshr.s16        q8, q8, #1
    vshr.s16        q12, q12, #1            ;d24:temp1 = ( c1 * x_c3)>>16;  d25:temp1 =  (d1 * x_c3)>>16
    vadd.s16        q8, q9, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d20, d22            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d21, d23            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d24, d17            ;op[1] = temp1 + temp2
    vsub.s16        d5, d25, d16            ;op[3] = temp1 - temp2

    ;Second for-loop
    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
    vsub.s16        d18, d3, d4             ;c1 = ip[4]-ip[8]
    vsub.s16        d19, d2, d5             ;d1 = ip[0]-ip[12]

    vadd.s16        d20, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d21, d6, d7             ;temp2 = a1 - b1


    vqdmulh.s16     q11, q10, d0[1]         ;{temp1, temp2} scaled by x_c2
    vqdmulh.s16     q8, q9, d0[0]           ;{c1, d1}       scaled by x_c1
    vqdmulh.s16     q12, q9, d0[2]          ;{c1, d1}       scaled by x_c3

    vshr.s16        q11, q11, #1
    vshr.s16        q8, q8, #1
    vshr.s16        q12, q12, #1            ;d24:temp1 = ( c1 * x_c3)>>16;  d25:temp1 =  (d1 * x_c3)>>16
    vadd.s16        q8, q9, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d20, d22            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d21, d23            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d24, d17            ;b2 = temp1 + temp2
    vsub.s16        d5, d25, d16            ;d2 = temp1 - temp2

    ;Final scaling: result = (x + (x < 0)) >> 1
    vclt.s16        q3, q1, #0              ;lane = -1 where x < 0, else 0
    vclt.s16        q9, q2, #0

    vsub.s16        q1, q1, q3              ;x - (-1) = x + 1 for negative lanes
    vsub.s16        q2, q2, q9

    vshr.s16        q1, q1, #1
    vshr.s16        q2, q2, #1

    vst1.16         {q1, q2}, [r1]          ;store all 16 results (d2, d3, d4, d5 in order)

    bx              lr

    ENDP
112
113;-----------------
114    AREA    fastfdct_dat, DATA, READONLY
115;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
116;One word each is reserved. Label filter_coeff can be used to access the data.
117;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
118_ffdct_coeff_
119    DCD     ffdct_coeff
120ffdct_coeff
121; 60547 =  0xEC83
122; 46341 =  0xB505
123; 25080 =  0x61F8
124    DCD     0xB505EC83, 0x000061F8
125
126    END
127