1;
2;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license and patent
5;  grant that can be found in the LICENSE file in the root of the source
6;  tree. All contributing project authors may be found in the AUTHORS
7;  file in the root of the source tree.
8;
9
10    EXPORT  |vp9_idct32x32_1_add_neon|
11    ARM
12    REQUIRE8
13    PRESERVE8
14
15    AREA ||.text||, CODE, READONLY, ALIGN=2
16
    ;TODO(hkuang): put the following macros in a separate
    ;file so other idct functions could also use them.
19    MACRO
20    LD_16x8          $src, $stride
21    vld1.8           {q8}, [$src], $stride
22    vld1.8           {q9}, [$src], $stride
23    vld1.8           {q10}, [$src], $stride
24    vld1.8           {q11}, [$src], $stride
25    vld1.8           {q12}, [$src], $stride
26    vld1.8           {q13}, [$src], $stride
27    vld1.8           {q14}, [$src], $stride
28    vld1.8           {q15}, [$src], $stride
29    MEND
30
31    MACRO
32    ADD_DIFF_16x8    $diff
33    vqadd.u8         q8, q8, $diff
34    vqadd.u8         q9, q9, $diff
35    vqadd.u8         q10, q10, $diff
36    vqadd.u8         q11, q11, $diff
37    vqadd.u8         q12, q12, $diff
38    vqadd.u8         q13, q13, $diff
39    vqadd.u8         q14, q14, $diff
40    vqadd.u8         q15, q15, $diff
41    MEND
42
43    MACRO
44    SUB_DIFF_16x8    $diff
45    vqsub.u8         q8, q8, $diff
46    vqsub.u8         q9, q9, $diff
47    vqsub.u8         q10, q10, $diff
48    vqsub.u8         q11, q11, $diff
49    vqsub.u8         q12, q12, $diff
50    vqsub.u8         q13, q13, $diff
51    vqsub.u8         q14, q14, $diff
52    vqsub.u8         q15, q15, $diff
53    MEND
54
55    MACRO
56    ST_16x8          $dst, $stride
57    vst1.8           {q8}, [$dst], $stride
58    vst1.8           {q9}, [$dst], $stride
59    vst1.8           {q10},[$dst], $stride
60    vst1.8           {q11},[$dst], $stride
61    vst1.8           {q12},[$dst], $stride
62    vst1.8           {q13},[$dst], $stride
63    vst1.8           {q14},[$dst], $stride
64    vst1.8           {q15},[$dst], $stride
65    MEND
66
;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
;                              int dest_stride)
;
; DC-only 32x32 inverse transform: derives one value a1 from input[0] and
; adds it, with unsigned 8-bit saturation, to every byte of the 32x32 block
; at dest.
;
; r0  int16_t *input       (only input[0] is read)
; r1  uint8_t *dest
; r2  int dest_stride      (byte step between consecutive rows)
;
; Register use:
;   r0       input[0] -> out -> a1 -> |a1| (saturated), then loop counter
;   r1       load pointer
;   r3       dest + 16, start of the right-hand 16 columns
;   r12      cospi_16_64, then store pointer
;   q0       |a1| replicated into all 16 byte lanes
;   q8-q15   eight rows of 16 pixels (see LD_16x8 / ST_16x8)
;
; The block is processed as two 16-column halves. Each loop iteration
; handles 16 rows of one half; after two iterations both pointers are
; moved to dest + 16 for the right half.
;
; Leaf function: no call is made and lr is never written, so nothing is
; pushed and the function returns with bx lr.

|vp9_idct32x32_1_add_neon| PROC
    pld              [r1]
    add              r3, r1, #16               ; r3 = dest + 16 (right half)
    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended

    ; generate cospi_16_64 = 11585
    mov              r12, #0x2d00
    add              r12, r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul              r0, r0, r12               ; input[0] * cospi_16_64
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; out = dct_const_round_shift(out * cospi_16_64)
    mul              r0, r0, r12               ; out * cospi_16_64
    mov              r12, r1                   ; r12 = store pointer (dest)
    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
    asr              r0, r0, #14               ; >> DCT_CONST_BITS

    ; a1 = ROUND_POWER_OF_TWO(out, 6)
    add              r0, r0, #32               ; + (1 <<((6) - 1))
    asrs             r0, r0, #6                ; >> 6, sets N/Z/C only
    ; asrs leaves V unchanged, so the sign must be tested with bpl (N clear).
    ; bge (N == V) would depend on whatever V the caller left behind.
    bpl              diff_positive_32_32

diff_negative_32_32
    neg              r0, r0                    ; r0 = |a1|
    usat             r0, #8, r0                ; clamp to 0..255
    vdup.u8          q0, r0
    mov              r0, #4                    ; 4 iterations x 16 rows

diff_negative_32_32_loop
    LD_16x8          r1, r2
    SUB_DIFF_16x8    q0
    ST_16x8          r12, r2

    LD_16x8          r1, r2
    SUB_DIFF_16x8    q0
    ST_16x8          r12, r2

    cmp              r0, #3                    ; second iteration just done?
    moveq            r1, r3                    ; yes: continue at dest + 16
    moveq            r12, r3
    subs             r0, r0, #1
    bne              diff_negative_32_32_loop
    bx               lr

diff_positive_32_32
    usat             r0, #8, r0                ; clamp to 0..255
    vdup.u8          q0, r0
    mov              r0, #4                    ; 4 iterations x 16 rows

diff_positive_32_32_loop
    LD_16x8          r1, r2
    ADD_DIFF_16x8    q0
    ST_16x8          r12, r2

    LD_16x8          r1, r2
    ADD_DIFF_16x8    q0
    ST_16x8          r12, r2

    cmp              r0, #3                    ; second iteration just done?
    moveq            r1, r3                    ; yes: continue at dest + 16
    moveq            r12, r3
    subs             r0, r0, #1
    bne              diff_positive_32_32_loop
    bx               lr

    ENDP             ; |vp9_idct32x32_1_add_neon|
144    END
145