omxVCM4P10_PredictIntra_16x16_s.S revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16/*
17 *
18 */
19
20    .eabi_attribute 24, 1
21    .eabi_attribute 25, 1
22
23    .arm
24    .fpu neon
25
26    .text
27    .align 4
28;//-------------------------------------------------------
29;// This table implements a C-style switch statement in asm
30;// using the method of two levels of indexing.
31;//-------------------------------------------------------
32
;// Handler offsets for the four mode paths, indexed by the mode value
;// that the function loads into r6 (entry order: VERT, HOR, DC, PLANE).
;// Each entry is the handler address minus (P0+8): the dispatch
;// instruction at P0 adds the entry to pc, and in ARM state pc reads
;// as the address of the executing instruction plus 8.
33armVCM4P10_pIndexTable16x16:
34    .word  OMX_VC_16X16_VERT-(P0+8), OMX_VC_16X16_HOR-(P0+8)
35    .word  OMX_VC_16X16_DC-(P0+8),   OMX_VC_16X16_PLANE-(P0+8)
36
37
;// Halfword constants read by the OMX_VC_16X16_PLANE path with three
;// successive 8-halfword loads (16 bytes each):
;//   row 0     - weights multiplied with the eight neighbour differences
;//   rows 1, 2 - column indices 0..7 and 8..15, multiplied by the
;//               horizontal slope to form the first output row
38armVCM4P10_MultiplierTable16x16:
39    .hword   7,  6,  5,  4,  3,  2,  1,  8      ;// row 0: difference weights
40    .hword   0,  1,  2,  3,  4,  5,  6,  7      ;// row 1: column indices 0..7
41    .hword   8,  9, 10, 11, 12, 13, 14, 15      ;// row 2: column indices 8..15
42
43
;//-------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16
;//
;// Fills a 16x16 block of bytes at [r3] (row stride r5) using one of
;// four code paths selected by the mode index in r6.
;//
;// Register arguments, as used below:
;//   r0 = pointer to 16 bytes that are read one at a time with stride r4
;//        (presumably the left-neighbour column - confirm against the API header)
;//   r1 = pointer to 16 contiguous bytes (presumably the row above - confirm)
;//   r2 = pointer to a single byte (presumably the above-left sample - confirm)
;//   r3 = destination pointer
;// Stack arguments. The PUSH of 10 core registers (40 bytes) plus the
;// VPUSH of 8 D registers (64 bytes) moves sp down by 0x68, so the
;// first stack word of the caller is at [sp,#0x68]:
;//   [sp,#0x68] -> r4 = byte stride for the r0 reads
;//   [sp,#0x6c] -> r5 = byte stride for the r3 writes
;//   [sp,#0x70] -> r6 = mode index: 0 VERT, 1 HOR, 2 DC, 3 PLANE
;//   [sp,#0x74] -> r7 = flag word, tested only by the DC path
;//                      (bit 0 enables the r1 row, bit 1 enables the r0 column)
;//
;// Returns r0 = 0 on every path. r4-r12 and d8-d15 are saved on entry
;// and restored on every exit.
;//
;// NOTE(review): r6 is not range-checked before it indexes the offset
;// table; callers are presumably expected to pass 0..3 - confirm.
;//-------------------------------------------------------
44    .global omxVCM4P10_PredictIntra_16x16
45    .func   omxVCM4P10_PredictIntra_16x16
46omxVCM4P10_PredictIntra_16x16:
47    PUSH     {r4-r12,lr}                      ;// 10 core registers = 40 bytes
48    VPUSH    {d8-d15}                         ;// 8 D registers = 64 bytes; stack args now start at sp+0x68
49    ADR      r9, armVCM4P10_pIndexTable16x16
50    LDR      r6,[sp,#0x70]                    ;// r6 = mode index (third stack word)
51    LDR      r4,[sp,#0x68]                    ;// r4 = stride for reads through r0 (first stack word)
52    LDR      r5,[sp,#0x6c]                    ;// r5 = stride for writes through r3 (second stack word)
53    LDR      r7,[sp,#0x74]                    ;// r7 = flag word for the DC path (fourth stack word)
54    MOV      r12,#0x10                        ;// r12 = 16, row counter for the HOR and PLANE loops
55    LDR      r9,[r9,r6,LSL #2]                ;// r9 = table[r6], a handler offset relative to P0+8
56P0: ADD      pc,r9                            ;// pc reads as P0+8 here, so this lands on the handler

;// ---- VERT: copy the 16 bytes at [r1] into each of the 16 rows ----
57OMX_VC_16X16_VERT:
58    VLD1.8   {d0,d1},[r1]                     ;// q0 = the 16 source bytes
59    ADD      r8,r3,r5                         ;// r8 = row 1; r3 writes even rows, r8 odd rows
60    ADD      r10,r5,r5                        ;// r10 = 2 * stride, the step for both pointers
61    VST1.8   {d0,d1},[r3],r10
62    VST1.8   {d0,d1},[r8],r10
63    VST1.8   {d0,d1},[r3],r10
64    VST1.8   {d0,d1},[r8],r10
65    VST1.8   {d0,d1},[r3],r10
66    VST1.8   {d0,d1},[r8],r10
67    VST1.8   {d0,d1},[r3],r10
68    VST1.8   {d0,d1},[r8],r10
69    VST1.8   {d0,d1},[r3],r10
70    VST1.8   {d0,d1},[r8],r10
71    VST1.8   {d0,d1},[r3],r10
72    VST1.8   {d0,d1},[r8],r10
73    VST1.8   {d0,d1},[r3],r10
74    VST1.8   {d0,d1},[r8],r10
75    VST1.8   {d0,d1},[r3]                     ;// rows 14 and 15: no pointer writeback needed
76    VST1.8   {d0,d1},[r8]
77    MOV      r0,#0                            ;// return value 0
78    VPOP     {d8-d15}
79    POP      {r4-r12,pc}

;// ---- HOR: each row is one byte from the r0 column repeated 16 times ----
80OMX_VC_16X16_HOR:
81    ADD      r8,r0,r4                         ;// r8 = source byte for row 1 (odd rows)
82    ADD      r4,r4,r4                         ;// r4 = 2 * source stride
83    ADD      r11,r3,r5                        ;// r11 = destination row 1 (odd rows)
84    ADD      r5,r5,r5                         ;// r5 = 2 * destination stride
85L0x8c:                                         ;// one pass = 8 rows; r12 goes 16 -> 8 -> 0
86    VLD1.8   {d2[],d3[]},[r0],r4              ;// load one byte and replicate it to all 16 lanes of q1
87    VLD1.8   {d0[],d1[]},[r8],r4              ;// same for the odd row, into q0
88    SUBS     r12,r12,#8                       ;// flags are consumed by the BNE at the end of the pass
89    VST1.8   {d2,d3},[r3],r5
90    VST1.8   {d0,d1},[r11],r5
91    VLD1.8   {d2[],d3[]},[r0],r4
92    VLD1.8   {d0[],d1[]},[r8],r4
93    VST1.8   {d2,d3},[r3],r5
94    VST1.8   {d0,d1},[r11],r5
95    VLD1.8   {d2[],d3[]},[r0],r4
96    VLD1.8   {d0[],d1[]},[r8],r4
97    VST1.8   {d2,d3},[r3],r5
98    VST1.8   {d0,d1},[r11],r5
99    VLD1.8   {d2[],d3[]},[r0],r4
100    VLD1.8   {d0[],d1[]},[r8],r4
101    VST1.8   {d2,d3},[r3],r5
102    VST1.8   {d0,d1},[r11],r5
103    BNE      L0x8c
104    MOV      r0,#0                            ;// return value 0
105    VPOP     {d8-d15}
106    POP      {r4-r12,pc}

;// ---- DC: fill the block with a single byte value ----
;// The value is the rounded mean of whichever neighbour vectors the
;// flags in r7 enable: (sum+8)>>4 for one vector, (sum+16)>>5 for
;// both, or the constant 0x80 when neither bit is set.
107OMX_VC_16X16_DC:
108    MOV      r11,#0                           ;// r11 counts the neighbour vectors that were summed
109    TST      r7,#2
110    BEQ      L0x14c                           ;// bit 1 clear: skip the r0 column
111    ADD      r8,r0,r4                         ;// r8 = odd entries of the column
112    ADD      r10,r4,r4                        ;// r10 = 2 * source stride
113    VLD1.8   {d2[0]},[r0],r10                 ;// gather the 16 column bytes into d2,d3 lane by lane
114    VLD1.8   {d2[1]},[r8],r10
115    VLD1.8   {d2[2]},[r0],r10
116    VLD1.8   {d2[3]},[r8],r10
117    VLD1.8   {d2[4]},[r0],r10
118    VLD1.8   {d2[5]},[r8],r10
119    VLD1.8   {d2[6]},[r0],r10
120    VLD1.8   {d2[7]},[r8],r10
121    VLD1.8   {d3[0]},[r0],r10
122    VLD1.8   {d3[1]},[r8],r10
123    VLD1.8   {d3[2]},[r0],r10
124    VLD1.8   {d3[3]},[r8],r10
125    VLD1.8   {d3[4]},[r0],r10
126    VLD1.8   {d3[5]},[r8],r10
127    VLD1.8   {d3[6]},[r0],r10
128    VLD1.8   {d3[7]},[r8]
129    VPADDL.U8 q0,q1                           ;// 16 bytes -> 8 halfword pair sums
130    ADD      r11,r11,#1
131    VPADD.I16 d0,d0,d1                        ;// 8 -> 4 halfword sums
132    VPADDL.U16 d0,d0                          ;// 4 -> 2 word sums
133    VPADDL.U32 d6,d0                          ;// d6 = sum of the 16 column bytes (64-bit)
134    VRSHR.U64 d8,d6,#4                        ;// d8 = (sum + 8) >> 4, used if only this vector is enabled
135L0x14c:
136    TST      r7,#1
137    BEQ      L0x170                           ;// bit 0 clear: skip the r1 row
138    VLD1.8   {d0,d1},[r1]
139    ADD      r11,r11,#1
140    VPADDL.U8 q0,q0                           ;// same reduction as above, on the 16 row bytes
141    VPADD.I16 d0,d0,d1
142    VPADDL.U16 d0,d0
143    VPADDL.U32 d7,d0                          ;// d7 = sum of the 16 row bytes (64-bit)
144    VRSHR.U64 d8,d7,#4                        ;// d8 = (sum + 8) >> 4; replaced below if both were summed
145L0x170:
146    CMP      r11,#2
147    BNE      L0x180
148    VADD.I64 d8,d7,d6                         ;// both enabled: combine the two sums
149    VRSHR.U64 d8,d8,#5                        ;// d8 = (sum of 32 bytes + 16) >> 5
150L0x180:
151    VDUP.8   q3,d8[0]                         ;// broadcast the low byte of d8; q3 aliases d6,d7, no longer needed
152    CMP      r11,#0
153    ADD      r8,r3,r5                         ;// r8 = destination row 1 (odd rows)
154    ADD      r10,r5,r5                        ;// r10 = 2 * destination stride
155    BNE      L0x198
156    VMOV.I8  q3,#0x80                         ;// nothing was summed: fill with 0x80 instead
157L0x198:
158    VST1.8   {d6,d7},[r3],r10
159    VST1.8   {d6,d7},[r8],r10
160    VST1.8   {d6,d7},[r3],r10
161    VST1.8   {d6,d7},[r8],r10
162    VST1.8   {d6,d7},[r3],r10
163    VST1.8   {d6,d7},[r8],r10
164    VST1.8   {d6,d7},[r3],r10
165    VST1.8   {d6,d7},[r8],r10
166    VST1.8   {d6,d7},[r3],r10
167    VST1.8   {d6,d7},[r8],r10
168    VST1.8   {d6,d7},[r3],r10
169    VST1.8   {d6,d7},[r8],r10
170    VST1.8   {d6,d7},[r3],r10
171    VST1.8   {d6,d7},[r8],r10
172    VST1.8   {d6,d7},[r3],r10
173    VST1.8   {d6,d7},[r8],r10
174    MOV      r0,#0                            ;// return value 0
175    VPOP     {d8-d15}
176    POP      {r4-r12,pc}

;// ---- PLANE: linear gradient fitted to the neighbour samples ----
;// Notation used in the comments below:
;//   A[i] = byte i of the 16 bytes at [r1]
;//   L[i] = byte i of the 16 bytes gathered through r0
;//   C    = the byte at [r2]
;//   H = 8*(A[15]-C) + sum over i=0..6 of (7-i)*(A[14-i]-A[i])
;//   V = the same expression with L in place of A
;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6,  a = 16*(A[15] + L[15])
;// Output byte at column x, row y is (a + b*(x-7) + c*(y-7) + 16) >> 5,
;// saturated to 0..255 (16-bit lane arithmetic).
177OMX_VC_16X16_PLANE:
178    ADR      r9, armVCM4P10_MultiplierTable16x16
179    VLD1.8   {d0,d1},[r1]                     ;// d0 = A[0..7], d1 = A[8..15]
180    VLD1.8   {d4[0]},[r2]                     ;// d4 lane 0 = C; the other d4 lanes are not loaded
181    ADD      r8,r0,r4                         ;// r8 = odd entries of the column
182    ADD      r10,r4,r4                        ;// r10 = 2 * source stride
183    VLD1.8   {d2[0]},[r0],r10                 ;// gather L[0..15] into d2,d3 lane by lane
184    VLD1.8   {d2[1]},[r8],r10
185    VLD1.8   {d2[2]},[r0],r10
186    VLD1.8   {d2[3]},[r8],r10
187    VLD1.8   {d2[4]},[r0],r10
188    VLD1.8   {d2[5]},[r8],r10
189    VLD1.8   {d2[6]},[r0],r10
190    VLD1.8   {d2[7]},[r8],r10
191    VLD1.8   {d3[0]},[r0],r10
192    VLD1.8   {d3[1]},[r8],r10
193    VLD1.8   {d3[2]},[r0],r10
194    VLD1.8   {d3[3]},[r8],r10
195    VLD1.8   {d3[4]},[r0],r10
196    VLD1.8   {d3[5]},[r8],r10
197    VLD1.8   {d3[6]},[r0],r10
198    VLD1.8   {d3[7]},[r8]
199    VREV64.8 d5,d1                            ;// d5[i] = A[15-i]
200    VSUBL.U8 q3,d5,d4                         ;// d6 lane 0 = A[15] - C; only that lane is used later
201    VSHR.U64 d5,d5,#8                         ;// drop A[15]: d5[i] = A[14-i] for i = 0..6, top byte 0
202    VSUBL.U8 q4,d5,d0                         ;// q4[i] = A[14-i] - A[i] for i = 0..6; lane 7 is discarded below
203    VSHL.I64 d9,d9,#16                        ;// move q4[4..6] up one halfword, pushing lane 7 out
204    VEXT.8   d9,d9,d6,#2                      ;// d9 = { q4[4], q4[5], q4[6], A[15]-C }
205    VREV64.8 d12,d3                           ;// same sequence for the column: d12[i] = L[15-i]
206    VSUBL.U8 q7,d12,d4                        ;// d14 lane 0 = L[15] - C
207    VSHR.U64 d12,d12,#8
208    VSUBL.U8 q8,d12,d2                        ;// q8[i] = L[14-i] - L[i] for i = 0..6
209    VLD1.16  {d20,d21},[r9]!                  ;// q10 = table row 0: weights 7,6,5,4,3,2,1,8
210    VSHL.I64 d17,d17,#16
211    VEXT.8   d17,d17,d14,#2                   ;// d17 = { q8[4], q8[5], q8[6], L[15]-C }
212    VMULL.S16 q11,d8,d20                      ;// weighted A differences, lanes 0..3
213    VMULL.S16 q12,d16,d20                     ;// weighted L differences, lanes 0..3
214    VMLAL.S16 q11,d9,d21                      ;// accumulate lanes 4..7
215    VMLAL.S16 q12,d17,d21
216    VPADD.I32 d22,d23,d22                     ;// d22 = two partial sums of H
217    VPADD.I32 d23,d25,d24                     ;// d23 = two partial sums of V
218    VPADDL.S32 q11,q11                        ;// d22 = H, d23 = V (64-bit each)
219    VSHL.I64 q12,q11,#2                       ;// 4H, 4V
220    VADD.I64 q11,q11,q12                      ;// 5H, 5V
221    VRSHR.S64 q11,q11,#6                      ;// d22 = b = (5H+32)>>6, d23 = c = (5V+32)>>6
222    VSHL.I64 q12,q11,#3                       ;// 8b, 8c
223    VSUB.I64 q12,q12,q11                      ;// d24 = 7b, d25 = 7c
224    VLD1.16  {d20,d21},[r9]!                  ;// q10 = table row 1: column indices 0..7
225    VDUP.16  q6,d22[0]                        ;// q6 = b in every halfword lane
226    VDUP.16  q7,d23[0]                        ;// q7 = c in every halfword lane (per-row increment)
227    VADDL.U8 q11,d1,d3                        ;// q11[i] = A[8+i] + L[8+i]; b and c already copied out
228    VSHL.I16 q11,q11,#4                       ;// times 16
229    VDUP.16  q11,d23[3]                       ;// q11 = a = 16*(A[15]+L[15]) in every lane
230    VADD.I64 d1,d24,d25                       ;// d1 = 7b + 7c
231    VLD1.16  {d24,d25},[r9]                   ;// q12 = table row 2: column indices 8..15
232    VDUP.16  q13,d1[0]                        ;// low halfword of 7b+7c in every lane
233    VSUB.I16 q13,q11,q13                      ;// q13 = a - 7b - 7c
234    VMUL.I16 q5,q6,q10                        ;// b*x for x = 0..7
235    VMUL.I16 q6,q6,q12                        ;// b*x for x = 8..15
236    VADD.I16 q0,q5,q13                        ;// q0 = unrounded row 0, columns 0..7
237    VADD.I16 q1,q6,q13                        ;// q1 = unrounded row 0, columns 8..15
238L0x2d4:                                       ;// one row per pass; r12 counts 16 rows
239    VQRSHRUN.S16 d6,q0,#5                     ;// (v + 16) >> 5, saturated to an unsigned byte
240    VQRSHRUN.S16 d7,q1,#5
241    SUBS     r12,r12,#1                       ;// flags are consumed by the BNE below
242    VST1.8   {d6,d7},[r3],r5
243    VADD.I16 q0,q0,q7                         ;// step to the next row: add c to every column
244    VADD.I16 q1,q1,q7
245    BNE      L0x2d4
246    MOV      r0,#0                            ;// return value 0
247    VPOP     {d8-d15}
248    POP      {r4-r12,pc}
249    .endfunc
250
251    .end
252
253