;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

Temp            RN 12
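;// Temp: second source pointer, set up to load dSrc5..dSrc8 interleaved
;// with the dSrc0..dSrc4 loads done through pSrc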

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

dSrc0           DN 7.U8
dSrc1           DN 8.U8
dSrc2           DN 9.U8
dSrc3           DN 10.U8
dSrc4           DN 11.U8
dSrc5           DN 12.U8
dSrc6           DN 13.U8
dSrc7           DN 14.U8
dSrc8           DN 15.U8

qSumBE01        QN 8.S16
qSumCD01        QN 9.S16
dSumBE0         DN 16.S16
dSumCD0         DN 18.S16

qAcc01          QN 0.S16
qAcc23          QN 1.S16
qAcc45          QN 2.S16
qAcc67          QN 3.S16

dRes0           DN 0.S16
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8


dTmp0           DN 20.S16
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16

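;// Vertical half-pel interpolation of a 4x4 block with the 6-tap
;// (1,-5,20,20,-5,1) filter. For each output row the code computes
;//     acc = (a + f) - 5*(b + e) + 20*(c + d)
;// from six vertically adjacent source rows a..f, and VQRSHRUN #5 then
;// produces saturate_u8((acc + 16) >> 5). dCoeff5 and dCoeff20 are assumed
;// to be pre-loaded with 5 and 20 by the caller; they are not initialised
;// in this unsafe routine.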
        VLD1        dSrc0, [pSrc], srcStep     ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2
        VLD1        dSrc1, [pSrc], srcStep     ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep     ;// [c0 c1 c2 c3 .. ]
        VADDL       qAcc01, dSrc0, dSrc5       ;// Acc = a+f
        VLD1        dSrc3, [pSrc], srcStep
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep ;// TeRi

        VLD1        dSrc4, [pSrc], srcStep
        VLD1        dSrc7, [Temp], srcStep ;// TeRi
        VADDL       qSumBE01, dSrc1, dSrc4     ;// b+e
        VADDL       qSumCD01, dSrc2, dSrc3     ;// c+d
        VLD1        dSrc8, [Temp], srcStep ;// TeRi
        VMLS        dRes0, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp0, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d)

;        VLD1        dSrc6, [Temp], srcStep
        VADDL       qSumBE01, dSrc2, dSrc5     ;// b+e
        VADDL       qSumCD01, dSrc3, dSrc4     ;// c+d
        VADDL       qAcc23, dSrc1, dSrc6       ;// Acc = a+f
        VMLS        dRes1, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp1, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d)

;        VLD1        dSrc7, [Temp], srcStep
        VADDL       qSumBE01, dSrc3, dSrc6     ;// b+e
        VADDL       qSumCD01, dSrc4, dSrc5     ;// c+d
        VADDL       qAcc45, dSrc2, dSrc7       ;// Acc = a+f
        VMLS        dRes2, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp2, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d)

;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]
        VADDL       qSumBE01, dSrc4, dSrc7     ;// b+e
        VADDL       qAcc67, dSrc3, dSrc8       ;// Acc = a+f
        VADDL       qSumCD01, dSrc5, dSrc6     ;// c+d
        VMLS        dRes3, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0        ;// Acc += 20*(c+d)
        VADD        dRes1, dRes1, dTmp1        ;// Acc += 20*(c+d)
        VADD        dRes2, dRes2, dTmp2        ;// Acc += 20*(c+d)
        VMLA        dRes3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VMUL        dTmp3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VADD        dRes3, dRes3, dTmp3

        VQRSHRUN    dAcc0, qAcc01, #5
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5
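;// The interpolated rows are left in dAcc0-dAcc3; no store is performed
;// here, so the caller is expected to write them out (pDst/dstStep are
;// declared but not used by this unsafe routine)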

        M_END

    ENDIF



    END
