1;//
2;//
3;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        M_VARIANTS CortexA8
17
18        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
19
20    IF CortexA8
21
;// ---------------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
;//
;// Vertical 6-tap filter producing four output rows. For output row n
;// (n = 0..3), with a..f = source rows n..n+5, the code builds in 16-bit lanes
;//
;//     Acc = (a + f) - dCoeff5*(b + e) + dCoeff20*(c + d)
;//
;// and then narrows Acc to unsigned 8 bit with a rounding, saturating
;// right shift by 5 (VQRSHRUN #5).
;//
;// Inputs
;//   pSrc     (r0)   address of the first of the nine source rows that are read
;//   srcStep  (r1)   byte step applied to the pointers after every row load
;//   dCoeff5  (d30)  never written in this routine - must be set by the caller
;//   dCoeff20 (d31)  never written in this routine - must be set by the caller
;//                   (presumably 5 and 20 in every S16 lane, going by the
;//                   names - TODO confirm at the call sites)
;//   pDst (r2) and dstStep (r3) are declared below but not referenced here.
;//
;// Outputs
;//   dAcc0..dAcc3 (d0, d2, d4, d6) hold output rows 0..3. Nothing is stored to
;//   memory by this routine. The multiply steps work on D registers (four S16
;//   lanes), so only the low four bytes of each result are fully filtered;
;//   the high four bytes are derived from a+f alone.
;//
;// Registers overwritten
;//   pSrc (advanced by 5*srcStep), Temp (r12), d0-d22.
;//   What M_START/M_END save and restore for the "r11" argument is defined in
;//   armCOMM_s.h, which is not visible from this file.
;// ---------------------------------------------------------------------------
22        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11
23
24;// Declare input registers
25pSrc            RN 0
26srcStep         RN 1
27pDst            RN 2
28dstStep         RN 3
29
;// Second row pointer, used to fetch source rows 5..8
30Temp            RN 12
31
32;// Declare Neon registers
;// Filter coefficients (read only in this routine)
33dCoeff5         DN 30.S16
34dCoeff20        DN 31.S16
35
;// Source rows 0..8, one D register (8 bytes) per row
36dSrc0           DN 7.U8
37dSrc1           DN 8.U8
38dSrc2           DN 9.U8
39dSrc3           DN 10.U8
40dSrc4           DN 11.U8
41dSrc5           DN 12.U8
42dSrc6           DN 13.U8
43dSrc7           DN 14.U8
44dSrc8           DN 15.U8
45
;// Widened pair sums; dSumBE0 / dSumCD0 are the low halves of q8 / q9.
;// These registers are reused for every output row.
46qSumBE01        QN 8.S16
47qSumCD01        QN 9.S16
48dSumBE0         DN 16.S16
49dSumCD0         DN 18.S16
50
;// 16-bit accumulators, one Q register per output row
51qAcc01          QN 0.S16
52qAcc23          QN 1.S16
53qAcc45          QN 2.S16
54qAcc67          QN 3.S16
55
;// Low halves of the accumulators above, viewed as S16 for the multiply steps
56dRes0           DN 0.S16
57dRes1           DN 2.S16
58dRes2           DN 4.S16
59dRes3           DN 6.S16
60
;// Same D registers viewed as U8: they receive the narrowed final results
61dAcc0           DN 0.U8
62dAcc1           DN 2.U8
63dAcc2           DN 4.U8
64dAcc3           DN 6.U8
65
66
;// Temporaries for the dCoeff20*(c+d) products
;// (dTmp3 is only referenced by commented-out code)
67dTmp0           DN 20.S16
68dTmp1           DN 21.S16
69dTmp2           DN 22.S16
70dTmp3           DN 23.S16
71
72
;// ---- Load the nine source rows, interleaved with the first arithmetic ----
73        VLD1        dSrc0, [pSrc], srcStep     ;// row 0 [a0 a1 a2 a3 .. ]; pSrc now -> row 1
74        ADD         Temp, pSrc, srcStep, LSL #2 ;// Temp = pSrc + 4*srcStep -> row 5
75        VLD1        dSrc1, [pSrc], srcStep     ;// row 1 [b0 b1 b2 b3 .. ]
76        ;// One cycle stall
77        VLD1        dSrc5, [Temp], srcStep     ;// row 5
78        ;// One cycle stall
79        VLD1        dSrc2, [pSrc], srcStep     ;// row 2 [c0 c1 c2 c3 .. ]
80        VADDL       qAcc01, dSrc0, dSrc5       ;// out row 0: Acc = a+f (last read of dSrc0)
81        VLD1        dSrc3, [pSrc], srcStep     ;// row 3
82        ;// One cycle stall
83        VLD1        dSrc6, [Temp], srcStep ;// row 6 (TeRi)
84
85        VLD1        dSrc4, [pSrc], srcStep     ;// row 4
86        VLD1        dSrc7, [Temp], srcStep ;// row 7 (TeRi)
;// ---- Output row 0: a..f = source rows 0..5 ----
87        VADDL       qSumBE01, dSrc1, dSrc4     ;// b+e
88        VADDL       qSumCD01, dSrc2, dSrc3     ;// c+d
89        VLD1        dSrc8, [Temp], srcStep ;// row 8 (TeRi)
90        VMLS        dRes0, dSumBE0, dCoeff5    ;// Acc -= dCoeff5*(b+e)
91;        VMLA        dRes0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;// The product goes to a temporary and is added to Acc further down, replacing
;// the commented-out VMLA (presumably for scheduling - NOTE(review): confirm).
92        VMUL        dTmp0, dSumCD0, dCoeff20   ;// Tmp0 = dCoeff20*(c+d)
93
;// ---- Output row 1: a..f = source rows 1..6 ----
94;        VLD1        dSrc6, [Temp], srcStep
95        VADDL       qSumBE01, dSrc2, dSrc5     ;// b+e
96        VADDL       qSumCD01, dSrc3, dSrc4     ;// c+d
97        VADDL       qAcc23, dSrc1, dSrc6       ;// Acc = a+f
98        VMLS        dRes1, dSumBE0, dCoeff5    ;// Acc -= dCoeff5*(b+e)
99;        VMLA        dRes1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
100        VMUL        dTmp1, dSumCD0, dCoeff20   ;// Tmp1 = dCoeff20*(c+d)
101
;// ---- Output row 2: a..f = source rows 2..7 ----
102;        VLD1        dSrc7, [Temp], srcStep
103        VADDL       qSumBE01, dSrc3, dSrc6     ;// b+e
104        VADDL       qSumCD01, dSrc4, dSrc5     ;// c+d
105        VADDL       qAcc45, dSrc2, dSrc7       ;// Acc = a+f
106        VMLS        dRes2, dSumBE0, dCoeff5    ;// Acc -= dCoeff5*(b+e)
107;        VMLA        dRes2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
108        VMUL        dTmp2, dSumCD0, dCoeff20   ;// Tmp2 = dCoeff20*(c+d)
109
;// ---- Output row 3: a..f = source rows 3..8 ----
;// qAcc67 is q3 (d6:d7) and therefore overwrites dSrc0 (d7); dSrc0 is not read
;// again after the very first VADDL above.
110;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]
111        VADDL       qSumBE01, dSrc4, dSrc7     ;// b+e
112        VADDL       qAcc67, dSrc3, dSrc8       ;// Acc = a+f
113        VADDL       qSumCD01, dSrc5, dSrc6     ;// c+d
114        VMLS        dRes3, dSumBE0, dCoeff5    ;// Acc -= dCoeff5*(b+e)
115        VADD        dRes0, dRes0, dTmp0        ;// row 0: Acc += Tmp0
116        VADD        dRes1, dRes1, dTmp1        ;// row 1: Acc += Tmp1
117        VADD        dRes2, dRes2, dTmp2        ;// row 2: Acc += Tmp2
118        VMLA        dRes3, dSumCD0, dCoeff20   ;// row 3: Acc += dCoeff20*(c+d)
119;        VMUL        dTmp3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
120;        VADD        dRes3, dRes3, dTmp3
121
;// ---- Narrow each 16-bit accumulator to U8: rounding, saturating >> 5 ----
;// Each narrow reads the whole Q register, but only its low D half received
;// the multiply terms above.
122        VQRSHRUN    dAcc0, qAcc01, #5          ;// output row 0 -> d0
123        VQRSHRUN    dAcc1, qAcc23, #5          ;// output row 1 -> d2
124        VQRSHRUN    dAcc2, qAcc45, #5          ;// output row 2 -> d4
125        VQRSHRUN    dAcc3, qAcc67, #5          ;// output row 3 -> d6
126
127        M_END
128
129    ENDIF
130
131
132
133    END
134
135