;// armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe



    IF ARM1136JS

        ;// Function header

        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ;//
        ;// Implements vertical interpolation for a block of size 4x4, i.e. the
        ;// H.264 half-pel 6-tap filter (1,-5,20,20,-5,1) applied down each
        ;// column, with rounding (+16) and a final >>5 plus clip to [0,255].
        ;// Input and output should be aligned (each 4-pixel row is accessed as
        ;// one 32-bit word).
        ;//
        ;// Registers used as input for this function
        ;// r0,r1,r2,r3 where r0,r2  input pointer and r1,r3 corresponding step size
        ;//
        ;// Registers preserved for top level function
        ;// r0,r1,r2,r3,r4,r5,r6,r14
        ;//
        ;// Registers modified by the function
        ;// r7,r8,r9,r10,r11,r12
        ;//
        ;// Output registers
        ;// None. Function will preserve r0-r3
        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers.
;// NOTE: the physical registers are heavily aliased — several symbolic names
;// share one register, and each value is consumed before its register is
;// reused for the next name. The six source rows feeding one output row are
;// named A,B,C,D,E,F (and B..G for the second output row); suffix 0 holds the
;// even bytes and suffix 1 the odd bytes, each unpacked to 16-bit halfwords.
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Temp            RN 7
Height          RN 3
Step            RN 6

Counter         RN 8
r0x00ff00ff     RN 9                                        ;// [0 255 0 255] where 255 is offset
r0x0fe00fe0     RN 10                                       ;// [0 (16*255 - 16) 0 (16*255 - 16)]


        ;// Each UXTAB16 with r0x00ff00ff as the base adds a +255 bias per
        ;// halfword, so all packed 16-bit intermediates stay non-negative.
        ;// The accumulated bias over the six taps is 16*255; subtracting
        ;// (16*255 - 16) afterwards removes it and leaves +16 for rounding.
        LDR         r0x00ff00ff, =0x00ff00ff                ;// [0 255 0 255] 255 is offset to avoid negative results
        MOV         Counter, #2                             ;// 2 iterations x 2 output rows = 4 rows

TwoRowsLoop
        ;// Row 1 of this pair: out = (A + F + 20*(C+D) - 5*(B+E) + 16) >> 5
        ;// Row 2 of this pair: out = (B + G + 20*(D+E) - 5*(C+F) + 16) >> 5
        M_LDR       ValC, [pSrc], srcStep                   ;// Load  [c3 c2 c1 c0]
        M_LDR       ValD, [pSrc], srcStep                   ;// Load  [d3 d2 d1 d0]
        M_LDR       ValE, [pSrc], srcStep                   ;// Load  [e3 e2 e1 e0]
        SUB         pSrc, pSrc, srcStep, LSL #2             ;// rewind to one row above C
        LDR         ValB, [pSrc]                            ;// Load  [b3 b2 b1 b0]
        UXTAB16     ValC0, r0x00ff00ff, ValC                ;// [0 c2 0 c0] + [0 255 0 255]
        UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// [0 c3 0 c1] + [0 255 0 255]

        UXTAB16     ValE0, r0x00ff00ff, ValE                ;// [0 e2 0 e0] + [0 255 0 255]
        UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// [0 e3 0 e1] + [0 255 0 255]
        UXTAB16     ValCD0, ValC0, ValD                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
        UXTAB16     ValEB0, ValE0, ValB                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]
        RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)

        LDR         ValD, [pSrc, srcStep, LSL #1]           ;// Reload [d3 d2 d1 d0] (r10 was reused for ValEB0)
        UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
        RSB         ValCD1, ValEB1, ValCD1, LSL #2
        ;// One cycle stall
        UXTAB16     ValED0, ValE0, ValD                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]

        LDR         ValF, [pSrc, srcStep, LSL #2]           ;// Load  [f3 f2 f1 f0]
        M_LDR       ValB, [pSrc], srcStep                   ;// Load  [b3 b2 b1 b0]; pSrc now at row C
        ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD         ValCD1, ValCD1, ValCD1, LSL #2
        UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
        UXTAB16     ValCF0, ValC0, ValF                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
        RSB         ValED1, ValCF1, ValED1, LSL #2

        SUB         ValA, pSrc, srcStep, LSL #1             ;// address of row A (two rows above C)
        LDR         ValA, [ValA]                            ;// Load  [a3 a2 a1 a0]
        RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
        ADD         ValED1, ValED1, ValED1, LSL #2
        ADD         ValED0, ValED0, ValED0, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        UXTAB16     ValA0, r0x00ff00ff, ValA                ;// [0 a2 0 a0] + [0 255 0 255]
        UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// [0 a3 0 a1] + [0 255 0 255]
        UXTAB16     ValAF0, ValA0, ValF                     ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
        UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]

        LDR         r0x0fe00fe0, =0x0fe00fe0                ;// [0 (16*255-16) 0 (16*255-16)]; reloaded each pass since r10 doubles as ValD/ValF
        ADD         Acc1, ValCD1, ValAF1

        LDR         ValG, [pSrc, srcStep, LSL #2]           ;// Load  [g3 g2 g1 g0]
        ADD         Acc0, ValCD0, ValAF0                    ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        UQSUB16     Acc1, Acc1, r0x0fe00fe0                 ;// Acc1 -= (16*Off - 16); saturates negatives to 0
        UQSUB16     Acc0, Acc0, r0x0fe00fe0
        UXTAB16     ValG0, r0x00ff00ff, ValG                ;// [0 g2 0 g0] + [0 255 0 255]
        UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// [0 g3 0 g1] + [0 255 0 255]
        UXTAB16     ValGB0, ValG0, ValB                     ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
        UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
        ADD         Acc2, ValED0, ValGB0                    ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD         Acc3, ValED1, ValGB1
        UQSUB16     Acc3, Acc3, r0x0fe00fe0                 ;// Acc3 -= (16*Off - 16)
        UQSUB16     Acc2, Acc2, r0x0fe00fe0
        USAT16      Acc1, #13, Acc1                         ;// Saturate to 8+5 = 13 bits (clips high side before >>5)
        USAT16      Acc0, #13, Acc0
        USAT16      Acc3, #13, Acc3
        USAT16      Acc2, #13, Acc2
        AND         Acc1, r0x00ff00ff, Acc1, LSR #5         ;// [0 a3 0 a1]
        AND         Acc0, r0x00ff00ff, Acc0, LSR #5         ;// [0 a2 0 a0]
        ORR         Acc0, Acc0, Acc1, LSL #8                ;// [a3 a2 a1 a0]
        AND         Acc3, r0x00ff00ff, Acc3, LSR #5         ;// [0 b3 0 b1]
        AND         Acc2, r0x00ff00ff, Acc2, LSR #5         ;// [0 b2 0 b0]

        M_STR       Acc0, [pDst], dstStep                   ;// Store result & adjust pointer
        ORR         Acc2, Acc2, Acc3, LSL #8                ;// [b3 b2 b1 b0]
        M_STR       Acc2, [pDst], dstStep                   ;// Store result & adjust pointer
        ADD         pSrc, pSrc, srcStep, LSL #1             ;// advance 2 rows: next pass filters rows shifted down by 2

        SUBS        Counter, Counter, #1
        BGT         TwoRowsLoop
End
        ;// Rewind both pointers by 4 rows so r0-r3 are preserved as promised
        ;// in the function header (4 rows written / 2*2 rows advanced on src).
        SUB     pDst, pDst, dstStep, LSL #2
        SUB     pSrc, pSrc, srcStep, LSL #2

        M_END

    ENDIF

    END
199
200