armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
        ;// M_VARIANTS is a macro that is not defined in this file (presumably it
        ;// comes from armCOMM_s.h - confirm). ARM1136JS is the only variant named
        ;// here, and the function body below is wrapped in "IF ARM1136JS".
30        M_VARIANTS ARM1136JS
31
        ;// Export the routine's entry symbol so other objects can link against it.
32        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
33
        ;// DEBUG_ON is set to false and is not referenced again in the visible
        ;// part of this file; presumably it is read by the included macros - confirm.
34DEBUG_ON    SETL {FALSE}
35
36
37    IF ARM1136JS
38
39;// Function:
40;//     armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
41;//
42;// Implements horizontal 6-tap (1, -5, 20, 20, -5, 1) interpolation for a block of
43;// size 4x4. Input and output should be aligned (rows are read with word loads).
;//
;// For each of the 4 rows, 9 source bytes a..i are read starting at pSrc and
;// 4 result bytes are written at pDst:
;//     out0 = clamp((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5, 0, 255)
;//     out1 = clamp((b - 5*c + 20*d + 20*e - 5*f + g + 16) >> 5, 0, 255)
;//     out2 = clamp((c - 5*d + 20*e + 20*f - 5*g + h + 16) >> 5, 0, 255)
;//     out3 = clamp((d - 5*e + 20*f + 20*g - 5*h + i + 16) >> 5, 0, 255)
;// Two rows are processed per loop iteration, one per 16-bit lane of each
;// register (row 0 in the low halfword, row 1 in the high halfword). Bracket
;// comments such as [d0 c0 b0 a0] list the least significant byte rightmost;
;// the suffix is the row within the pair. NOTE(review): this notation assumes
;// little-endian word loads - confirm for the target configuration.
44;//
45;// Registers used as input for this function
46;// r0 = pSrc, r1 = srcStep, r2 = pDst, r3 = dstStep (steps are byte offsets)
47;//
48;// Registers preserved for top level function
49;// r0,r1,r2,r3,r4,r5,r6,r14
;//   r0/r2 are advanced inside the loop and rewound after it. r1/r3 are used
;//   as temporaries and reloaded from the pSrcStep/pDstStep local slots.
;//   r4-r6 and r14 are overwritten inside the loop; restoring them is
;//   presumably done by "M_START ..., r6" / M_END (macros not defined in this
;//   file - confirm).
50;//
51;// Registers modified by the function
52;// r7,r8,r9,r10,r11,r12
53;//
54;// Output registers
55;// None. Function will preserve r0-r3
56
57
58;// Declare input registers
59pSrc            RN 0
60srcStep         RN 1
61pDst            RN 2
62dstStep         RN 3
63
64;// Declare inner loop registers
;// Many of the names below are aliases for the same physical register
;// (e.g. Acc0/ValA = r4, Acc1/ValB/Temp5 = r5, Acc2/ValC = r6,
;// Acc3/ValD/Temp4 = r7, ValG/Temp3 = r12, ValI/Temp2/srcStep = r1,
;// Temp1/r0x0fe00fe0/Height/dstStep = r3). A name is only valid until its
;// register is next written under another name, so the instruction order in
;// the loop must not be changed without re-checking every alias.
65Acc0            RN 4
66Acc1            RN 5
67Acc2            RN 6
68Acc3            RN 7
69
70ValA            RN 4
71ValB            RN 5
72ValC            RN 6
73ValD            RN 7
74ValE            RN 8
75ValF            RN 9
76ValG            RN 12
77ValH            RN 14
78ValI            RN 1
79
80Temp1           RN 3
81Temp2           RN 1
82Temp3           RN 12
83Temp4           RN 7                                    ;// declared but not used by any instruction below
84Temp5           RN 5
85r0x0fe00fe0     RN 3                                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]
86r0x00ff00ff     RN 10                                   ;// [0 255 0 255] where 255 is offset
87Counter         RN 11
88
89Height          RN 3                                    ;// declared but not used by any instruction below
90
        ;// Two 4-byte local slots that hold dstStep and srcStep while r3 and r1
        ;// are reused as temporaries (M_ALLOC4 is a macro not defined in this file).
91        M_ALLOC4 pDstStep, 4
92        M_ALLOC4 pSrcStep, 4
93
94        ;// Function header
95        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r6
96
97        MOV     Counter, #2                             ;// 2 iterations x 2 rows = 4 rows
98        M_STR   dstStep, pDstStep                       ;// save r3; it is reused as Temp1 / constant
99        M_STR   srcStep, pSrcStep                       ;// save r1; it is reused as Temp2 / ValI
100        LDR     r0x00ff00ff, =0x00ff00ff               ;// [0 255 0 255] 255 is offset to avoid negative results
101
102NextTwoRowsLoop
        ;// Read bytes a..i of row 0 (at pSrc) and of row 1 (at pSrc + srcStep).
        ;// pSrc is post-incremented by 4 twice, then rewound by 8 on the last load.
103        LDR     ValD, [pSrc, srcStep]                   ;// Load row 1 [d1 c1 b1 a1]
104        LDR     ValA, [pSrc], #4                        ;// Load row 0 [d0 c0 b0 a0]
105        LDR     ValH, [pSrc, srcStep]                   ;// Load  [h1 g1 f1 e1]
106        LDR     ValE, [pSrc], #4                        ;// Load  [h0 g0 f0 e0]
107        LDRB    Temp2, [pSrc, srcStep]                  ;// Load byte i1 of row 1; overwrites srcStep (r1)
108        LDRB    Temp1, [pSrc], #-8                      ;// Load byte i0 of row 0; pSrc back to a0
109
        ;// Interleave the two rows so each register holds the same pixel pair
        ;// for both rows, and pre-add the 255 offset to a, c and e.
110        PKHBT   ValB, ValA, ValD, LSL #16               ;// [b1 a1 b0 a0]
111        PKHTB   ValD, ValD, ValA, ASR #16               ;// [d1 c1 d0 c0]
112        UXTAB16 ValA, r0x00ff00ff, ValB                 ;// [00 a1 00 a0] + [0 255 0 255]
113        UXTAB16 ValC, r0x00ff00ff, ValD                 ;// [00 c1 00 c0] + [0 255 0 255]
114        PKHBT   ValI, Temp1, Temp2, LSL #16             ;// [00 i1 00 i0]
115        PKHBT   ValF, ValE, ValH, LSL #16               ;// [f1 e1 f0 e0]
116        PKHTB   ValH, ValH, ValE, ASR #16               ;// [h1 g1 h0 g0]
117        UXTAB16 ValE, r0x00ff00ff, ValF                 ;// [00 e1 00 e0] + [0 255 0 255]
118
119        ;// Calculate Acc0
120        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
        ;// Built as (a + f) + 5*(4*(c + d) - (b + e)). The 255 offsets carried by
        ;// ValA/ValC/ValE sum to 255 + 5*(4*255 - 255) = 16*255 in each lane, which
        ;// keeps every lane non-negative so plain 32-bit RSB/ADD can be used.
121        UXTAB16 Temp1, ValC, ValD, ROR #8               ;// (c + 255) + d
122        UXTAB16 Temp3, ValE, ValB, ROR #8               ;// (e + 255) + b
123        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(c + d) - (e + b) + 3*255
124        UXTAB16 Acc0, ValA, ValF, ROR #8                ;// (a + 255) + f; overwrites ValA (r4)
125        ADD     Temp1, Temp1, Temp1, LSL #2             ;// multiply by 5
126        ADD     Acc0, Acc0, Temp1
127
128        ;// Calculate Acc1
129        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
130        UXTAB16 Temp1, ValE, ValD, ROR #8               ;// (e + 255) + d
131        UXTAB16 Temp3, ValC, ValF, ROR #8               ;// (c + 255) + f
132        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(d + e) - (c + f) + 3*255
133        UXTAB16 ValG, r0x00ff00ff, ValH                 ;// [00 g1 00 g0] + [0 255 0 255]
134        ADD     Temp1, Temp1, Temp1, LSL #2             ;// multiply by 5
135        UXTAB16 Acc1, ValG, ValB, ROR #8                ;// (g + 255) + b; overwrites ValB (r5)
136        ADD     Acc1, Acc1, Temp1
137
        ;// Remove the offset from Acc0/Acc1 while leaving +16 for rounding, then
        ;// clamp. UQSUB16 saturates at 0; USAT16 #13 caps each lane at 8191, which
        ;// becomes 255 after the later shift right by 5.
138        LDR     r0x0fe00fe0, =0x0fe00fe0                ;// 0x0fe00fe0 = (16 * Offset) - 16 where Offset is 255
139        UXTAB16 Acc2, ValC, ValH, ROR #8                ;// (c + 255) + h, start of Acc2; overwrites ValC (r6)
140        ADD     ValI, r0x00ff00ff, ValI                 ;// [00 i1 00 i0] + [0 255 0 255]
141        UQSUB16 Acc0, Acc0, r0x0fe00fe0
142        UQSUB16 Acc1, Acc1, r0x0fe00fe0
143        USAT16  Acc0, #13, Acc0
144        USAT16  Acc1, #13, Acc1
145
146        ;// Calculate Acc2
147        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
        ;// The scaling and packing of Acc0/Acc1 and the first terms of Acc3 are
        ;// interleaved with this computation.
148        UXTAB16 Temp1, ValG, ValD, ROR #8               ;// (g + 255) + d; overwrites the constant in r3
149        UXTAB16 Acc3, ValI, ValD, ROR #8                ;// (i + 255) + d, start of Acc3; overwrites ValD (r7)
150        UXTAB16 Temp2, ValE, ValF, ROR #8               ;// (e + 255) + f; overwrites ValI (r1)
151        AND     Acc1, r0x00ff00ff, Acc1, LSR #5          ;// out1 as one byte per lane
152        AND     Acc0, r0x00ff00ff, Acc0, LSR #5          ;// out0 as one byte per lane
153        ORR     Acc0, Acc0, Acc1, LSL #8                ;// [out1 out0 (row 1) out1 out0 (row 0)]
154        RSB     Temp5, Temp1, Temp2, LSL #2             ;// 4*(e + f) - (g + d) + 3*255; overwrites Acc1 (r5)
155        UXTAB16 Temp2, ValG, ValF, ROR #8               ;// (g + 255) + f, used for Acc3
156        ADD     Temp5, Temp5, Temp5, LSL #2             ;// multiply by 5
157        ADD     Acc2, Acc2, Temp5
158
159        ;// Calculate Acc3
160        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
161        UXTAB16 Temp5, ValE, ValH, ROR #8               ;// (e + 255) + h
162        RSB     Temp5, Temp5, Temp2, LSL #2             ;// 4*(f + g) - (e + h) + 3*255
163        LDR     r0x0fe00fe0, =0x0fe00fe0                ;// reload: r3 was overwritten as Temp1
164        ADD     Temp5, Temp5, Temp5, LSL #2             ;// multiply by 5
165        ADD     Acc3, Acc3, Temp5
166
        ;// Same offset removal and clamp as for Acc0/Acc1.
167        UQSUB16 Acc3, Acc3, r0x0fe00fe0
168        UQSUB16 Acc2, Acc2, r0x0fe00fe0
169        USAT16  Acc3, #13, Acc3
170        USAT16  Acc2, #13, Acc2
171
172        M_LDR   dstStep, pDstStep                       ;// restore r3; the constant is no longer needed
173        AND     Acc3, r0x00ff00ff, Acc3, LSR #5          ;// out3 as one byte per lane
174        AND     Acc2, r0x00ff00ff, Acc2, LSR #5          ;// out2 as one byte per lane
175        ORR     Acc2, Acc2, Acc3, LSL #8                ;// [out3 out2 (row 1) out3 out2 (row 0)]
176
        ;// SUBS sets the flags consumed by BGT below; the instructions in between
        ;// must leave the flags untouched (M_LDR/M_STR are macros - confirm).
177        SUBS    Counter, Counter, #1
178        M_LDR   srcStep, pSrcStep                       ;// restore r1
179        PKHBT   Acc1, Acc0, Acc2, LSL #16               ;// row 0: [out3 out2 out1 out0]
180        M_STR   Acc1, [pDst], dstStep                   ;// Store row 0 result, pDst += dstStep
181        PKHTB   Acc2, Acc2, Acc0, ASR #16               ;// row 1: [out3 out2 out1 out0]
182        M_STR   Acc2, [pDst], dstStep                   ;// Store row 1 result, pDst += dstStep
183        ADD     pSrc, pSrc, srcStep, LSL #1             ;// advance to the next pair of rows
184
185        BGT     NextTwoRowsLoop
186End
        ;// Undo the four row advances so r0 and r2 hold their entry values again.
187        SUB     pDst, pDst, dstStep, LSL #2
188        SUB     pSrc, pSrc, srcStep, LSL #2
189
190        M_END
191
192    ENDIF
193
194    END
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254