armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;//
3;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   9641
6;// Date:       Thursday, February 7, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
12
13        INCLUDE omxtypes_s.h
14        INCLUDE armCOMM_s.h
15
16        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
17
18        M_VARIANTS ARM1136JS
19
20
21
22    IF ARM1136JS
23
24
          ;// Stack locals for the routine below. M_ALLOC4 / M_ALLOC8 come from armCOMM_s.h
          ;// (not visible here); presumably they reserve 4- / 8-byte aligned slots of the
          ;// given size that M_STR/M_LDR/M_STRD/M_LDRD can address by name - TODO confirm.
25        M_ALLOC8 ppDstArgs, 8                           ;// caller's pDst/dstStep pair, saved at entry, reloaded before pass 2
26        M_ALLOC8 pTempResult1, 8                        ;// Acc0/Acc1 of the previous (even-counter) pass-1 iteration
27        M_ALLOC8 pTempResult2, 8                        ;// Acc2/Acc3 of the previous (even-counter) pass-1 iteration
28        M_ALLOC4 ppSrc, 4                               ;// not referenced in this routine
29        M_ALLOC4 ppDst, 4                               ;// not referenced in this routine
30        M_ALLOC4 pDstStep, 4                            ;// pass-1 store stride (16); r3 doubles as Temp1
31        M_ALLOC4 pSrcStep, 4                            ;// source stride; r1 doubles as ValI/Temp2
32        M_ALLOC4 pCounter, 4                            ;// pass-2 loop counter; r11 doubles as ValHF and r0x0001fc00
33
34        ;// Function header
35        ;// Function:
36        ;//     armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
37        ;//
38        ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
39        ;// be aligned.
40        ;//
          ;// The work is done in two passes:
          ;//   Pass 1 applies the 6-tap filter (1, -5, 20, 20, -5, 1) along each row, two rows
          ;//          per iteration, for 10 source rows (5 iterations). Each result is kept as a
          ;//          16-bit value biased by 16*255 and written to the intermediate buffer
          ;//          (16-byte stride, 80 bytes written in total).
          ;//   Pass 2 applies the same 6-tap filter down each of the 4 columns of the intermediate
          ;//          buffer, removes the bias, rounds, saturates and stores 4 output bytes per
          ;//          column.
          ;//
41        ;// Registers used as input for this function
42        ;// r0 = pSrc, r1 = srcStep, r2 = pDst, r3 = dstStep, r8 = pInterBuf (intermediate-buf pointer)
43        ;//
44        ;// Registers preserved for top level function
45        ;// r0,r1,r2,r3,r4,r5,r6,r14
46        ;//
47        ;// Registers modified by the function
48        ;// r7,r8,r9,r10,r11,r12
49        ;//
50        ;// Output registers
51        ;// None. Function will preserve r0-r3
          ;//
          ;// NOTE(review): in the code below r2/r3 are reloaded from ppDstArgs and r2 is wound back
          ;// at End, but r0 is left pointing at the intermediate buffer base and r1 is left at 16.
          ;// Unless M_END restores them, "preserve r0-r3" does not hold for r0/r1 - confirm against
          ;// the callers.
52
53        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r6
54
55;// Declare input registers
56pSrc            RN 0
57srcStep         RN 1
58pDst            RN 2
59dstStep         RN 3
60
61;// Declare inner loop registers
          ;// Acc0-Acc3 share r4-r7 with ValA-ValD: each accumulator is written only by the
          ;// last instruction that reads the Val register it overlays.
62Acc0            RN 4
63Acc1            RN 5
64Acc2            RN 6
65Acc3            RN 7
66
67ValA            RN 4
68ValB            RN 5
69ValC            RN 6
70ValD            RN 7
71ValE            RN 8
72ValF            RN 9
73ValG            RN 12
74ValH            RN 14
75ValI            RN 1
76
77Temp1           RN 3
78Temp2           RN 1
79Temp3           RN 12
80Temp4           RN 7
81Temp5           RN 5
82r0x0fe00fe0     RN 3                                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]; not referenced below
83r0x00ff00ff     RN 10                                   ;// [0 255 0 255] where 255 is offset
84Counter         RN 11
85pInterBuf       RN 8
86
          ;// Pass-2 aliases: packed pairs of intermediate rows and the packed tap constants
87ValCA           RN 8
88ValDB           RN 9
89ValGE           RN 10
90ValHF           RN 11
91r0x00140001     RN 12
92r0x0014fffb     RN 14
93
94r0x0001fc00     RN 11
95
96Accx            RN 8
97Accy            RN 9
98Temp6           RN 14
99
          ;// Save the caller's destination pointer and stride; r2/r3 are retargeted to the
          ;// intermediate buffer for pass 1.
100        M_STRD      pDst, dstStep, ppDstArgs
101
102        MOV         pDst, pInterBuf
103        MOV         dstStep, #16
104
105        ;// Counter = 4; the loop below repeats while the decremented counter is >= 0,
          ;// i.e. 5 iterations of two rows each (10 source rows)
106        MOV         Counter, #4
107        M_STR       dstStep, pDstStep
108        M_STR       srcStep, pSrcStep
109        LDR         r0x00ff00ff, =0x00ff00ff               ;// [0 255 0 255] 255 is offset to avoid negative results
110
          ;// ---- Pass 1: filter along the rows, two rows per iteration ----
          ;// Both labels below mark the same address.
111HeightLoop
112NextTwoRowsLoop
113        LDR     ValD, [pSrc, srcStep]                   ;// Load row 1 [d1 c1 b1 a1]
114        LDR     ValA, [pSrc], #4                        ;// Load row 0 [d0 c0 b0 a0]
115        LDR     ValH, [pSrc, srcStep]                   ;// Load  [h1 g1 f1 e1]
116        LDR     ValE, [pSrc], #4                        ;// Load  [h0 g0 f0 e0]
117        LDRB    Temp2, [pSrc, srcStep]                  ;// Load row 1 byte i1 only
118        LDRB    Temp1, [pSrc], #-8                      ;// Load row 0 byte i0 only; rewind pSrc to the row start
119
          ;// Interleave the two rows so each register holds the same columns of row 0 (low
          ;// halfword side) and row 1 (high halfword side).
120        PKHBT   ValB, ValA, ValD, LSL #16               ;// [b1 a1 b0 a0]
121        PKHTB   ValD, ValD, ValA, ASR #16               ;// [d1 c1 d0 c0]
122        UXTAB16 ValA, r0x00ff00ff, ValB                 ;// [00 a1 00 a0] + [0 255 0 255]
123        UXTAB16 ValC, r0x00ff00ff, ValD                 ;// [00 c1 00 c0] + [0 255 0 255]
124        PKHBT   ValI, Temp1, Temp2, LSL #16             ;// [00 i1 00 i0]
125        PKHBT   ValF, ValE, ValH, LSL #16               ;// [f1 e1 f0 e0]
126        PKHTB   ValH, ValH, ValE, ASR #16               ;// [h1 g1 h0 g0]
127        UXTAB16 ValE, r0x00ff00ff, ValF                 ;// [00 e1 00 e0] + [0 255 0 255]
128
129        ;// Calculate Acc0
130        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f   (plus 16*255 bias in each halfword)
131        UXTAB16 Temp1, ValC, ValD, ROR #8               ;// c + d + 255
132        UXTAB16 Temp3, ValE, ValB, ROR #8               ;// e + b + 255
133        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(c + d) - (b + e) + 3*255
134        UXTAB16 Acc0, ValA, ValF, ROR #8                ;// a + f + 255 (overwrites ValA)
135        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
136        ADD     Acc0, Acc0, Temp1
137
138        ;// Calculate Acc1
139        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g   (plus 16*255 bias in each halfword)
140        UXTAB16 Temp1, ValE, ValD, ROR #8               ;// e + d + 255
141        UXTAB16 Temp3, ValC, ValF, ROR #8               ;// c + f + 255
142        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(d + e) - (c + f) + 3*255
143        UXTAB16 ValG, r0x00ff00ff, ValH                 ;// [00 g1 00 g0] + [0 255 0 255] (r12: Temp3 is dead here)
144        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
145        UXTAB16 Acc1, ValG, ValB, ROR #8                ;// g + b + 255 (overwrites ValB)
146        ADD     Acc1, Acc1, Temp1
147
148        UXTAB16 Acc2, ValC, ValH, ROR #8                ;// c + h + 255 (overwrites ValC)
149        ADD     ValI, r0x00ff00ff, ValI                 ;// [00 i1 00 i0] + [0 255 0 255]
150
151        ;// Calculate Acc2
152        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h   (plus 16*255 bias in each halfword)
153        UXTAB16 Temp1, ValG, ValD, ROR #8               ;// g + d + 255
154        UXTAB16 Acc3, ValI, ValD, ROR #8                ;// i + d + 255, taken now: r1 (ValI) and r7 (ValD) are about to be reused
155        UXTAB16 Temp2, ValE, ValF, ROR #8               ;// e + f + 255
156
157        RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(e + f) - (d + g) + 3*255
158        UXTAB16 Temp2, ValG, ValF, ROR #8               ;// g + f + 255, used for Acc3 below
159        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
160        ADD     Acc2, Acc2, Temp1
161
162        ;// Calculate Acc3
163        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i   (plus 16*255 bias in each halfword)
164        UXTAB16 Temp1, ValE, ValH, ROR #8               ;// e + h + 255
165        RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(f + g) - (e + h) + 3*255
166        ADD     Temp1, Temp1, Temp1, LSL #2             ;// times 5
167        ADD     Acc3, Acc3, Temp1
168
          ;// r3 and r1 were used as Temp1/Temp2 above; reload the strides
169        M_LDR   dstStep, pDstStep
170        M_LDR   srcStep, pSrcStep
171
172        ;// If Counter is even store Acc0-Acc3 in a temporary buffer
173        ;// If Counter is odd store Acc0-Acc3 and previous Acc0-Acc3 in the intermediate buf
174        ANDS        Temp3, Counter, #1
175        BEQ         NoProcessing
176
177        ;// Packing previous and current Acc0-Acc3 values.
          ;// In the comments below the letter is the accumulator (a = Acc0 ... d = Acc3) and the
          ;// digit is the row within this group of four rows. Even rows go to the buffer line at
          ;// pDst, odd rows to the line at pDst + dstStep.
178        M_LDRD      Accx, Accy, pTempResult1
179        PKHBT       Temp6, Accx, Acc0, LSL #16          ;//[0 a2 0 a0] = [0 a3 0 a2] [0 a1 0 a0]
180        PKHTB       Acc0, Acc0, Accx, ASR #16           ;//[0 a3 0 a1] = [0 a1 0 a0] [0 a3 0 a2]
181        STR         Acc0, [pDst, dstStep]
182        STR         Temp6, [pDst], #4
183        PKHBT       Temp6, Accy, Acc1, LSL #16          ;//[0 b2 0 b0] = [0 b3 0 b2] [0 b1 0 b0]
184        PKHTB       Acc1, Acc1, Accy, ASR #16            ;//[0 b3 0 b1] = [0 b1 0 b0] [0 b3 0 b2]
185        M_LDRD      Accx, Accy, pTempResult2
186        STR         Acc1, [pDst, dstStep]
187        STR         Temp6, [pDst], #4
188
189        PKHBT       Temp6, Accx, Acc2, LSL #16          ;//[0 c2 0 c0] = [0 c3 0 c2] [0 c1 0 c0]
190        PKHTB       Acc2, Acc2, Accx, ASR #16            ;//[0 c3 0 c1] = [0 c1 0 c0] [0 c3 0 c2]
191        STR         Acc2, [pDst, dstStep]
192        STR         Temp6, [pDst], #4
193        PKHBT       Temp6, Accy, Acc3, LSL #16          ;//[0 d2 0 d0] = [0 d3 0 d2] [0 d1 0 d0]
194        PKHTB       Acc3, Acc3, Accy, ASR #16            ;//[0 d3 0 d1] = [0 d1 0 d0] [0 d3 0 d2]
195        STR         Acc3, [pDst, dstStep]
196        STR         Temp6, [pDst], #-12                 ;// back to the start of the buffer line
197        ADD         pDst, pDst, dstStep, LSL #1         ;// advance two buffer lines
198        B           AfterStore
199
200NoProcessing
          ;// Even Counter: keep this row pair until the next iteration
201        M_STRD      Acc0, Acc1, pTempResult1
202        M_STRD      Acc2, Acc3, pTempResult2
203AfterStore
204        SUBS        Counter, Counter, #1                ;// Loop till height is 10
205        ADD         pSrc, pSrc, srcStep, LSL #1         ;// next two source rows (ADD leaves the flags from SUBS intact)
206        BPL         HeightLoop
207
          ;// The last iteration ran with Counter = 0 (even), so its row pair is still only in
          ;// Acc0-Acc3; store it unpacked as the fifth buffer line.
208        STR         Acc0, [pDst], #4                    ;//[0 a1 0 a0] of the last row pair
209        STR         Acc1, [pDst], #4
210        STR         Acc2, [pDst], #4
211        STR         Acc3, [pDst], #-12
212
213        ;//
214        ;// Pass 2: filter down the columns of the intermediate buffer using multiplication
215        ;//
216
217        SUB         pSrc, pDst, dstStep, LSL #2         ;// pSrc = start of the intermediate buffer (4 lines back)
218        MOV         srcStep, #16                        ;// intermediate buffer line stride
219        M_LDRD      pDst, dstStep, ppDstArgs            ;// restore the caller's destination pointer and stride
220
221        MOV         Counter, #4                         ;// one iteration per output column
222        LDR         r0x0014fffb, =0x0014fffb            ;// packed taps [20, -5]
223        LDR         r0x00140001, =0x00140001            ;// packed taps [20, 1]
224
          ;// NOTE: despite its name this loop walks the 4 output columns; each iteration
          ;// produces the 4 output rows of one column.
225HeightLoop1
226        M_STR       Counter, pCounter                   ;// r11 is reused as ValHF and r0x0001fc00 below
227
          ;// a..i denote rows 0..8 of the current column of the intermediate buffer
228        M_LDR       ValCA, [pSrc], srcStep               ;// Load  [0 c 0 a]
229        M_LDR       ValDB, [pSrc], srcStep               ;// Load  [0 d 0 b]
230        M_LDR       ValGE, [pSrc], srcStep               ;// Load  [0 g 0 e]
231        M_LDR       ValHF, [pSrc], srcStep               ;// Load  [0 h 0 f]
232
233
          ;// Each accumulator is built from dual and single 16-bit multiplies:
234        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
235        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
236        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
237        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
238
239        SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = a*1 + c*20
240        SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = b*1 + d*20
241        SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = e*20 + g*(-5)  (halfwords crossed)
242        SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = e*(-5) + g*20
243
244        SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += b*(-5) + d*20
245        SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += e*20 + g*1     (halfwords crossed)
246        SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += f*20 + h*1     (halfwords crossed)
247        SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += f*20 + h*(-5)  (halfwords crossed)
248
249        SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += e*(-5)
250        SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += c*(-5)
251        SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c*1
252        SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d*1
253
254        LDRH        ValCA, [pSrc], #4                   ;// Load i from the fifth buffer line; +4 steps to the next column
255        SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += f*1
256        SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += f*(-5)
257        SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += d*(-5)
258        SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i*1
259
          ;// Remove the accumulated bias (16*255 per pass-1 value, times the tap sum 32) while
          ;// leaving +512 in place as the rounding term for the shift by 10 below.
260        LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512
261        SUB         Acc0, Acc0, r0x0001fc00
262        SUB         Acc1, Acc1, r0x0001fc00
263        SUB         Acc2, Acc2, r0x0001fc00
264        SUB         Acc3, Acc3, r0x0001fc00
265
          ;// Clamp to 18 unsigned bits so the value is in 0..255 after the shift by 10
266        USAT        Acc0, #18, Acc0
267        USAT        Acc1, #18, Acc1
268        USAT        Acc2, #18, Acc2
269        USAT        Acc3, #18, Acc3
270
          ;// Store the four results down the current output column
271        MOV         Acc0, Acc0, LSR #10
272        M_STRB      Acc0, [pDst], dstStep
273        MOV         Acc1, Acc1, LSR #10
274        M_STRB      Acc1, [pDst], dstStep
275        MOV         Acc2, Acc2, LSR #10
276        M_STRB      Acc2, [pDst], dstStep
277        MOV         Acc3, Acc3, LSR #10
278        M_STRB      Acc3, [pDst], dstStep
279
280
281        M_LDR       Counter, pCounter
282        SUB         pDst, pDst, dstStep, LSL #2         ;// back to the top output row
283        SUB         pSrc, pSrc, srcStep, LSL #2         ;// back to the first buffer line (already advanced by 4 bytes)
284        ADD         pDst, pDst, #1                      ;// next output column
285        SUBS        Counter, Counter, #1
286        BGT         HeightLoop1
287End
          ;// Undo the 4 column steps taken by pass 2: pDst returns to its entry value,
          ;// pSrc returns to the start of the intermediate buffer.
288        SUB         pDst, pDst, #4
289        SUB         pSrc, pSrc, #16
290
291        M_END
292
293    ENDIF
294
295    END
296
297