1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27        INCLUDE omxtypes_s.h
28        INCLUDE armCOMM_s.h
29
30        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
31
32        M_VARIANTS ARM1136JS
33
34
35
36    IF ARM1136JS
37
38        ;// Named local storage slots; the code below accesses them with M_STR/M_LDR/M_STRD/M_LDRD (macros presumably from armCOMM_s.h)
39        M_ALLOC8 ppDstArgs, 8                   ;// caller's pDst/dstStep pair while r2/r3 address the intermediate buffer
40        M_ALLOC8 pTempResult1, 8                ;// Acc0/Acc1 kept from an even-Counter pass of the first loop
41        M_ALLOC8 pTempResult2, 8                ;// Acc2/Acc3 kept from an even-Counter pass of the first loop
42        M_ALLOC4 ppSrc, 4                       ;// not referenced by the code visible in this file
43        M_ALLOC4 ppDst, 4                       ;// not referenced by the code visible in this file
44        M_ALLOC4 pDstStep, 4                    ;// dstStep, reloaded each pass because r3 doubles as Temp1
45        M_ALLOC4 pSrcStep, 4                    ;// srcStep, reloaded each pass because r1 doubles as Temp2/ValI
46        M_ALLOC4 pCounter, 4                    ;// second-loop counter, spilled because r11 doubles as ValHF
47
48        ;// Function header
49        ;// Function:
50        ;//     armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
51        ;//
52        ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
53        ;// be aligned.
54        ;//
55        ;// Registers used as input for this function
56        ;// r0 = pSrc, r1 = srcStep, r2 = pDst, r3 = dstStep, r8 = pointer to the intermediate buffer
57        ;//
58        ;// Registers preserved for top level function
59        ;// r2,r3,r4,r5,r6,r14
60        ;//
61        ;// Registers modified by the function
62        ;// r0,r1,r7,r8,r9,r10,r11,r12
63        ;//
64        ;// Output registers
65        ;// r0 = start of the intermediate buffer, r1 = 16 (its line stride); r2,r3 hold the caller's pDst/dstStep again
66
67        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r6
68        ;// Two passes: HeightLoop filters 10 source rows along x into the 16-bit intermediate buffer; HeightLoop1 filters that buffer down each column and stores the 4x4 bytes at pDst.
69;// Declare input registers
70pSrc            RN 0
71srcStep         RN 1
72pDst            RN 2
73dstStep         RN 3
74        ;// Many names below alias the same physical register; each alias is only live in its own phase.
75;// Declare inner loop registers
76Acc0            RN 4
77Acc1            RN 5
78Acc2            RN 6
79Acc3            RN 7
80        ;// ValA-ValD share r4-r7 with Acc0-Acc3: each Acc overwrites the Val of the same register once that Val is dead
81ValA            RN 4
82ValB            RN 5
83ValC            RN 6
84ValD            RN 7
85ValE            RN 8
86ValF            RN 9
87ValG            RN 12
88ValH            RN 14
89ValI            RN 1                                    ;// shares r1 with srcStep and Temp2
90
91Temp1           RN 3                                    ;// shares r3 with dstStep
92Temp2           RN 1                                    ;// shares r1 with srcStep
93Temp3           RN 12
94Temp4           RN 7
95Temp5           RN 5
96r0x0fe00fe0     RN 3                                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]; not referenced below
97r0x00ff00ff     RN 10                                   ;// [0 255 0 255] where 255 is offset
98Counter         RN 11
99pInterBuf       RN 8                                    ;// copied to pDst before r8 is reused as ValE
100        ;// Second-pass registers (reuse r8-r12 and r14 once the first pass is finished)
101ValCA           RN 8
102ValDB           RN 9
103ValGE           RN 10
104ValHF           RN 11                                   ;// shares r11 with Counter
105r0x00140001     RN 12
106r0x0014fffb     RN 14
107
108r0x0001fc00     RN 11                                   ;// shares r11 with ValHF; loaded after the last use of ValHF
109
110Accx            RN 8                                    ;// kept Acc0, then kept Acc2, read back from the stack
111Accy            RN 9                                    ;// kept Acc1, then kept Acc3, read back from the stack
112Temp6           RN 14
113
114        M_STRD      pDst, dstStep, ppDstArgs            ;// save caller's pDst/dstStep; r2/r3 now address the intermediate buffer
115
116        MOV         pDst, pInterBuf                     ;// first pass writes to the intermediate buffer
117        MOV         dstStep, #16                        ;// 16 bytes per intermediate-buffer line
118
119        ;// Counter = 4: the loop below runs for Counter = 4..0, i.e. five passes of two source rows each
120        MOV         Counter, #4
121        M_STR       dstStep, pDstStep                   ;// spilled: r3 is reused as Temp1 inside the loop
122        M_STR       srcStep, pSrcStep                   ;// spilled: r1 is reused as Temp2/ValI inside the loop
123        LDR         r0x00ff00ff, =0x00ff00ff               ;// [0 255 0 255] 255 is offset to avoid negative results
124        ;// Each pass handles two rows at once: low halfword = row 0 of the pair, high halfword = row 1
125HeightLoop
126NextTwoRowsLoop
127        LDR     ValD, [pSrc, srcStep]                   ;// Load row 1 [d1 c1 b1 a1]
128        LDR     ValA, [pSrc], #4                        ;// Load row 0 [d0 c0 b0 a0]
129        LDR     ValH, [pSrc, srcStep]                   ;// Load  [h1 g1 f1 e1]
130        LDR     ValE, [pSrc], #4                        ;// Load  [h0 g0 f0 e0]
131        LDRB    Temp2, [pSrc, srcStep]                  ;// Load byte i1 of row 1 (this overwrites srcStep in r1)
132        LDRB    Temp1, [pSrc], #-8                      ;// Load byte i0 of row 0, then rewind pSrc to the row start
133
134        PKHBT   ValB, ValA, ValD, LSL #16               ;// [b1 a1 b0 a0]
135        PKHTB   ValD, ValD, ValA, ASR #16               ;// [d1 c1 d0 c0]
136        UXTAB16 ValA, r0x00ff00ff, ValB                 ;// [00 a1 00 a0] + [0 255 0 255]
137        UXTAB16 ValC, r0x00ff00ff, ValD                 ;// [00 c1 00 c0] + [0 255 0 255]
138        PKHBT   ValI, Temp1, Temp2, LSL #16             ;// [00 i1 00 i0]
139        PKHBT   ValF, ValE, ValH, LSL #16               ;// [f1 e1 f0 e0]
140        PKHTB   ValH, ValH, ValE, ASR #16               ;// [h1 g1 h0 g0]
141        UXTAB16 ValE, r0x00ff00ff, ValF                 ;// [00 e1 00 e0] + [0 255 0 255]
142        ;// Per-halfword sums below; "ROR #8" picks the odd bytes (b, d, f, h) of a packed register
143        ;// Calculate Acc0
144        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
145        UXTAB16 Temp1, ValC, ValD, ROR #8               ;// (c + 255) + d
146        UXTAB16 Temp3, ValE, ValB, ROR #8               ;// (e + 255) + b
147        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(c + d) - (b + e) + 3*255
148        UXTAB16 Acc0, ValA, ValF, ROR #8                ;// (a + 255) + f
149        ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(c + d) - 5*(b + e) + 15*255
150        ADD     Acc0, Acc0, Temp1                       ;// Acc0 = filter value + 16*255
151
152        ;// Calculate Acc1
153        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
154        UXTAB16 Temp1, ValE, ValD, ROR #8               ;// (e + 255) + d
155        UXTAB16 Temp3, ValC, ValF, ROR #8               ;// (c + 255) + f
156        RSB     Temp1, Temp3, Temp1, LSL #2             ;// 4*(d + e) - (c + f) + 3*255
157        UXTAB16 ValG, r0x00ff00ff, ValH                 ;// [00 g1 00 g0] + [0 255 0 255]
158        ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(d + e) - 5*(c + f) + 15*255
159        UXTAB16 Acc1, ValG, ValB, ROR #8                ;// (g + 255) + b; ValB (same register) is dead after this
160        ADD     Acc1, Acc1, Temp1                       ;// Acc1 = filter value + 16*255
161
162        UXTAB16 Acc2, ValC, ValH, ROR #8                ;// (c + 255) + h, first term of Acc2; ValC (same register) is dead after this
163        ADD     ValI, r0x00ff00ff, ValI                 ;// [00 i1 00 i0] + [0 255 0 255]
164
165        ;// Calculate Acc2
166        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
167        UXTAB16 Temp1, ValG, ValD, ROR #8               ;// (g + 255) + d
168        UXTAB16 Acc3, ValI, ValD, ROR #8                ;// (i + 255) + d, first term of Acc3; ValD (same register) is dead after this
169        UXTAB16 Temp2, ValE, ValF, ROR #8               ;// (e + 255) + f
170
171        RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(e + f) - (d + g) + 3*255
172        UXTAB16 Temp2, ValG, ValF, ROR #8               ;// (g + 255) + f, used for Acc3 below
173        ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(e + f) - 5*(d + g) + 15*255
174        ADD     Acc2, Acc2, Temp1                       ;// Acc2 = filter value + 16*255
175
176        ;// Calculate Acc3
177        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
178        UXTAB16 Temp1, ValE, ValH, ROR #8               ;// (e + 255) + h
179        RSB     Temp1, Temp1, Temp2, LSL #2             ;// 4*(f + g) - (e + h) + 3*255
180        ADD     Temp1, Temp1, Temp1, LSL #2             ;// x5: 20*(f + g) - 5*(e + h) + 15*255
181        ADD     Acc3, Acc3, Temp1                       ;// Acc3 = filter value + 16*255
182
183        M_LDR   dstStep, pDstStep                       ;// r3 was used as Temp1 above
184        M_LDR   srcStep, pSrcStep                       ;// r1 was used as Temp2/ValI above
185
186        ;// If Counter is even, keep Acc0-Acc3 on the stack (pTempResult1/pTempResult2)
187        ;// If Counter is odd, interleave Acc0-Acc3 with the kept Acc0-Acc3 and store two lines to the intermediate buffer
188        ANDS        Temp3, Counter, #1                  ;// Z is set when Counter is even
189        BEQ         NoProcessing
190
191        ;// Packing previous and current Acc0-Acc3 values
192        M_LDRD      Accx, Accy, pTempResult1
193        PKHBT       Temp6, Accx, Acc0, LSL #16          ;//[0 a2 0 a0] = [0 a3 0 a2] [0 a1 0 a0]
194        PKHTB       Acc0, Acc0, Accx, ASR #16           ;//[0 a3 0 a1] = [0 a1 0 a0] [0 a3 0 a2]
195        STR         Acc0, [pDst, dstStep]               ;// odd rows go one buffer line further on
196        STR         Temp6, [pDst], #4                   ;// even rows; step to the next column word
197        PKHBT       Temp6, Accy, Acc1, LSL #16          ;//[0 b2 0 b0] = [0 b3 0 b2] [0 b1 0 b0]
198        PKHTB       Acc1, Acc1, Accy, ASR #16            ;//[0 b3 0 b1] = [0 b1 0 b0] [0 b3 0 b2]
199        M_LDRD      Accx, Accy, pTempResult2
200        STR         Acc1, [pDst, dstStep]
201        STR         Temp6, [pDst], #4
202
203        PKHBT       Temp6, Accx, Acc2, LSL #16          ;//[0 c2 0 c0] = [0 c3 0 c2] [0 c1 0 c0]
204        PKHTB       Acc2, Acc2, Accx, ASR #16            ;//[0 c3 0 c1] = [0 c1 0 c0] [0 c3 0 c2]
205        STR         Acc2, [pDst, dstStep]
206        STR         Temp6, [pDst], #4
207        PKHBT       Temp6, Accy, Acc3, LSL #16          ;//[0 d2 0 d0] = [0 d3 0 d2] [0 d1 0 d0]
208        PKHTB       Acc3, Acc3, Accy, ASR #16            ;//[0 d3 0 d1] = [0 d1 0 d0] [0 d3 0 d2]
209        STR         Acc3, [pDst, dstStep]
210        STR         Temp6, [pDst], #-12                 ;// back to the start of the buffer line
211        ADD         pDst, pDst, dstStep, LSL #1         ;// advance two lines in the intermediate buffer
212        B           AfterStore
213
214NoProcessing
215        M_STRD      Acc0, Acc1, pTempResult1            ;// keep this row pair for the next (odd) pass
216        M_STRD      Acc2, Acc3, pTempResult2
217AfterStore
218        SUBS        Counter, Counter, #1                ;// Counter runs 4..0: five passes x two rows = 10 rows
219        ADD         pSrc, pSrc, srcStep, LSL #1         ;// advance two source rows (does not touch the flags)
220        BPL         HeightLoop                          ;// uses the flags set by SUBS
221        ;// The last pass (Counter = 0) was not stored by the loop; store it unpacked as a fifth buffer line
222        STR         Acc0, [pDst], #4                    ;//[0 a1 0 a0]
223        STR         Acc1, [pDst], #4
224        STR         Acc2, [pDst], #4
225        STR         Acc3, [pDst], #-12                  ;// pDst back at the start of this fifth line
226
227        ;//
228        ;// Second pass: 6-tap filter down each column of the intermediate buffer, using multiply-accumulate
229        ;//
230
231        SUB         pSrc, pDst, dstStep, LSL #2         ;// pSrc = first line of the intermediate buffer (pDst is four lines in)
232        MOV         srcStep, #16                        ;// stride between intermediate-buffer lines
233        M_LDRD      pDst, dstStep, ppDstArgs            ;// restore the caller's pDst/dstStep
234
235        MOV         Counter, #4                         ;// one pass per output column
236        LDR         r0x0014fffb, =0x0014fffb            ;// halfwords [20, -5]
237        LDR         r0x00140001, =0x00140001            ;// halfwords [20, 1]
238
239HeightLoop1
240        M_STR       Counter, pCounter                   ;// spilled: r11 is reused as ValHF and r0x0001fc00 below
241
242        M_LDR       ValCA, [pSrc], srcStep               ;// Load  [0 c 0 a]
243        M_LDR       ValDB, [pSrc], srcStep               ;// Load  [0 d 0 b]
244        M_LDR       ValGE, [pSrc], srcStep               ;// Load  [0 g 0 e]
245        M_LDR       ValHF, [pSrc], srcStep               ;// Load  [0 h 0 f]
246
247        ;// a..i are the nine first-pass values of this column, top to bottom (i is loaded further down):
248        ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
249        ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
250        ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
251        ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
252
253        SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = 20*c + a
254        SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = 20*d + b
255        SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = 20*e - 5*g (X form swaps the halfwords of the constant)
256        SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = 20*g - 5*e
257
258        SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += 20*d - 5*b
259        SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += 20*e + g
260        SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += 20*f + h
261        SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += 20*f - 5*h
262
263        SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += -5*e
264        SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += -5*c
265        SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c
266        SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d
267
268        LDRH        ValCA, [pSrc], #4                   ;// ValCA = i (fifth line); the +4 makes the later SUB of 4*srcStep net one column forward
269        SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += f
270        SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += -5*f
271        SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += -5*d
272        SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i
273        ;// Each first-pass value carries +16*255 and the six taps sum to 32, so subtracting 0x1fc00 removes that offset and leaves +512 for rounding
274        LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512
275        SUB         Acc0, Acc0, r0x0001fc00
276        SUB         Acc1, Acc1, r0x0001fc00
277        SUB         Acc2, Acc2, r0x0001fc00
278        SUB         Acc3, Acc3, r0x0001fc00
279
280        USAT        Acc0, #18, Acc0                     ;// clamp to [0, 2^18 - 1] so that >> 10 yields 0..255
281        USAT        Acc1, #18, Acc1
282        USAT        Acc2, #18, Acc2
283        USAT        Acc3, #18, Acc3
284        ;// Store one output column, top to bottom
285        MOV         Acc0, Acc0, LSR #10
286        M_STRB      Acc0, [pDst], dstStep
287        MOV         Acc1, Acc1, LSR #10
288        M_STRB      Acc1, [pDst], dstStep
289        MOV         Acc2, Acc2, LSR #10
290        M_STRB      Acc2, [pDst], dstStep
291        MOV         Acc3, Acc3, LSR #10
292        M_STRB      Acc3, [pDst], dstStep
293
294
295        M_LDR       Counter, pCounter                   ;// r11 was reused as ValHF/r0x0001fc00 above
296        SUB         pDst, pDst, dstStep, LSL #2         ;// back to the top output row
297        SUB         pSrc, pSrc, srcStep, LSL #2         ;// back to the first buffer line, one column (4 bytes) on
298        ADD         pDst, pDst, #1                      ;// next output column
299        SUBS        Counter, Counter, #1
300        BGT         HeightLoop1
301End
302        SUB         pDst, pDst, #4                      ;// undo the four +1 column steps: pDst = caller's pDst
303        SUB         pSrc, pSrc, #16                     ;// undo the four +4 column steps: pSrc = start of the intermediate buffer
304        ;// NOTE(review): r0 = intermediate buffer and r1 = 16 here, not the caller's pSrc/srcStep - confirm callers expect this
305        M_END
306
307    ENDIF
308
309    END
310
311