omxVCM4P10_InterpolateLuma_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  omxVCM4P10_InterpolateLuma_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   12290
21;// Date:       Wednesday, April 9, 2008
22;//
23;//
24;//
25;//
26
27;// Function:
28;//     omxVCM4P10_InterpolateLuma
29;//
30;// This function implements omxVCM4P10_InterpolateLuma in ARM NEON assembly (CortexA8 variant).
31;// Performs quarter pel interpolation of inter luma MB.
32;// It's assumed that the frame is already padded when calling this function.
33;// Parameters:
34;// [in]    pSrc        Pointer to the source reference frame buffer
35;// [in]    srcStep     Reference frame step in bytes
36;// [in]    dstStep     Destination frame step in bytes. Must be a multiple of roi.width
37;// [in]    dx          Fractional part of horizontal motion vector
38;//                         component in 1/4 pixel unit; valid in the range [0,3]
39;// [in]    dy          Fractional part of vertical motion vector
40;//                         component in 1/4 pixel unit; valid in the range [0,3]
41;// [in]    roi         Dimension of the interpolation region; roi.width and roi.height must
42;//                         each be equal to 4, 8, or 16.
43;// [out]   pDst        Pointer to the destination frame buffer.
44;//                   if roi.width==4,  4-byte alignment required
45;//                   if roi.width==8,  8-byte alignment required
46;//                   if roi.width==16, 16-byte alignment required
47;//
48;// Return Value:
49;// If the function runs without error, it returns OMX_Sts_NoErr.
50;// It is assumed that the following conditions are satisfied before calling this function:
51;//  pSrc and pDst are not NULL.
52;//  srcStep and dstStep are >= roi.width.
53;//     dx and dy are in the range [0-3].
54;//     roi.width and roi.height are each one of {4, 8, 16}.
55;//     If roi.width is equal to 4, pDst is 4 byte aligned.
56;//     If roi.width is equal to 8, pDst is 8 byte aligned.
57;//     If roi.width is equal to 16, pDst is 16 byte aligned.
58;//     srcStep and dstStep are multiples of 8.
59;//
60;//
61
62
63        INCLUDE omxtypes_s.h
64        INCLUDE armCOMM_s.h
65
66        M_VARIANTS CortexA8
67
68        EXPORT omxVCM4P10_InterpolateLuma
69
70
71    IF CortexA8
72        IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
73        IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
74        IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
75        IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
76    ENDIF
77
78
79
80;// Declare input registers
81pSrc            RN 0      ;// r0: source pointer
82srcStep         RN 1      ;// r1: source row step in bytes
83pDst            RN 2      ;// r2: destination pointer
84dstStep         RN 3      ;// r3: destination row step in bytes
85iHeight         RN 4      ;// r4: rows still to process; loaded from stack argument ptrHeight
86iWidth          RN 5      ;// r5: columns still to process in the current band; loaded from ptrWidth
87
88;// Declare other intermediate registers
89idx             RN 6      ;// r6: value loaded from stack argument ptridx
90idy             RN 7      ;// r7: value loaded from stack argument ptridy
91index           RN 6      ;// r6 again: index = idx + 4*idy is written over idx, so idx is dead afterwards
92Temp            RN 12     ;// r12: scratch pointer, set two rows below pSrc/pDst in each case
93pArgs           RN 11     ;// r11: address of the ppArgs save area used by STM/LDM in the block loop
94
95
96    IF CortexA8
97
98        ;//
99        ;// Interpolation of luma is implemented by processing block of pixels, size 4x4 at a time.
100        ;//
101        M_ALLOC4    ppArgs, 16                              ;// 16-byte local: holds {pSrc,srcStep,pDst,dstStep} saved by the STM at the top of each 4x4 iteration
102
103        ;// Function header
104        M_START omxVCM4P10_InterpolateLuma, r11, d15        ;// NOTE(review): macro defined in armCOMM_s.h (not visible here); r11/d15 presumably name the highest core/NEON registers to preserve - confirm
105
106pSrcBK          RN 8      ;// r8: copy of pSrc kept across the first helper call in the cases that call two helpers
107
108;// Declare Neon registers
109dCoeff5         DN 30.S16 ;// d30: set to 5 before the block loop; not referenced again in this file (NOTE(review): presumably read by the imported helpers - confirm)
110dCoeff20        DN 31.S16 ;// d31: set to 20 before the block loop; same note as dCoeff5
111
112;// Registers used for implementing Horizontal interpolation
113dSrc0c          DN 14.U8
114dSrc1c          DN 16.U8
115dSrc2c          DN 18.U8
116dSrc3c          DN 20.U8
117dSrc0d          DN 15.U8
118dSrc1d          DN 17.U8
119dSrc2d          DN 19.U8
120dSrc3d          DN 21.U8
121dAccH0          DN 22.U8
122dAccH1          DN 24.U8
123dAccH2          DN 26.U8
124dAccH3          DN 28.U8
125dResultH0       DN 22.U32 ;// dResultH0..3 are the same registers as dAccH0..3, viewed as 32-bit lanes for the 4-byte row stores
126dResultH1       DN 24.U32
127dResultH2       DN 26.U32
128dResultH3       DN 28.U32
129
130;// Registers used for implementing Vertical interpolation
131dSrc0           DN 9.U8
132dSrc1           DN 10.U8
133dSrc2           DN 11.U8
134dSrc3           DN 12.U8
135dSrc4           DN 13.U8
136dAccV0          DN 0.U8
137dAccV1          DN 2.U8
138dAccV2          DN 4.U8
139dAccV3          DN 6.U8
140dResultV0       DN 0.U32  ;// dResultV0..3 are the same registers as dAccV0..3, viewed as 32-bit lanes
141dResultV1       DN 2.U32
142dResultV2       DN 4.U32
143dResultV3       DN 6.U32
144
145;// Registers used for implementing Diagonal interpolation
146dTAcc0          DN 0.U8   ;// dTAcc0..3 / dTRes0..3 share d0,d2,d4,d6 with dAccV0..3 / dResultV0..3
147dTAcc1          DN 2.U8
148dTAcc2          DN 4.U8
149dTAcc3          DN 6.U8
150dTRes0          DN 0.32
151dTRes1          DN 2.32
152dTRes2          DN 4.32
153dTRes3          DN 6.32
154dTResult0       DN 14.U8  ;// dTResult0..3 share d14,d16,d18,d20 with dSrc0c..dSrc3c
155dTResult1       DN 16.U8
156dTResult2       DN 18.U8  ;// d18 is also dTempP0 and the low half of q9 (qTempP01 / qRes4)
157dTResult3       DN 20.U8  ;// d20 is also dTempQ0 and the low half of q10 (qTempQ01 / qRes5)
158dTempP0         DN 18.S16
159dTempP1         DN 19.S16
160dTempQ0         DN 20.S16
161dTempQ1         DN 21.S16
162dTempR0         DN 22.S16
163dTempR1         DN 23.S16
164dTempS0         DN 24.S16
165dTempS1         DN 25.S16
166qTempP01        QN 9.S16  ;// q9  = d18:d19 = dTempP0:dTempP1
167qTempQ01        QN 10.S16 ;// q10 = d20:d21 = dTempQ0:dTempQ1
168qTempR01        QN 11.S16 ;// q11 = d22:d23 = dTempR0:dTempR1
169qTempS01        QN 12.S16 ;// q12 = d24:d25 = dTempS0:dTempS1
170
171;// Intermediate values for averaging
172qRes2           QN 7.S16  ;// q7  = d14:d15
173qRes3           QN 8.S16  ;// q8  = d16:d17
174qRes4           QN 9.S16  ;// q9  = d18:d19 (same register as qTempP01)
175qRes5           QN 10.S16 ;// q10 = d20:d21 (same register as qTempQ01)
176qRes6           QN 11.S16 ;// q11 = d22:d23 (same register as qTempR01)
177
178;// For implementing copy
179dDst0            DN 9.32  ;// dDst0..3 are dSrc0..3 (d9..d12) viewed as 32-bit lanes, so Case_0 stores exactly what it loaded
180dDst1            DN 10.32
181dDst2            DN 11.32
182dDst3            DN 12.32
183
184        ;// Define stack arguments (four 4-byte slots, declared in the order they are expected on the stack)
185        M_ARG       ptridx, 4
186        M_ARG       ptridy, 4
187        M_ARG       ptrWidth, 4
188        M_ARG       ptrHeight, 4
189
190        ;// Load the two motion-vector fractions and the roi width/height from the stack arguments
191        M_LDR       idx, ptridx
192        M_LDR       idy, ptridy
193        M_LDR       iWidth, ptrWidth
194        M_LDR       iHeight, ptrHeight
195
196        ADD         index, idx, idy, LSL #2                 ;//  [index] = [idy][idx], i.e. idx + 4*idy; index and idx are both r6, so idx is overwritten here
197        M_ADR       pArgs, ppArgs                           ;// pArgs = address of the 16-byte ppArgs save area
198
199        ;// Move coefficients Neon registers
200        ;// NOTE(review): d30/d31 are not read again in this file; presumably the imported helpers use them as filter coefficients - confirm
201        VMOV        dCoeff20, #20
202        VMOV        dCoeff5, #5
202
203Block4x4WidthLoop
204Block4x4HeightLoop
205        ;// Both loop labels mark the same address: every iteration (next column or next band) re-enters here.
206        STM         pArgs, {pSrc,srcStep,pDst,dstStep}      ;// save the block's base pointers/steps; each case modifies pSrc/pDst and Block4x4LoopEnd restores them with LDM
207
208        ;// switch table using motion vector as index
209        ADD         pc, pc, index, LSL #2                   ;// ARM state: pc reads as this instruction + 8, so index 0 lands on the SECOND branch below
210        B           Case_f                                  ;// filler slot at +4; not a target of the ADD for index 0..15
211        B           Case_0                                  ;// index 0  (idx 0, idy 0)
212        B           Case_1                                  ;// index 1  (idx 1, idy 0)
213        B           Case_2                                  ;// index 2  (idx 2, idy 0)
214        B           Case_3                                  ;// index 3  (idx 3, idy 0)
215        B           Case_4                                  ;// index 4  (idx 0, idy 1)
216        B           Case_5                                  ;// index 5  (idx 1, idy 1)
217        B           Case_6                                  ;// index 6  (idx 2, idy 1)
218        B           Case_7                                  ;// index 7  (idx 3, idy 1)
219        B           Case_8                                  ;// index 8  (idx 0, idy 2)
220        B           Case_9                                  ;// index 9  (idx 1, idy 2)
221        B           Case_a                                  ;// index 10 (idx 2, idy 2)
222        B           Case_b                                  ;// index 11 (idx 3, idy 2)
223        B           Case_c                                  ;// index 12 (idx 0, idy 3)
224        B           Case_d                                  ;// index 13 (idx 1, idy 3)
225        B           Case_e                                  ;// index 14 (idx 2, idy 3)
226        B           Case_f                                  ;// index 15 (idx 3, idy 3)
227
228Case_0
229        ;// Case G: index 0 (idx 0, idy 0) - copy the 4x4 block unchanged, no helper call
230        M_PRINTF "Case 0 \n"
231
232        ;// Loads a 4x4 block of .8 and stores as .32
233        ADD         Temp, pSrc, srcStep, LSL #1             ;// Temp = pSrc + 2*srcStep (source row 2)
234        VLD1        dSrc0, [pSrc], srcStep                  ;// row 0 (a whole D register is loaded); pSrc moves to row 1
235        VLD1        dSrc2, [Temp], srcStep                  ;// row 2; Temp moves to row 3
236        VLD1        dSrc1, [pSrc]                           ;// row 1
237        VLD1        dSrc3, [Temp]                           ;// row 3
238
239        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = pDst + 2*dstStep (destination row 2)
240        VST1        dDst0[0], [pDst], dstStep               ;// 32-bit lane 0 = first 4 bytes of row 0; pDst moves to row 1
241        VST1        dDst2[0], [Temp], dstStep               ;// row 2; Temp moves to row 3
242        VST1        dDst1[0], [pDst]                        ;// row 1
243        VST1        dDst3[0], [Temp]                        ;// row 3
244        M_ADR       pArgs, ppArgs                           ;// pArgs = ppArgs save area, read by the LDM at Block4x4LoopEnd
245        B           Block4x4LoopEnd
246Case_1
247        ;// Case a: index 1 (idx 1, idy 0) - horizontal helper result averaged with dSrc0c..dSrc3c
248        M_PRINTF "Case 1 \n"
249        ;// NOTE(review): the helper's register contract is not visible here; presumably it leaves its 4 result rows in dAccH0..3 and source pixels in dSrc*c/dSrc*d - confirm
250        SUB         pSrc, pSrc, #2                          ;// helper is entered 2 bytes left of the block
251        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
252        VRHADD      dAccH0, dAccH0, dSrc0c                  ;// rounding halving add: (a + b + 1) >> 1 per byte
253        VRHADD      dAccH2, dAccH2, dSrc2c
254        VRHADD      dAccH1, dAccH1, dSrc1c
255        VRHADD      dAccH3, dAccH3, dSrc3c
256        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
257        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
258        VST1        dResultH2[0], [Temp], dstStep
259        VST1        dResultH1[0], [pDst]
260        VST1        dResultH3[0], [Temp]
261        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd (NOTE(review): presumably the helper may clobber r11 - confirm)
262        B           Block4x4LoopEnd
263Case_2
264        ;// Case b: index 2 (idx 2, idy 0) - horizontal helper result stored as is
265        M_PRINTF "Case 2 \n"
266        ;// NOTE(review): presumably the helper leaves its 4 result rows in dAccH0..3 (= dResultH0..3) - confirm against the helper
267        SUB         pSrc, pSrc, #2                          ;// helper is entered 2 bytes left of the block
268        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
269        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
270        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
271        VST1        dResultH2[0], [Temp], dstStep
272        VST1        dResultH1[0], [pDst]
273        VST1        dResultH3[0], [Temp]
274        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
275        B           Block4x4LoopEnd
276Case_3
277        ;// Case c: index 3 (idx 3, idy 0) - horizontal helper result averaged with dSrc0d..dSrc3d
278        M_PRINTF "Case 3 \n"
279        ;// Same sequence as Case_1 except that the averaging partner is dSrc*d instead of dSrc*c (NOTE(review): presumably the pixel one column to the right - confirm against the helper)
280        SUB         pSrc, pSrc, #2                          ;// helper is entered 2 bytes left of the block
281        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
282        VRHADD      dAccH0, dAccH0, dSrc0d                  ;// rounding halving add: (a + b + 1) >> 1 per byte
283        VRHADD      dAccH2, dAccH2, dSrc2d
284        VRHADD      dAccH1, dAccH1, dSrc1d
285        VRHADD      dAccH3, dAccH3, dSrc3d
286        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
287        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
288        VST1        dResultH2[0], [Temp], dstStep
289        VST1        dResultH1[0], [pDst]
290        VST1        dResultH3[0], [Temp]
291        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
292        B           Block4x4LoopEnd
293Case_4
294        ;// Case d: index 4 (idx 0, idy 1) - vertical helper result averaged with dSrc0..dSrc3
295        M_PRINTF "Case 4 \n"
296        ;// NOTE(review): the helper's register contract is not visible here; presumably it leaves its 4 result rows in dAccV0..3 and the source rows of the block in dSrc0..dSrc4 - confirm
297        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above the block
298        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
299        VRHADD      dAccV0, dAccV0, dSrc0                   ;// rounding halving add: (a + b + 1) >> 1 per byte
300        VRHADD      dAccV2, dAccV2, dSrc2
301        VRHADD      dAccV1, dAccV1, dSrc1
302        VRHADD      dAccV3, dAccV3, dSrc3
303        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
304        VST1        dResultV0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
305        VST1        dResultV2[0], [Temp], dstStep
306        VST1        dResultV1[0], [pDst]
307        VST1        dResultV3[0], [Temp]
308        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
309        B           Block4x4LoopEnd
310Case_5
311        ;// Case e: index 5 (idx 1, idy 1) - average of the vertical helper result and the horizontal helper result
312        M_PRINTF "Case 5 \n"
313        ;// NOTE(review): relies on the horizontal helper not disturbing dAccV0..3 (d0,d2,d4,d6) and on neither helper changing pSrcBK/srcStep - confirm against the helpers
314        MOV         pSrcBK, pSrc                            ;// keep the block origin; pSrc is modified before each helper call
315        SUB         pSrc, pSrc, srcStep, LSL #1             ;// vertical helper is entered 2 rows above the block
316        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
317        SUB         pSrc, pSrcBK, #2                        ;// horizontal helper is entered 2 bytes left of the block origin
318        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
319        VRHADD      dAccH0, dAccH0, dAccV0                  ;// rounding halving add of the two results: (h + v + 1) >> 1 per byte
320        VRHADD      dAccH2, dAccH2, dAccV2
321        VRHADD      dAccH1, dAccH1, dAccV1
322        VRHADD      dAccH3, dAccH3, dAccV3
323        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
324        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
325        VST1        dResultH2[0], [Temp], dstStep
326        VST1        dResultH1[0], [pDst]
327        VST1        dResultH3[0], [Temp]
328
329        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
330        B       Block4x4LoopEnd
331Case_6
332        ;// Case f: index 6 (idx 2, idy 1) - diagonal (HorVer) helper result averaged with narrowed qRes2..qRes5
333        M_PRINTF "Case 6 \n"
334        ;// NOTE(review): helper contract not visible; presumably it leaves the final result rows in dTAcc0..3 and 16-bit intermediate rows in qRes2..qRes6 - confirm
335        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above ...
336        SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
337        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
338        VQRSHRUN    dTResult0, qRes2, #5                    ;// rounding shift right by 5 with unsigned saturation, 16-bit -> 8-bit; each destination is the low half of its own source Q register
339        VQRSHRUN    dTResult1, qRes3, #5
340        VQRSHRUN    dTResult2, qRes4, #5
341        VQRSHRUN    dTResult3, qRes5, #5
342        VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add: (a + b + 1) >> 1 per byte
343        VRHADD      dTAcc2, dTAcc2, dTResult2
344        VRHADD      dTAcc1, dTAcc1, dTResult1
345        VRHADD      dTAcc3, dTAcc3, dTResult3
346        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
347        VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
348        VST1        dTRes2[0], [Temp], dstStep
349        VST1        dTRes1[0], [pDst]
350        VST1        dTRes3[0], [Temp]
351
352        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
353        B       Block4x4LoopEnd
354Case_7
355        ;// Case g: index 7 (idx 3, idy 1) - as Case_5, but the vertical helper is run one column to the right
356        M_PRINTF "Case 7 \n"
357        MOV         pSrcBK, pSrc                            ;// keep the block origin; pSrc is modified before each helper call
358        ADD         pSrc, pSrc, #1                          ;// vertical helper works on the column one byte to the right ...
359        SUB         pSrc, pSrc, srcStep, LSL #1             ;// ... and is entered 2 rows above the block
360        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
361        SUB         pSrc, pSrcBK, #2                        ;// horizontal helper is entered 2 bytes left of the block origin
362        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
363        VRHADD      dAccH0, dAccH0, dAccV0                  ;// rounding halving add of the two results: (h + v + 1) >> 1 per byte
364        VRHADD      dAccH2, dAccH2, dAccV2
365        VRHADD      dAccH1, dAccH1, dAccV1
366        VRHADD      dAccH3, dAccH3, dAccV3
367        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
368        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
369        VST1        dResultH2[0], [Temp], dstStep
370        VST1        dResultH1[0], [pDst]
371        VST1        dResultH3[0], [Temp]
372
373        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
374        B       Block4x4LoopEnd
375Case_8
376        ;// Case h: index 8 (idx 0, idy 2) - vertical helper result stored as is
377        M_PRINTF "Case 8 \n"
378        ;// NOTE(review): presumably the helper leaves its 4 result rows in dAccV0..3 (= dResultV0..3) - confirm against the helper
379        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above the block
380        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
381        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
382        VST1        dResultV0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
383        VST1        dResultV2[0], [Temp], dstStep
384        VST1        dResultV1[0], [pDst]
385        VST1        dResultV3[0], [Temp]
386        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
387        B           Block4x4LoopEnd
388Case_9
389        ;// Case i: index 9 (idx 1, idy 2) - diagonal (VerHor) helper result averaged with narrowed dTemp rows taken from element offset 2
390        M_PRINTF "Case 9 \n"
391        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above ...
392        SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
393        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
394        VEXT        dTempP0, dTempP0, dTempP1, #2           ;// dTempX0 = 16-bit elements 2..5 of the dTempX0:dTempX1 pair (NOTE(review): presumably intermediate rows left by the helper - confirm)
395        VEXT        dTempQ0, dTempQ0, dTempQ1, #2
396        VEXT        dTempR0, dTempR0, dTempR1, #2
397        VEXT        dTempS0, dTempS0, dTempS1, #2
398        ;// Keep the P,Q,R,S order below: dTResult2/dTResult3 are d18/d20 = dTempP0/dTempQ0, which must be narrowed before they are overwritten
399        VQRSHRUN    dTResult0, qTempP01, #5                 ;// rounding shift right by 5 with unsigned saturation, 16-bit -> 8-bit
400        VQRSHRUN    dTResult1, qTempQ01, #5
401        VQRSHRUN    dTResult2, qTempR01, #5
402        VQRSHRUN    dTResult3, qTempS01, #5
403
404        VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add: (a + b + 1) >> 1 per byte
405        VRHADD      dTAcc2, dTAcc2, dTResult2
406        VRHADD      dTAcc1, dTAcc1, dTResult1
407        VRHADD      dTAcc3, dTAcc3, dTResult3
408        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
409        VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
410        VST1        dTRes2[0], [Temp], dstStep
411        VST1        dTRes1[0], [pDst]
412        VST1        dTRes3[0], [Temp]
413        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
414        B       Block4x4LoopEnd
415Case_a
416        ;// Case j: index 10 (idx 2, idy 2) - diagonal (HorVer) helper result stored as is
417        M_PRINTF "Case a \n"
418        ;// NOTE(review): presumably the helper leaves its 4 result rows in dTAcc0..3 (= dTRes0..3) - confirm against the helper
419        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above ...
420        SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
421        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
422        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
423        VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
424        VST1        dTRes2[0], [Temp], dstStep
425        VST1        dTRes1[0], [pDst]
426        VST1        dTRes3[0], [Temp]
427        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
428        B       Block4x4LoopEnd
429Case_b
430        ;// Case k: index 11 (idx 3, idy 2) - as Case_9, but the dTemp rows are taken from element offset 3 instead of 2
431        M_PRINTF "Case b \n"
432        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above ...
433        SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
434        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
435        VEXT        dTempP0, dTempP0, dTempP1, #3           ;// dTempX0 = 16-bit elements 3..6 of the dTempX0:dTempX1 pair (NOTE(review): presumably intermediate rows left by the helper - confirm)
436        VEXT        dTempQ0, dTempQ0, dTempQ1, #3
437        VEXT        dTempR0, dTempR0, dTempR1, #3
438        VEXT        dTempS0, dTempS0, dTempS1, #3
439        ;// Keep the P,Q,R,S order below: dTResult2/dTResult3 are d18/d20 = dTempP0/dTempQ0, which must be narrowed before they are overwritten
440        VQRSHRUN    dTResult0, qTempP01, #5                 ;// rounding shift right by 5 with unsigned saturation, 16-bit -> 8-bit
441        VQRSHRUN    dTResult1, qTempQ01, #5
442        VQRSHRUN    dTResult2, qTempR01, #5
443        VQRSHRUN    dTResult3, qTempS01, #5
444
445        VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add: (a + b + 1) >> 1 per byte
446        VRHADD      dTAcc2, dTAcc2, dTResult2
447        VRHADD      dTAcc1, dTAcc1, dTResult1
448        VRHADD      dTAcc3, dTAcc3, dTResult3
449        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
450        VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
451        VST1        dTRes2[0], [Temp], dstStep
452        VST1        dTRes1[0], [pDst]
453        VST1        dTRes3[0], [Temp]
454        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
455        B       Block4x4LoopEnd
456Case_c
457        ;// Case n: index 12 (idx 0, idy 3) - as Case_4, but result row k is averaged with dSrc(k+1) instead of dSrc(k)
458        M_PRINTF "Case c \n"
459        ;// NOTE(review): presumably dSrc1..dSrc4 hold the source rows one line below those used in Case_4 - confirm against the helper
460        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above the block
461        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
462        VRHADD      dAccV0, dAccV0, dSrc1                   ;// rounding halving add: (a + b + 1) >> 1 per byte
463        VRHADD      dAccV2, dAccV2, dSrc3
464        VRHADD      dAccV1, dAccV1, dSrc2
465        VRHADD      dAccV3, dAccV3, dSrc4
466        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
467        VST1        dResultV0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
468        VST1        dResultV2[0], [Temp], dstStep
469        VST1        dResultV1[0], [pDst]
470        VST1        dResultV3[0], [Temp]
471        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
472        B           Block4x4LoopEnd
473Case_d
474        ;// Case p: index 13 (idx 1, idy 3) - as Case_5, but the horizontal helper is run one row lower
475        M_PRINTF "Case d \n"
476        ;// NOTE(review): relies on the horizontal helper not disturbing dAccV0..3 and on the first helper preserving pSrcBK and srcStep - confirm against the helpers
477        MOV         pSrcBK, pSrc                            ;// keep the block origin; pSrc is modified before each helper call
478        SUB         pSrc, pSrc, srcStep, LSL #1             ;// vertical helper is entered 2 rows above the block
479        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
480        ADD         pSrc, pSrcBK, srcStep                   ;// horizontal helper works on the rows starting one line below the block origin ...
481        SUB         pSrc, pSrc, #2                          ;// ... and is entered 2 bytes to the left
482        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
483        VRHADD      dAccH0, dAccH0, dAccV0                  ;// rounding halving add of the two results: (h + v + 1) >> 1 per byte
484        VRHADD      dAccH2, dAccH2, dAccV2
485        VRHADD      dAccH1, dAccH1, dAccV1
486        VRHADD      dAccH3, dAccH3, dAccV3
487        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
488        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
489        VST1        dResultH2[0], [Temp], dstStep
490        VST1        dResultH1[0], [pDst]
491        VST1        dResultH3[0], [Temp]
492        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
493        B       Block4x4LoopEnd
494Case_e
495        ;// Case q: index 14 (idx 2, idy 3) - as Case_6, but narrows qRes3..qRes6 (one Q register further on) instead of qRes2..qRes5
496        M_PRINTF "Case e \n"
497        ;// NOTE(review): helper contract not visible; presumably qRes3..qRes6 are the intermediate rows one line below those used in Case_6 - confirm
498        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above ...
499        SUB         pSrc, pSrc, #2                          ;// ... and 2 bytes left of the block
500        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
501        VQRSHRUN    dTResult0, qRes3, #5                    ;// keep this order: each destination (d14,d16,d18,d20) is the low half of the Q register narrowed by the PREVIOUS line
502        VQRSHRUN    dTResult1, qRes4, #5                    ;// rounding shift right by 5 with unsigned saturation, 16-bit -> 8-bit
503        VQRSHRUN    dTResult2, qRes5, #5
504        VQRSHRUN    dTResult3, qRes6, #5
505
506        VRHADD      dTAcc0, dTAcc0, dTResult0               ;// rounding halving add: (a + b + 1) >> 1 per byte
507        VRHADD      dTAcc2, dTAcc2, dTResult2
508        VRHADD      dTAcc1, dTAcc1, dTResult1
509        VRHADD      dTAcc3, dTAcc3, dTResult3
510        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
511        VST1        dTRes0[0], [pDst], dstStep              ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
512        VST1        dTRes2[0], [Temp], dstStep
513        VST1        dTRes1[0], [pDst]
514        VST1        dTRes3[0], [Temp]
515        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs for the LDM at Block4x4LoopEnd
516        B       Block4x4LoopEnd
517Case_f
518        ;// Case r: index 15 (idx 3, idy 3) - vertical helper one column to the right, horizontal helper one row lower, results averaged
519        M_PRINTF "Case f \n"
520        MOV         pSrcBK, pSrc                            ;// keep the block origin; pSrc is modified before each helper call
521        ADD         pSrc, pSrc, #1                          ;// vertical helper works on the column one byte to the right ...
522        SUB         pSrc, pSrc, srcStep, LSL #1             ;// ... and is entered 2 rows above the block
523        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
524        ADD         pSrc, pSrcBK, srcStep                   ;// horizontal helper works on the rows starting one line below the block origin ...
525        SUB         pSrc, pSrc, #2                          ;// ... and is entered 2 bytes to the left
526        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
527        VRHADD      dAccH0, dAccH0, dAccV0                  ;// rounding halving add of the two results: (h + v + 1) >> 1 per byte
528        VRHADD      dAccH2, dAccH2, dAccV2
529        VRHADD      dAccH1, dAccH1, dAccV1
530        VRHADD      dAccH3, dAccH3, dAccV3
531        ADD         Temp, pDst, dstStep, LSL #1             ;// Temp = destination row 2
532        VST1        dResultH0[0], [pDst], dstStep           ;// store 4 bytes per row: rows 0,1 through pDst, rows 2,3 through Temp
533        VST1        dResultH2[0], [Temp], dstStep
534        VST1        dResultH1[0], [pDst]
535        VST1        dResultH3[0], [Temp]
536        M_ADR       pArgs, ppArgs                           ;// re-derive pArgs; no branch needed, execution falls through to Block4x4LoopEnd
537
538
539Block4x4LoopEnd
540        ;// Common tail of every case: restore the block's base registers, then step to the next 4x4 block.
541        ;// Width Loop
542        ;//M_ADR       pArgs, ppArgs
543        LDM         pArgs, {pSrc,srcStep,pDst,dstStep}  ;// Load arguments
544        SUBS        iWidth, iWidth, #4                      ;// 4 columns done; sets the flags tested by the BGT below
545        ADD         pSrc, pSrc, #4                          ;// next block to the right (ADD without S leaves the flags alone)
546        ADD         pDst, pDst, #4
547        BGT         Block4x4WidthLoop                       ;// more columns left in this band of 4 rows
548
549        ;// Height Loop
550        SUBS        iHeight, iHeight, #4                    ;// 4 rows done; sets the flags tested by the BGT below
551        M_LDR       iWidth, ptrWidth                        ;// reload the full width for the next band (NOTE(review): M_LDR/M_ADR are assumed not to modify flags - confirm in armCOMM_s.h)
552        M_ADR       pArgs, ppArgs
553        ADD         pSrc, pSrc, srcStep, LSL #2             ;// down 4 rows
554        ADD         pDst, pDst, dstStep, LSL #2
555        SUB         pSrc, pSrc, iWidth                      ;// back to the first column: the width loop advanced pSrc/pDst by the full width in total
556        SUB         pDst, pDst, iWidth
557        BGT         Block4x4HeightLoop                      ;// more rows left
558
559EndOfInterpolation
560        MOV         r0, #0                                  ;// return value 0 (the file header documents the success return as OMX_Sts_NoErr)
561        M_END
562
563    ENDIF
564        ;// End of CortexA8
565
566    END
567
568