1;//
2;//
3;// File Name:  omxVCM4P2_MCReconBlock_s.s
4;// OpenMAX DL: v1.0.2
5;// Revision:   12290
6;// Date:       Wednesday, April 9, 2008
7;//
8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
9;//
10;//
11;//
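12;// Description:
13;// CortexA8 (NEON) implementation of omxVCM4P2_MCReconBlock: copies or half-pixel interpolates eight rows of eight pixels from pSrc into pDst and, when pSrcResidue is not NULL, adds the 16-bit residue with saturation to 8 bits.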
14;//
15
16;// Include standard headers
17    INCLUDE omxtypes_s.h
18    INCLUDE armCOMM_s.h
19
20;// Import symbols required from other files
21
22    M_VARIANTS CortexA8
23
24;// ***************************************************************************
25;// ARM1136JS implementation
26;// ***************************************************************************
27
28;// ***************************************************************************
29;// CortexA8 implementation
30;// ***************************************************************************
31    IF  CortexA8
32;// ***************************************************************************
33;// MACRO DEFINITIONS
34;// ***************************************************************************
35    ;// Description:
36    ;// Does interpolation for the case of "IntegerPixel" predictType. Both
37    ;// rounding cases are handled. Just copies a block from pSrc to pDst
38    ;//
39    ;// Syntax:
40    ;// M_MCRECONBLOCK_IntegerPixel
41    ;//
42    ;// Inputs: None
43    ;// Outputs: None
44
45    MACRO
46    M_MCRECONBLOCK_IntegerPixel
    ;// A plain copy needs no rounding, so both switch labels below mark the
    ;// same address and share one body.
47CaseIntegerPixel_Rnd0
48CaseIntegerPixel_Rnd1
49
    ;// Read eight rows of eight bytes each. pSrc is post-incremented by
    ;// srcStep after every load.
50    VLD1        dRow0, [pSrc], srcStep
51    VLD1        dRow1, [pSrc], srcStep
52    VLD1        dRow2, [pSrc], srcStep
53    VLD1        dRow3, [pSrc], srcStep
54    VLD1        dRow4, [pSrc], srcStep
55    VLD1        dRow5, [pSrc], srcStep
56    VLD1        dRow6, [pSrc], srcStep
57    VLD1        dRow7, [pSrc], srcStep
58
    ;// Write the rows back unchanged. "@64" states that pDst is 64-bit
    ;// aligned. pDst is post-incremented by dstStep after every store, so it
    ;// ends up 8*dstStep past the first row; the residue code after
    ;// SwitchPredictTypeEnd rewinds it by that amount.
59    VST1        dRow0, [pDst@64], dstStep
60    VST1        dRow1, [pDst@64], dstStep
61    VST1        dRow2, [pDst@64], dstStep
62    VST1        dRow3, [pDst@64], dstStep
63    VST1        dRow4, [pDst@64], dstStep
64    VST1        dRow5, [pDst@64], dstStep
65    VST1        dRow6, [pDst@64], dstStep
66    VST1        dRow7, [pDst@64], dstStep
67
    ;// Jump over the other switch cases, which are laid out after this one.
68    B           SwitchPredictTypeEnd
69    MEND
70;// ***************************************************************************
71    ;// Description:
72    ;// Does interpolation for the case of "HalfPixelX" predictType. The two
73    ;// rounding cases are handled by the parameter "$rndVal". Averages between
74    ;// a pixel and pixel right to it, rounding it based on $rndVal. The
75    ;// rounding is implemented by using opCode switching between "VRHADD" and
76    ;// "VHADD" instructions.
77    ;//
78    ;// Syntax:
79    ;// M_MCRECONBLOCK_HalfPixelX $rndVal
80    ;//
81    ;// Inputs:
82    ;//     $rndVal: 0 selects the rounding average (VRHADD), 1 selects the truncating average (VHADD)
83    ;// Outputs: None
84
85    MACRO
86    M_MCRECONBLOCK_HalfPixelX $rndVal
87
    ;// Assembly-time selection of the averaging opcode held in the local
    ;// string variable M_VHADDR: VRHADD computes (a + b + 1) >> 1, VHADD
    ;// computes (a + b) >> 1.
88    LCLS M_VHADDR
89    IF $rndVal = 0
90M_VHADDR SETS "VRHADD"
91    ELSE
92M_VHADDR SETS "VHADD"
93    ENDIF
94
    ;// Switch target for this rounding variant (label name ends in 0 or 1).
95CaseHalfPixelX_Rnd$rndVal
96
    ;// For every row: load 16 bytes into the pair {dRowN, dRowNShft}, then
    ;// VEXT #1 rebuilds dRowNShft as bytes 1..8 of those 16, i.e. the row
    ;// moved one pixel to the right. Bytes 9..15 of each load are not used.
    ;// pSrc is post-incremented by srcStep after every load.
97    VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
98    VEXT        dRow0Shft, dRow0, dRow0Shft, #1
99    VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
100    VEXT        dRow1Shft, dRow1, dRow1Shft, #1
101    VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
102    VEXT        dRow2Shft, dRow2, dRow2Shft, #1
103    VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
104    VEXT        dRow3Shft, dRow3, dRow3Shft, #1
105    VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
106    VEXT        dRow4Shft, dRow4, dRow4Shft, #1
107    VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
108    VEXT        dRow5Shft, dRow5, dRow5Shft, #1
109    VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
110    VEXT        dRow6Shft, dRow6, dRow6Shft, #1
111    VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
112    VEXT        dRow7Shft, dRow7, dRow7Shft, #1
    ;// Average every pixel with its right-hand neighbour and store the row.
    ;// Each store is issued one averaging instruction after the one that
    ;// produced its data. pDst is post-incremented by dstStep per store.
113    $M_VHADDR   dRow0, dRow0, dRow0Shft
114    $M_VHADDR   dRow1, dRow1, dRow1Shft
115    VST1        dRow0, [pDst@64], dstStep
116    $M_VHADDR   dRow2, dRow2, dRow2Shft
117    VST1        dRow1, [pDst@64], dstStep
118    $M_VHADDR   dRow3, dRow3, dRow3Shft
119    VST1        dRow2, [pDst@64], dstStep
120    $M_VHADDR   dRow4, dRow4, dRow4Shft
121    VST1        dRow3, [pDst@64], dstStep
122    $M_VHADDR   dRow5, dRow5, dRow5Shft
123    VST1        dRow4, [pDst@64], dstStep
124    $M_VHADDR   dRow6, dRow6, dRow6Shft
125    VST1        dRow5, [pDst@64], dstStep
126    $M_VHADDR   dRow7, dRow7, dRow7Shft
127    VST1        dRow6, [pDst@64], dstStep
128    VST1        dRow7, [pDst@64], dstStep
129
    ;// Jump over the switch cases that are laid out after this one.
130    B           SwitchPredictTypeEnd
131    MEND
132;// ***************************************************************************
133    ;// Description:
134    ;// Does interpolation for the case of "HalfPixelY" predictType. The two
135    ;// rounding cases are handled by the parameter "$rndVal". Averages between
136    ;// a pixel and pixel below it, rounding it based on $rndVal. The
137    ;// rounding is implemented by using opCode switching between "VRHADD" and
138    ;// "VHADD" instructions.
139    ;//
140    ;// Syntax:
141    ;// M_MCRECONBLOCK_HalfPixelY $rndVal
142    ;//
143    ;// Inputs:
144    ;//     $rndVal: 0 selects the rounding average (VRHADD), 1 selects the truncating average (VHADD)
145    ;// Outputs: None
146
147    MACRO
148    M_MCRECONBLOCK_HalfPixelY $rndVal
149
    ;// Assembly-time selection of the averaging opcode held in the local
    ;// string variable M_VHADDR: VRHADD computes (a + b + 1) >> 1, VHADD
    ;// computes (a + b) >> 1.
150    LCLS M_VHADDR
151    IF $rndVal = 0
152M_VHADDR SETS "VRHADD"
153    ELSE
154M_VHADDR SETS "VHADD"
155    ENDIF
156
    ;// Switch target for this rounding variant (label name ends in 0 or 1).
157CaseHalfPixelY_Rnd$rndVal
    ;// Nine source rows are needed for eight output rows, because output
    ;// row N is the average of source rows N and N+1. The first eight are
    ;// loaded here; the ninth (dRow8) is loaded further down, between the
    ;// first two averaging instructions. pSrc is post-incremented by srcStep
    ;// after every load.
158    VLD1        dRow0, [pSrc], srcStep
159    VLD1        dRow1, [pSrc], srcStep
160    VLD1        dRow2, [pSrc], srcStep
161    VLD1        dRow3, [pSrc], srcStep
162    VLD1        dRow4, [pSrc], srcStep
163    VLD1        dRow5, [pSrc], srcStep
164    VLD1        dRow6, [pSrc], srcStep
165    VLD1        dRow7, [pSrc], srcStep
    ;// Average each row with the row below it, writing the result over the
    ;// upper row, and store it. Row N is overwritten only after it has
    ;// already been consumed as the lower operand of row N-1.
166    $M_VHADDR   dRow0, dRow0, dRow1
167    VLD1        dRow8, [pSrc], srcStep              ;// ninth source row
168    $M_VHADDR   dRow1, dRow1, dRow2
169    VST1        dRow0, [pDst@64], dstStep
170    $M_VHADDR   dRow2, dRow2, dRow3
171    VST1        dRow1, [pDst@64], dstStep
172    $M_VHADDR   dRow3, dRow3, dRow4
173    VST1        dRow2, [pDst@64], dstStep
174    $M_VHADDR   dRow4, dRow4, dRow5
175    VST1        dRow3, [pDst@64], dstStep
176    $M_VHADDR   dRow5, dRow5, dRow6
177    VST1        dRow4, [pDst@64], dstStep
178    $M_VHADDR   dRow6, dRow6, dRow7
179    VST1        dRow5, [pDst@64], dstStep
180    $M_VHADDR   dRow7, dRow7, dRow8
181    VST1        dRow6, [pDst@64], dstStep
182    VST1        dRow7, [pDst@64], dstStep
183
    ;// Jump over the switch cases that are laid out after this one.
184    B           SwitchPredictTypeEnd
185    MEND
186;// ***************************************************************************
187    ;// Description:
188    ;// Does interpolation for the case of "HalfPixelXY" predictType. Both
189    ;// rounding cases are handled.
190    ;// Typical computation for a row goes like this
191    ;//     1. VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
192    ;//     2. VEXT        dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
193    ;//     3. VADDL       qSum0, dRow0, dRow0Shft             ;// Generate the sum of row and shifted row
194    ;//     5. VADD        qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
195    ;//     6. VSHRN       dRow0, qSum0, #2                    ;// Divide by 4
196    ;//     7. VST1        dRow0, [pDst@64], dstStep           ;// Store
197    ;// Odd rows undergo following computation after step 3
198    ;//     4. VADD        qSum1, qSum1, qRound
199    ;// This saves for adding rounding value to each final sum (overall saves 4
200    ;// instructions).
201    ;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
202    ;// care of this and also minimizes stalls. Rounding value was modified in
203    ;// ARM register rndVal (originally used for rounding flag) before the switch.
204    ;// It is then populated into all lanes in this macro. No branching out to
205    ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
206    ;// are the last of switch cases.
207    ;//
208    ;// Syntax:
209    ;// M_MCRECONBLOCK_HalfPixelXY
210    ;//
211    ;// Inputs: None
212    ;// Outputs: None
213
214    MACRO
215    M_MCRECONBLOCK_HalfPixelXY
216
    ;// Both rounding variants share this body; the difference is carried by
    ;// the value in rndVal, which the function sets to (2 - rounding flag)
    ;// before the switch.
    ;//
    ;// Output row N = (hsum(N) + hsum(N+1) + round) >> 2, where hsum(N) is the
    ;// 16-bit sum of source row N and the same row moved one pixel right.
    ;// The round term is added once to every odd hsum (1, 3, 5, 7); each
    ;// adjacent pair contains exactly one odd row, so every output row gets it
    ;// exactly once.
    ;//
    ;// Register reuse: qSum6, qSum7 and qSum8 are aliases of Q0, Q1 and Q2,
    ;// the registers that also hold dRow0..dRow2Shft. They are written only
    ;// after output rows 0, 1 and 2 have been stored.
217CaseHalfPixelXY_Rnd0
218CaseHalfPixelXY_Rnd1
219    VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
220    VDUP        qRound, rndVal                      ;// rndVal copied into every 16-bit lane
221    VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
222    VEXT        dRow0Shft, dRow0, dRow0Shft, #1     ;// row 0 moved one pixel right
223    VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
224    VEXT        dRow1Shft, dRow1, dRow1Shft, #1
225    VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
226    VEXT        dRow2Shft, dRow2, dRow2Shft, #1
227    VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
228    VADDL       qSum0, dRow0, dRow0Shft             ;// hsum(0), widened to 16 bits
229    VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
230    VADDL       qSum1, dRow1, dRow1Shft             ;// hsum(1)
231    VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
232    VEXT        dRow3Shft, dRow3, dRow3Shft, #1
233    VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
234    VEXT        dRow4Shft, dRow4, dRow4Shft, #1
235    VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep ;// ninth source row, needed by output row 7
236    VADD        qSum1, qSum1, qRound                ;// hsum(1) + round
237    VADDL       qSum2, dRow2, dRow2Shft             ;// hsum(2)
238    VEXT        dRow5Shft, dRow5, dRow5Shft, #1
239    VADD        qSum0, qSum0, qSum1                 ;// hsum(0) + hsum(1) + round
240    VADDL       qSum3, dRow3, dRow3Shft             ;// hsum(3)
241    VEXT        dRow6Shft, dRow6, dRow6Shft, #1
242    VADD        qSum1, qSum1, qSum2                 ;// hsum(1) + round + hsum(2)
243    VSHRN       dRow0, qSum0, #2                    ;// output row 0: >> 2, narrowed to 8 bits
244    VADDL       qSum4, dRow4, dRow4Shft             ;// hsum(4)
245    VSHRN       dRow1, qSum1, #2                    ;// output row 1
246    VADD        qSum3, qSum3, qRound                ;// hsum(3) + round
247    VADDL       qSum5, dRow5, dRow5Shft             ;// hsum(5)
248    VST1        dRow0, [pDst@64], dstStep
249    VEXT        dRow7Shft, dRow7, dRow7Shft, #1
250    VST1        dRow1, [pDst@64], dstStep
251    VEXT        dRow8Shft, dRow8, dRow8Shft, #1
252    VADD        qSum5, qSum5, qRound                ;// hsum(5) + round
253    VADD        qSum2, qSum2, qSum3                 ;// hsum(2) + hsum(3) + round
254    VADD        qSum3, qSum3, qSum4                 ;// hsum(3) + round + hsum(4)
255    VADD        qSum4, qSum4, qSum5                 ;// hsum(4) + hsum(5) + round
256    VSHRN       dRow2, qSum2, #2                    ;// output row 2
257    VSHRN       dRow3, qSum3, #2                    ;// output row 3
258    VSHRN       dRow4, qSum4, #2                    ;// output row 4
259    VADDL       qSum6, dRow6, dRow6Shft             ;// hsum(6); overwrites Q0 (output row 0 already stored)
260    VADDL       qSum7, dRow7, dRow7Shft             ;// hsum(7); overwrites Q1 (output row 1 already stored)
261    VST1        dRow2, [pDst@64], dstStep
262    VADDL       qSum8, dRow8, dRow8Shft             ;// hsum(8); overwrites Q2, hence placed after the row 2 store
263    VADD        qSum7, qSum7, qRound                ;// hsum(7) + round
264    VST1        dRow3, [pDst@64], dstStep
265    VST1        dRow4, [pDst@64], dstStep
266    VADD        qSum5, qSum5, qSum6                 ;// hsum(5) + round + hsum(6)
267    VADD        qSum6, qSum6, qSum7                 ;// hsum(6) + hsum(7) + round
268    VADD        qSum7, qSum7, qSum8                 ;// hsum(7) + round + hsum(8)
269    VSHRN       dRow5, qSum5, #2                    ;// output row 5
270    VSHRN       dRow6, qSum6, #2                    ;// output row 6
271    VSHRN       dRow7, qSum7, #2                    ;// output row 7
272    VST1        dRow5, [pDst@64], dstStep
273    VST1        dRow6, [pDst@64], dstStep
274    VST1        dRow7, [pDst@64], dstStep
275
    ;// No branch here: this macro is expanded last, so execution falls
    ;// through to the SwitchPredictTypeEnd label that follows the expansion.
276    MEND
277;// ***************************************************************************
278
279;// Input/Output Registers
;// r0-r3 carry the first four arguments; dstStep, predictType and rndVal are
;// loaded from the stack by the function into r4-r6.
280pSrc                  RN 0
281srcStep               RN 1
282pSrcResidue           RN 2
283pDst                  RN 3
284dstStep               RN 4
285predictType           RN 5
286rndVal                RN 6
287
288;// Local Scratch Registers
;// Both names alias r0, i.e. the same register as pSrc. They are used only
;// after the interpolation has finished reading through pSrc.
289pDstCopy              RN 0
290return                RN 0
291
292;// Neon Registers
;// Source rows as unsigned bytes. dRowN and dRowNShft are consecutive D
;// registers so that one VLD1 can fill the pair; dRowNShft later holds the
;// row moved one pixel to the right.
293dRow0                 DN D0.U8
294dRow0Shft             DN D1.U8
295dRow1                 DN D2.U8
296dRow1Shft             DN D3.U8
297dRow2                 DN D4.U8
298dRow2Shft             DN D5.U8
299dRow3                 DN D6.U8
300dRow3Shft             DN D7.U8
301dRow4                 DN D8.U8
302dRow4Shft             DN D9.U8
303dRow5                 DN D10.U8
304dRow5Shft             DN D11.U8
305dRow6                 DN D12.U8
306dRow6Shft             DN D13.U8
307dRow7                 DN D14.U8
308dRow7Shft             DN D15.U8
309dRow8                 DN D16.U8
310dRow8Shft             DN D17.U8
311
312
;// 16-bit row sums used by the HalfPixelXY case. qSum0..qSum5 occupy Q9..Q14,
;// clear of the row registers. qSum6..qSum8 alias Q0..Q2, which are the same
;// registers as dRow0..dRow2Shft (D0..D5).
313qSum0                 QN Q9.U16
314qSum1                 QN Q10.U16
315qSum2                 QN Q11.U16
316qSum3                 QN Q12.U16
317qSum4                 QN Q13.U16
318qSum5                 QN Q14.U16
319qSum6                 QN Q0.U16
320qSum7                 QN Q1.U16
321qSum8                 QN Q2.U16
322
;// Rounding term, one copy per 16-bit lane (HalfPixelXY case).
323qRound                QN Q15.U16
324
;// Registers for the residue stage. dDst0..dDst7 alias D0..D7 and
;// qRes0..qRes7 alias Q4..Q11, so they overlap the dRow* and qSum* names
;// above; the residue stage runs after the interpolation stage is complete.
325dDst0                 DN D0.U8
326dDst1                 DN D1.U8
327dDst2                 DN D2.U8
328dDst3                 DN D3.U8
329dDst4                 DN D4.U8
330dDst5                 DN D5.U8
331dDst6                 DN D6.U8
332dDst7                 DN D7.U8
333
;// Residue rows, typed as signed 16-bit lanes.
334qRes0                 QN Q4.S16
335qRes1                 QN Q5.S16
336qRes2                 QN Q6.S16
337qRes3                 QN Q7.S16
338qRes4                 QN Q8.S16
339qRes5                 QN Q9.S16
340qRes6                 QN Q10.S16
341qRes7                 QN Q11.S16
342
343    ;// Function header
    ;// Two stages: (1) a switch on predictType/rndVal that writes eight
    ;// predicted rows to pDst, (2) an optional pass that adds the residue
    ;// from pSrcResidue to those rows. Always returns OMX_Sts_NoErr.
    ;// NOTE(review): M_START, M_ARG, M_LDR, M_SWITCH, M_CASE, M_ENDSWITCH and
    ;// M_END are macros defined outside this file (presumably armCOMM_s.h).
    ;// The "r6, d15" operands presumably name the highest ARM and NEON
    ;// registers the prologue has to preserve - confirm against the macro.
344    M_START     omxVCM4P2_MCReconBlock, r6, d15
345    ;// Define stack arguments
346    M_ARG       Arg_dstStep,        4
347    M_ARG       Arg_predictType,    4
348    M_ARG       Arg_rndVal,         4
349    ;// Load argument from the stack
350    M_LDR       dstStep, Arg_dstStep
351    M_LDR       predictType, Arg_predictType
352    M_LDR       rndVal, Arg_rndVal
353    ADD         predictType, rndVal, predictType, LSL #1    ;// switch index = 2*predictType + rndVal
354    RSB         rndVal, rndVal, #2              ;// rndVal = 2 - rndVal: rounding term for HalfPixelXY
355
356    ;// The following implements the switch to one of eight code segments,
357    ;// selected by the predictType and rndVal flags. The corresponding
358    ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros expanded
359    ;// after M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). "M_MCRECONBLOCK_IntegerPixel"
360    ;// and "M_MCRECONBLOCK_HalfPixelXY" each handle both rounding cases in one body;
361    ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" are expanded
362    ;// once per rounding case, giving separate code for each.
363    ;// All these together implement the interpolation functionality.
364
    ;// The case order must match the index computed above: entries alternate
    ;// Rnd0/Rnd1 within each predictType.
365    M_SWITCH    predictType
366        M_CASE      CaseIntegerPixel_Rnd0
367        M_CASE      CaseIntegerPixel_Rnd1
368        M_CASE      CaseHalfPixelX_Rnd0
369        M_CASE      CaseHalfPixelX_Rnd1
370        M_CASE      CaseHalfPixelY_Rnd0
371        M_CASE      CaseHalfPixelY_Rnd1
372        M_CASE      CaseHalfPixelXY_Rnd0
373        M_CASE      CaseHalfPixelXY_Rnd1
374    M_ENDSWITCH
375
    ;// Case bodies. Every macro except the last ends with a branch to
    ;// SwitchPredictTypeEnd; M_MCRECONBLOCK_HalfPixelXY falls through to it,
    ;// so it has to stay last.
376    M_MCRECONBLOCK_IntegerPixel
377    M_MCRECONBLOCK_HalfPixelX 0
378    M_MCRECONBLOCK_HalfPixelX 1
379    M_MCRECONBLOCK_HalfPixelY 0
380    M_MCRECONBLOCK_HalfPixelY 1
381    M_MCRECONBLOCK_HalfPixelXY
382SwitchPredictTypeEnd
383
384    ;// After interpolation is done, the residue needs to be added. This is done
385    ;// only if the "pSrcResidue" parameter of the function is not NULL.
386    ;// The code below is completely unrolled: each predicted row is read back
387    ;// from pDst, its residue row is loaded and added, and the saturated
388    ;// result is stored to the same row.
389
390    CMP         pSrcResidue, #0
391    SUBNE       pDst, pDst, dstStep, LSL #3     ;// pDst -= 8*dstStep: back to the first row written above
392    MOVNE       pDstCopy, pDst                  ;// separate store cursor in r0 (pSrc is no longer needed)
393    BEQ         pSrcResidueConditionEnd         ;// NULL residue: prediction is the final result
394pSrcResidueNotNull
    ;// pDst walks the rows for the loads, pDstCopy walks the same rows for
    ;// the stores. Each qResN load reads 16 bytes (eight 16-bit values) and
    ;// "!" advances pSrcResidue past them; "@128" states 128-bit alignment.
395    VLD1        dDst0, [pDst@64], dstStep
396    VLD1        qRes0, [pSrcResidue@128]!
397    VLD1        dDst1, [pDst@64], dstStep
398    VLD1        qRes1, [pSrcResidue@128]!
399    VLD1        dDst2, [pDst@64], dstStep
400    VLD1        qRes2, [pSrcResidue@128]!
401    VADDW       qRes0, qRes0, dDst0             ;// residue + widened 8-bit prediction, 16-bit lanes
402    VLD1        dDst3, [pDst@64], dstStep
403    VADDW       qRes1, qRes1, dDst1
404    VLD1        qRes3, [pSrcResidue@128]!
405    VADDW       qRes2, qRes2, dDst2
406    VLD1        dDst4, [pDst@64], dstStep
407    VQMOVUN     dDst0, qRes0                    ;// signed 16-bit sum saturated to unsigned 8-bit
408    VLD1        qRes4, [pSrcResidue@128]!
409    VADDW       qRes3, qRes3, dDst3
410    VLD1        dDst5, [pDst@64], dstStep
411    VQMOVUN     dDst1, qRes1
412    VLD1        qRes5, [pSrcResidue@128]!
413    VADDW       qRes4, qRes4, dDst4
414    VLD1        dDst6, [pDst@64], dstStep
415    VQMOVUN     dDst2, qRes2
416    VLD1        qRes6, [pSrcResidue@128]!
417    VADDW       qRes5, qRes5, dDst5
418    VLD1        dDst7, [pDst@64], dstStep
419    VQMOVUN     dDst3, qRes3
420    VLD1        qRes7, [pSrcResidue@128]!
421    VADDW       qRes6, qRes6, dDst6
    ;// All eight prediction rows have been read by now, so the stores below
    ;// cannot overwrite a row that is still to be loaded.
422    VST1        dDst0, [pDstCopy@64], dstStep
423    VQMOVUN     dDst4, qRes4
424    VST1        dDst1, [pDstCopy@64], dstStep
425    VADDW       qRes7, qRes7, dDst7
426    VST1        dDst2, [pDstCopy@64], dstStep
427    VQMOVUN     dDst5, qRes5
428    VST1        dDst3, [pDstCopy@64], dstStep
429    VQMOVUN     dDst6, qRes6
430    VST1        dDst4, [pDstCopy@64], dstStep
431    VQMOVUN     dDst7, qRes7
432    VST1        dDst5, [pDstCopy@64], dstStep
433    VST1        dDst6, [pDstCopy@64], dstStep
434    VST1        dDst7, [pDstCopy@64], dstStep
435
436pSrcResidueConditionEnd
437    MOV         return, #OMX_Sts_NoErr          ;// r0 = status; both paths report success
438
439    M_END
440    ENDIF ;// CortexA8
441    END
442;// ***************************************************************************
443;// omxVCM4P2_MCReconBlock ends
444;// ***************************************************************************
445