;// omxVCM4P2_MCReconBlock_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;//
;//
15
16;// Include standard headers
17    INCLUDE omxtypes_s.h
18    INCLUDE armCOMM_s.h
19
20;// Import symbols required from other files
21
22    M_VARIANTS ARM1136JS
23
24;// ***************************************************************************
25;// ARM1136JS implementation
26;// ***************************************************************************
27    IF  ARM1136JS
28
29;// ***************************************************************************
30;// MACRO DEFINITIONS
31;// ***************************************************************************
32    ;// Description:
33    ;//
34    ;//   dest[j] = (x[j] + y[j] + round) >> 1,   j=0..3
35    ;//
36    ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to
37    ;// each sum before dividing by two, if round is 1
38    ;//
39    ;// Syntax:
40    ;// M_UHADD8R   $dest, $x, $y, $round, $mask
41    ;//
42    ;// Inputs:
43    ;// $x        four packed bytes,   x[3] :  x[2]  :  x[1]  :  x[0]
44    ;// $y        four packed bytes,   y[3] :  y[2]  :  y[1]  :  y[0]
45    ;// $round    0 if no rounding to be added, 1 if rounding to be done
46    ;// $mask     some register set to 0x80808080
47    ;//
48    ;// Outputs:
49    ;// $dest     four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
50
51    MACRO
52    M_UHADD8R   $dest, $x, $y, $round, $mask
53    IF $round = 1
54        IF  $dest /= $y
55            MVN         $dest, $x
56            UHSUB8      $dest, $y, $dest
57            EOR         $dest, $dest, $mask
58        ELSE
59            MVN         $dest, $y
60            UHSUB8      $dest, $x, $dest
61            EOR         $dest, $dest, $mask
62        ENDIF
63    ELSE
64        UHADD8      $dest, $x, $y
65    ENDIF
66    MEND
67;// ***************************************************************************
68    ;// Description:
69    ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
70    ;//
71    ;// Syntax:
72    ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
73    ;//
74    ;// Inputs:
75    ;// $pSrc       4 byte aligned source pointer to an address just less than
76    ;//             or equal to the data location
77    ;// $srcStep    The stride on source
78    ;// $scratch    A scratch register, used internally for temp calculations
79    ;// $offset     Difference of source data location to the source pointer
80    ;//             Use when $offset != 0 (unaligned load)
81    ;//
82    ;// Outputs:
83    ;// $pSrc       In case the macro accepts stride, it increments the pSrc by
84    ;//             that value, else unchanged
85    ;// $out0       four packed bytes,   z[3] :  z[2]  :  z[1]  :  z[0]
86    ;// $out1       four packed bytes,   z[7] :  z[6]  :  z[5]  :  z[4]
87    ;//
88    ;// Note: {$out0, $out1, $scratch} should be registers with ascending
89    ;// register numbering. In case offset is 0, $scratch is not modified.
90
91    MACRO
92    M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
93        IF $offset = 0
94            LDM         $pSrc, {$out0, $out1}
95            ADD         $pSrc, $pSrc, $srcStep
96        ELSE
97            LDM         $pSrc, {$out0, $out1, $scratch}
98            ADD         $pSrc, $pSrc, $srcStep
99
100            MOV         $out0, $out0, LSR #8 * $offset
101            ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
102            MOV         $out1, $out1, LSR #8 * $offset
103            ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
104        ENDIF
105    MEND
106
107;// ***************************************************************************
108    ;// Description:
109    ;// Loads three words for X interpolation, update pointer to next row. For
110    ;// X interpolation, given a truncated-4byteAligned source pointer,
111    ;// invariably three continous words are required from there to get the
112    ;// nine bytes from the source pointer for filtering.
113    ;//
114    ;// Syntax:
115    ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
116    ;//
117    ;// Inputs:
118    ;// $pSrc       4 byte aligned source pointer to an address just less than
119    ;//             or equal to the data location
120    ;//
121    ;// $srcStep    The stride on source
122    ;//
123    ;// $offset     Difference of source data location to the source pointer
124    ;//             Use when $offset != 0 (unaligned load)
125    ;//
126    ;// Outputs:
127    ;// $pSrc       Incremented by $srcStep
128    ;//
129    ;// $word0, $word1, $word2, $word3
130    ;//             Three of these are outputs based on the $offset parameter.
131    ;//             The outputs are specifically generated to be processed by
132    ;//             the M_EXT_XINT macro. Following is the illustration to show
133    ;//             how the nine bytes are spanned for different offsets from
134    ;//             notTruncatedForAlignmentSourcePointer.
135    ;//
136    ;//              ------------------------------------------------------
137    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
138    ;//             |------------------------------------------------------|
139    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  |       |
140    ;//             |    1   |      -1     | x012  | 3456  | 78xx  |       |
141    ;//             |    2   |      -2     | xx01  | 2345  | 678x  |       |
142    ;//             |    3   |      -3     | xxx0  |       | 1234  | 5678  |
143    ;//              ------------------------------------------------------
144    ;//
145    ;//             where the numbering (0-8) is to designate the 9 bytes from
146    ;//             start of a particular row. The illustration doesn't take in
147    ;//             account the positioning of bytes with in the word and the
148    ;//             macro combination with M_EXT_XINT will work only in little
149    ;//             endian environs
150    ;//
151    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
152    ;// register numbering
153
154    MACRO
155    M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
156        IF $offset /= 3
157            LDM         $pSrc, {$word0, $word1, $word2}
158        ELSE
159            LDM         $pSrc, {$word0, $word2, $word3}
160        ENDIF
161        ADD         $pSrc, $pSrc, $srcStep
162    MEND
163
164;// ***************************************************************************
165    ;// Description:
166    ;// Extract four registers of four pixels for X interpolation
167    ;//
168    ;// Syntax:
169    ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
170    ;//
171    ;// Inputs:
172    ;// $offset     Difference of source data location to the source pointer
173    ;//             Use when $offset != 0 (unaligned load)
174    ;//
175    ;// $word0, $word1, $word2, $word3
176    ;//             Three of these are inputs based on the $offset parameter.
177    ;//             The inputs are specifically selected to be processed by
178    ;//             the M_EXT_XINT macro.
179    ;//
180    ;//              ------------------------------------------------------
181    ;//             | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
182    ;//             |------------------------------------------------------|
183    ;//             |    0   |       0     | 0123  | 4567  | 8xxx  | yyyy  |
184    ;//             |    1   |      -1     | x012  | 3456  | 78xx  | yyyy  |
185    ;//             |    2   |      -2     | xx01  | 2345  | 678x  | yyyy  |
186    ;//             |    3   |      -3     | xxx0  | yyyy  | 1234  | 5678  |
187    ;//              ------------------------------------------------------
188    ;//
189    ;// Outputs:
190    ;// $word0, $word1, $word2, $word3
191    ;//             Bytes from the original source pointer (not truncated for
192    ;//             4 byte alignment) as shown in the table.
193    ;//              -------------------------------
194    ;//             | word0 | word1 | word2 | word3 |
195    ;//             |-------------------------------|
196    ;//             | 0123  | 4567  | 1234  | 5678  |
197    ;//              -------------------------------
198    ;//
199    ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
200    ;// register numbering
201
202    MACRO
203    M_EXT_XINT $offset, $word0, $word1, $word2, $word3
204        IF $offset = 0
205            ; $word0 and $word1 are ok
206            ; $word2, $word3 are just 8 shifted versions
207            MOV         $word3, $word1, LSR #8
208            ORR         $word3, $word3, $word2, LSL #24
209            MOV         $word2, $word0, LSR #8
210            ORR         $word2, $word2, $word1, LSL #24
211        ELIF $offset = 3
212            ; $word2 and $word3 are ok (taken care while loading itself)
213            ; set $word0 & $word1
214            MOV         $word0, $word0, LSR #24
215            ORR         $word0, $word0, $word2, LSL #8
216            MOV         $word1, $word2, LSR #24
217            ORR         $word1, $word1, $word3, LSL #8
218        ELSE
219            MOV         $word0, $word0, LSR #8 * $offset
220            ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
221            MOV         $word1, $word1, LSR #8 * $offset
222            ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
223
224            MOV         $word3, $word1, LSR #8
225            ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
226            MOV         $word2, $word0, LSR #8
227            ORR         $word2, $word2, $word1, LSL #24
228        ENDIF
229    MEND
230
231;// ***************************************************************************
232    ;// Description:
233    ;// Computes half-sum and xor of two inputs and puts them in the input
234    ;// registers in that order
235    ;//
236    ;// Syntax:
237    ;// M_HSUM_XOR      $v0, $v1, $tmp
238    ;//
239    ;// Inputs:
240    ;// $v0         a, first input
241    ;// $v1         b, second input
242    ;// $tmp        scratch register
243    ;//
244    ;// Outputs:
245    ;// $v0         (a + b)/2
246    ;// $v1         a ^ b
247
248    MACRO
249    M_HSUM_XOR      $v0, $v1, $tmp
250        UHADD8      $tmp, $v0, $v1     ;// s0 = a + b
251        EOR         $v1, $v0, $v1      ;// l0 = a ^ b
252        MOV         $v0, $tmp          ;// s0
253    MEND
254;// ***************************************************************************
255    ;// Description:
256    ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in
257    ;// mcReconBlock module. Very specific to the implementation of
258    ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and
259    ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are
260    ;// not significant and are used by the callee for row counter (y)
261    ;//
262    ;// Some points to note are:
263    ;// 1. Input is pair of pair-averages and Xors
264    ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
265    ;//    running average
266    ;// 3. Output is in the first argument
267    ;//
268    ;// Syntax:
269    ;// M_AVG4         $sum0, $lsb0, $sum1, $lsb1, $rndVal
270    ;//
271    ;// Inputs:
272    ;// $sum0       (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
273    ;// $lsb0       (a ^ b)
274    ;// $sum1       (c + d) >> 1. Not modified
275    ;// $lsb1       (c ^ d)       Not modified
276    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
277    ;//
278    ;// Outputs:
279    ;// $sum0       (a + b + c + d + 1) / 4 : If no rounding
280    ;//             (a + b + c + d + 2) / 4 : If rounding
281
282    MACRO
283    M_AVG4          $sum0, $lsb0, $sum1, $lsb1, $rndVal
284        LCLS OP1
285        LCLS OP2
286        IF $rndVal = 0 ;// rounding case
287OP1 SETS "AND"
288OP2 SETS "ORR"
289        ELSE           ;// Not rounding case
290OP1 SETS "ORR"
291OP2 SETS "AND"
292        ENDIF
293
294        LCLS lsb2
295        LCLS sum2
296        LCLS dest
297
298lsb2  SETS "tmp"
299sum2  SETS "$lsb0"
300dest  SETS "$sum0"
301
302        $OP1        $lsb0, $lsb0, $lsb1          ;// e0 = e0 & e1
303        EOR         $lsb2, $sum0, $sum1          ;// e2 = s0 ^ s1
304        $OP2        $lsb2, $lsb2, $lsb0          ;// e2 = e2 | e0
305        AND         $lsb2, $lsb2, yMask, LSR # 4 ;// e2 = e2 & mask
306        UHADD8      $sum2, $sum0, $sum1          ;// s2 = (s0 + s1)/2
307        UADD8       $dest, $sum2, $lsb2          ;// dest =  s2 + e2
308    MEND
309;// ***************************************************************************
310;// Motion compensation handler macros
311;// ***************************************************************************
312    ;// Description:
313    ;// Implement motion compensation routines using the named registers in
314    ;// callee function. Each of the following 4 implement the 4 predict type
315    ;// Each handles 8 cases each ie all the combinations of 4 types of source
316    ;// alignment offsets and 2 types of rounding flag
317    ;//
318    ;// Syntax:
319    ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
320    ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
321    ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
322    ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
323    ;//
324    ;// Inputs:
325    ;// $rndVal     Assembler Variable. 0 for rounding, 1 for no rounding
326    ;// $offset     $pSrc MOD 4 value. Offset from 4 byte aligned location.
327    ;//
328    ;// Outputs:
329    ;// Outputs come in the named registers of the callee functions
330    ;// The macro loads the data from the source pointer, processes it and
331    ;// stores in the destination pointer. Does the whole prediction cycle
332    ;// of Motion Compensation routine for a particular predictType
333    ;// After this only residue addition to the predicted values remain
334
335    MACRO
336    M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
337    ;// Algorithmic Description:
338    ;// This handles motion compensation for IntegerPixel predictType. Both
339    ;// rounding cases are handled by the same code base. It is just a copy
340    ;// from source to destination. Two lines are done per loop to reduce
341    ;// stalls. Loop has been software pipelined as well for that purpose.
342    ;//
343    ;// M_LOAD_X loads a whole row in two registers and then they are stored
344
345CaseIntegerPixelRnd0Offset$offset
346CaseIntegerPixelRnd1Offset$offset
347    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
348    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
349YloopIntegerPixelOffset$offset
350    SUBS        y, y, #2
351    STRD        tmp1, tmp2, [pDst], dstStep
352    STRD        tmp3, tmp4, [pDst], dstStep
353    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
354    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
355    BGT         YloopIntegerPixelOffset$offset
356
357    B           SwitchPredictTypeEnd
358    MEND
359;// ***************************************************************************
360    MACRO
361    M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
362    ;// Algorithmic Description:
363    ;// This handles motion compensation for HalfPixelX predictType. The two
364    ;// rounding cases are handled by the different code base and spanned by
365    ;// different macro calls. Loop has been software pipelined to reduce
366    ;// stalls.
367    ;//
368    ;// Filtering involves averaging a pixel with the next horizontal pixel.
369    ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with
370    ;// all pixels in a row with 4 pixel in each register and another 2
371    ;// registers with pixels corresponding to one horizontally shifted pixel
372    ;// corresponding to the initial row pixels. These are set of packed
373    ;// registers appropriate to do 4 lane SIMD.
374    ;// After that M_UHADD8R macro does the averaging taking care of the
375    ;// rounding as required
376
377CaseHalfPixelXRnd$rndVal.Offset$offset
378    IF $rndVal = 0
379        LDR mask, =0x80808080
380    ENDIF
381
382    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
383YloopHalfPixelXRnd$rndVal.Offset$offset
384    SUBS        y, y, #1
385    M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
386    M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask
387    M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
388    STRD        tmp5, tmp6, [pDst], dstStep
389    M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
390    BGT         YloopHalfPixelXRnd$rndVal.Offset$offset
391
392    B           SwitchPredictTypeEnd
393    MEND
394;// ***************************************************************************
395    MACRO
396    M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
397    ;// Algorithmic Description:
398    ;// This handles motion compensation for HalfPixelY predictType. The two
399    ;// rounding cases are handled by the different code base and spanned by
400    ;// different macro calls. PreLoading is used to avoid reload of same data.
401    ;//
402    ;// Filtering involves averaging a pixel with the next vertical pixel.
403    ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in
404    ;// each register. These are set of packed registers appropriate to do
405    ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care
406    ;// of the rounding as required
407
408CaseHalfPixelYRnd$rndVal.Offset$offset
409    IF $rndVal = 0
410        LDR mask, =0x80808080
411    ENDIF
412
413    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
414YloopHalfPixelYRnd$rndVal.Offset$offset
415    SUBS        y, y, #2
416    ;// Processing one line
417    M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
418    M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
419    M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
420    STRD        tmp1, tmp2, [pDst], dstStep
421    ;// Processing another line
422    M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset
423    M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
424    M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
425    STRD        tmp3, tmp4, [pDst], dstStep
426
427    BGT         YloopHalfPixelYRnd$rndVal.Offset$offset
428
429    B           SwitchPredictTypeEnd
430    MEND
431;// ***************************************************************************
432    MACRO
433    M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
434    ;// Algorithmic Description:
435    ;// This handles motion compensation for HalfPixelXY predictType. The two
436    ;// rounding cases are handled by the different code base and spanned by
437    ;// different macro calls. PreLoading is used to avoid reload of same data.
438    ;//
439    ;// Filtering involves averaging a pixel with the next vertical, horizontal
440    ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT
441    ;// and M_EXT_XINT combination generates 4 registers with a row and its
442    ;// 1 pixel right shifted version, with 4 pixels in one register. Another
443    ;// call of that macro-combination gets another row. Then M_HSUM_XOR is
444    ;// called to get mutual half-sum and xor combinations of a row with its
445    ;// shifted version as they are inputs to the M_AVG4 macro which computes
446    ;// the 4 element average with rounding. Note that it is the half-sum/xor
447    ;// values that are preserved for next row as they can be re-used in the
448    ;// next call to the M_AVG4 and saves recomputation.
449    ;// Due to lack of register, the row counter and a masking value required
450    ;// in M_AVG4 are packed into a single register yMask where the last nibble
451    ;// holds the row counter values and rest holds the masking variable left
452    ;// shifted by 4
453
454CaseHalfPixelXYRnd$rndVal.Offset$offset
455    LDR         yMask, =((0x01010101 << 4) + 8)
456
457    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
458    M_EXT_XINT  $offset, t00, t01, t10, t11
459    M_HSUM_XOR  t00, t10, tmp               ;// s0, l0
460    M_HSUM_XOR  t01, t11, tmp               ;// s0', l0'
461
462YloopHalfPixelXYRnd$rndVal.Offset$offset
463    ;// Processsing one line
464    ;// t00, t01, t10, t11 required from previous loop
465    M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
466    SUB         yMask, yMask, #2
467    M_EXT_XINT  $offset, t20, t21, t30, t31
468    M_HSUM_XOR  t20, t30, tmp               ;// s1, l1
469    M_HSUM_XOR  t21, t31, tmp               ;// s1', l1'
470    M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
471    M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
472    STRD        t00, t01, [pDst], dstStep   ;// store the average
473
474    ;// Processsing another line
475    ;// t20, t21, t30, t31 required from above
476    M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
477    TST         yMask, #7
478    M_EXT_XINT  $offset, t00, t01, t10, t11
479    M_HSUM_XOR  t00, t10, tmp
480    M_HSUM_XOR  t01, t11, tmp
481    M_AVG4      t20, t30, t00, t10, $rndVal
482    M_AVG4      t21, t31, t01, t11, $rndVal
483    STRD        t20, t21, [pDst], dstStep
484
485    BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset
486
487    IF $offset/=3 :LOR: $rndVal/=1
488        B           SwitchPredictTypeEnd
489    ENDIF
490    MEND
491;// ***************************************************************************
492;// Motion compensation handler macros end here
493;// ***************************************************************************
494    ;// Description:
495    ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal
496    ;// combination in the "switch" to prediction processing code segment
497    ;//
498    ;// Syntax:
499    ;// M_CASE_OFFSET $rnd, $predictType
500    ;//
501    ;// Inputs:
502    ;// $rnd            0 for rounding, 1 for no rounding
503    ;// $predictType    The prediction mode
504    ;//
505    ;// Outputs:
506    ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
507
508    MACRO
509    M_CASE_OFFSET $rnd, $predictType
510        M_CASE      Case$predictType.Rnd$rnd.Offset0
511        M_CASE      Case$predictType.Rnd$rnd.Offset1
512        M_CASE      Case$predictType.Rnd$rnd.Offset2
513        M_CASE      Case$predictType.Rnd$rnd.Offset3
514    MEND
515;// ***************************************************************************
516    ;// Description:
517    ;// Populates all 2 kinds of rounding "cases" for each predictType in the
518    ;// "switch" to prediction processing code segment
519    ;//
520    ;// Syntax:
    ;// M_CASE_MCRECONBLOCK $predictType
522    ;//
523    ;// Inputs:
524    ;// $predictType    The prediction mode
525    ;//
526    ;// Outputs:
527    ;// Populated list of "M_CASE_OFFSET" macros
528
529    MACRO
530    M_CASE_MCRECONBLOCK $predictType
531        M_CASE_OFFSET  0, $predictType ;// 0 for rounding
532        M_CASE_OFFSET  1, $predictType ;// 1 for no rounding
533    MEND
534;// ***************************************************************************
535    ;// Description:
536    ;// Populates all 8 kinds of rounding and offset combinations handling macros
537    ;// for the specified predictType. In case of "IntegerPixel" predictType,
538    ;// rounding is not required so same code segment handles both cases
539    ;//
540    ;// Syntax:
541    ;// M_MCRECONBLOCK    $predictType
542    ;//
543    ;// Inputs:
544    ;// $predictType    The prediction mode
545    ;//
546    ;// Outputs:
547    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified
548    ;// predictType. Each
549    ;//                 M_MCRECONBLOCK_<predictType> $rnd, $offset
550    ;// is an code segment (starting with a label indicating the predictType,
551    ;// rounding and offset combination)
552    ;// Four calls of this macro with the 4 prediction modes populate all the 32
553    ;// handlers
554
555    MACRO
556    M_MCRECONBLOCK $predictType
557        M_MCRECONBLOCK_$predictType 0, 0
558        M_MCRECONBLOCK_$predictType 0, 1
559        M_MCRECONBLOCK_$predictType 0, 2
560        M_MCRECONBLOCK_$predictType 0, 3
561    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
562        M_MCRECONBLOCK_$predictType 1, 0
563        M_MCRECONBLOCK_$predictType 1, 1
564        M_MCRECONBLOCK_$predictType 1, 2
565        M_MCRECONBLOCK_$predictType 1, 3
566    ENDIF
567    MEND
568;// ***************************************************************************
569;// Input/Output Registers
570pSrc                  RN 0
571srcStep               RN 1
572arg_pSrcResidue       RN 2
573pSrcResidue           RN 12
574pDst                  RN 3
575dstStep               RN 2
576predictType           RN 10
577rndVal                RN 11
578mask                  RN 11
579
580;// Local Scratch Registers
581zero                  RN 12
582y                     RN 14
583
584tmp1                  RN 4
585tmp2                  RN 5
586tmp3                  RN 6
587tmp4                  RN 7
588tmp5                  RN 8
589tmp6                  RN 9
590tmp7                  RN 10
591tmp8                  RN 11
592tmp9                  RN 12
593
594t00                   RN 4
595t01                   RN 5
596t10                   RN 6
597t11                   RN 7
598t20                   RN 8
599t21                   RN 9
600t30                   RN 10
601t31                   RN 11
602tmp                   RN 12
603
604yMask                 RN 14
605
606dst                   RN 1
607return                RN 0
608
609    ;// Allocate memory on stack
610    M_ALLOC4    Stk_pDst,           4
611    M_ALLOC4    Stk_pSrcResidue,    4
612    ;// Function header
613    M_START     omxVCM4P2_MCReconBlock, r11
614    ;// Define stack arguments
615    M_ARG       Arg_dstStep,        4
616    M_ARG       Arg_predictType,    4
617    M_ARG       Arg_rndVal,         4
618    ;// Save on stack
619    M_STR       pDst, Stk_pDst
620    M_STR       arg_pSrcResidue, Stk_pSrcResidue
621    ;// Load argument from the stack
622    M_LDR       dstStep, Arg_dstStep
623    M_LDR       predictType, Arg_predictType
624    M_LDR       rndVal, Arg_rndVal
625
626    MOV         y, #8
627
628    AND         tmp1, pSrc, #3
629    ORR         predictType, tmp1, predictType, LSL #3
630    ORR         predictType, predictType, rndVal, LSL #2
631    ;// Truncating source pointer to align to 4 byte location
632    BIC         pSrc, pSrc, #3
633
634    ;// Implementation takes care of all combinations of different
635    ;// predictTypes, rounding cases and source pointer offsets to alignment
636    ;// of 4 bytes in different code bases unless one of these parameter wasn't
637    ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK
638    ;// macros branch into 8 M_CASE macros for all combinations of the 2
639    ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte
640    ;// alignment.
641    M_SWITCH    predictType
642        M_CASE_MCRECONBLOCK IntegerPixel
643        M_CASE_MCRECONBLOCK HalfPixelX
644        M_CASE_MCRECONBLOCK HalfPixelY
645        M_CASE_MCRECONBLOCK HalfPixelXY
646    M_ENDSWITCH
647
648    ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
649    ;// particular macros (4 in case of IntegerPixel as rounding makes no
650    ;// difference there) to generate the code for all cases of rounding and
651    ;// offsets. LTORG is used to segment the code as code size bloated beyond
652    ;// 4KB.
653    M_MCRECONBLOCK IntegerPixel
654    M_MCRECONBLOCK HalfPixelX
655    LTORG
656    M_MCRECONBLOCK HalfPixelY
657    M_MCRECONBLOCK HalfPixelXY
658SwitchPredictTypeEnd
659
660    ;// Residue Addition
661    ;// This is done in 2 lane SIMD though loads are further optimized and
662    ;// 4 bytes are loaded in case of destination buffer. Algorithmic
663    ;// details are in inlined comments
664    M_LDR       pSrcResidue, Stk_pSrcResidue
665    CMP         pSrcResidue, #0
666    BEQ         pSrcResidueConditionEnd
667pSrcResidueNotNull
668    M_LDR       pDst, Stk_pDst
669    MOV         y, #8
670    SUB         dstStep, dstStep, #4
671Yloop_pSrcResidueNotNull
672    SUBS        y, y, #1
673    LDR         dst, [pDst]                ;// dst = [dcba]
674    LDMIA       pSrcResidue!, {tmp1, tmp2} ;// tmp1=[DC] tmp2=[BA]
675    PKHBT       tmp3, tmp1, tmp2, LSL #16  ;// Deltaval1 = [C A]
676    PKHTB       tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
677    UXTB16      tmp1, dst                  ;// tmp1 = [0c0a]
678    UXTB16      tmp2, dst, ROR #8          ;// tmp2 = [0d0b]
679    QADD16      tmp1, tmp1, tmp3           ;// Add and saturate to 16 bits
680    QADD16      tmp2, tmp2, tmp4
681    USAT16      tmp1, #8, tmp1
682    USAT16      tmp2, #8, tmp2             ;// armClip(0, 255, tmp2)
683    ORR         tmp1, tmp1, tmp2, LSL #8   ;// tmp1 = [dcba]
684    STR         tmp1, [pDst], #4
685
686    LDR         dst, [pDst]
687    LDMIA       pSrcResidue!, {tmp1, tmp2}
688    PKHBT       tmp3, tmp1, tmp2, LSL #16
689    PKHTB       tmp4, tmp2, tmp1, ASR #16
690    UXTB16      tmp1, dst
691    UXTB16      tmp2, dst, ROR #8
692    QADD16      tmp1, tmp1, tmp3
693    QADD16      tmp2, tmp2, tmp4
694    USAT16      tmp1, #8, tmp1
695    USAT16      tmp2, #8, tmp2
696    ORR         tmp1, tmp1, tmp2, LSL #8
697    STR         tmp1, [pDst], dstStep
698
699    BGT         Yloop_pSrcResidueNotNull
700pSrcResidueConditionEnd
701
702    MOV         return, #OMX_Sts_NoErr
703
704    M_END
705    ENDIF ;// ARM1136JS
706
707;// ***************************************************************************
708;// CortexA8 implementation
709;// ***************************************************************************
710    END
711;// ***************************************************************************
712;// omxVCM4P2_MCReconBlock ends
713;// ***************************************************************************
714