10c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
20c1bc742181ded4930842b46e9507372f0b1b963James Dong;// This confidential and proprietary software may be used only as
30c1bc742181ded4930842b46e9507372f0b1b963James Dong;// authorised by a licensing agreement from ARM Limited
40c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   (C) COPYRIGHT 2004 ARM Limited
50c1bc742181ded4930842b46e9507372f0b1b963James Dong;//       ALL RIGHTS RESERVED
60c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The entire notice above must be reproduced on all authorised
70c1bc742181ded4930842b46e9507372f0b1b963James Dong;// copies and copies may only be made to the extent permitted
80c1bc742181ded4930842b46e9507372f0b1b963James Dong;// by a licensing agreement from ARM Limited.
90c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
100c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IDCT_s.s
110c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
120c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Inverse DCT module
130c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
140c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
150c1bc742181ded4930842b46e9507372f0b1b963James Dong;// ALGORITHM DESCRIPTION
160c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
170c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
180c1bc742181ded4930842b46e9507372f0b1b963James Dong;// column and then a 1D IDCT for each row.
190c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
200c1bc742181ded4930842b46e9507372f0b1b963James Dong;// The 8-point 1D IDCT is defined by
210c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
220c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
230c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
240c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   c(u,x) = cos( (2x+1)*u*pi/16 )
250c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
260c1bc742181ded4930842b46e9507372f0b1b963James Dong;// We compute the 8-point 1D IDCT using the reverse of
270c1bc742181ded4930842b46e9507372f0b1b963James Dong;// the Arai-Agui-Nakajima flow graph which we split into
280c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 5 stages named in reverse order to identify with the
290c1bc742181ded4930842b46e9507372f0b1b963James Dong;// forward DCT. Direct inversion of the forward formulae
300c1bc742181ded4930842b46e9507372f0b1b963James Dong;// in file FDCT_s.s gives:
310c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
320c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
330c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             [ A(0) = 2*sqrt(2)
340c1bc742181ded4930842b46e9507372f0b1b963James Dong;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
350c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
360c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 4:   i0 = j0             i1 = j4
370c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
380c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
390c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
400c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
410c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
420c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h2 = (i2*sqrt2)-i3  h3 = i3
430c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
440c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
450c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             [ The above two lines rotate by -(pi/8) ]
460c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
470c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
480c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
490c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
500c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g7 = h7             g6 = h6 - h7
510c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g5 = h5 - g6        g4 = h4 - g5
520c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
530c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
540c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
550c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
560c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
570c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
580c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Note that most coefficients are halved 3 times during the
590c1bc742181ded4930842b46e9507372f0b1b963James Dong;// above calculation. We can rescale the algorithm dividing
600c1bc742181ded4930842b46e9507372f0b1b963James Dong;// the input by 8 to remove the halvings.
610c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
620c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 5:   j(u) = T(u)*A(u)/8
630c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
640c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 4:   i0 = j0             i1 = j4
650c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i3 = j2 + j6        i2 = j2 - j6
660c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i7 = j5 + j3        i4 = j5 - j3
670c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             i5 = j1 + j7        i6 = j1 - j7
680c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
690c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
700c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h2 = (i2*sqrt2)-i3  h3 = i3
710c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
720c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
730c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
740c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
750c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
760c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g1 = h1 + h2        g2 = h1 - h2
770c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g7 = h7             g6 = h6 - h7
780c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             g5 = h5 - g6        g4 = h4 - g5
790c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
800c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
810c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f1 = g1 + g6        f6 = g1 - g6
820c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f2 = g2 + g5        f5 = g2 - g5
830c1bc742181ded4930842b46e9507372f0b1b963James Dong;//             f3 = g3 + g4        f4 = g3 - g4
840c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
850c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Note:
860c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 1. The scaling by A(u)/8 can often be combined with inverse
870c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    quantization. The column and row scalings can be combined.
880c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
890c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    to the above code but is otherwise identical.
900c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 3. The rotation by -pi/8 can be performed using three multiplies
910c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
920c1bc742181ded4930842b46e9507372f0b1b963James Dong;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
930c1bc742181ded4930842b46e9507372f0b1b963James Dong;// 4. If |T(u)|<=1 then from the IDCT definition,
940c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
950c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
960c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
970c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
980c1bc742181ded4930842b46e9507372f0b1b963James Dong;//            = (approx)2.64
990c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
1000c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    The table below shows input patterns generating the maximum
1010c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
1020c1bc742181ded4930842b46e9507372f0b1b963James Dong;//    InputPattern      Max |f(x)|
1030c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPPPPPPP        |f0| =  2.64
1040c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPPMMMMM        |f1| =  2.64
1050c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPMMMPPP        |f2| =  2.64
1060c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PPMMPPMM        |f3| =  2.64
1070c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMMPPMMP        |f4| =  2.64
1080c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMMPMMPM        |f5| =  2.64
1090c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMPPMPMP        |f6| =  2.64
1100c1bc742181ded4930842b46e9507372f0b1b963James Dong;//      PMPMPMPM        |f7| =  2.64
1110c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   Note that this input pattern is the transpose of the
1120c1bc742181ded4930842b46e9507372f0b1b963James Dong;//   corresponding max input pattern for the FDCT.
1130c1bc742181ded4930842b46e9507372f0b1b963James Dong
1140c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Arguments
1150c1bc742181ded4930842b46e9507372f0b1b963James Dong
1160c1bc742181ded4930842b46e9507372f0b1b963James DongpSrc    RN 0    ;// r0: source data buffer; the column pass reads it as 32-bit words, rows 16 bytes apart
1170c1bc742181ded4930842b46e9507372f0b1b963James DongStride  RN 1    ;// r1: destination stride in bytes; saved to pStride when $stride="s", r1 is then reused as ra01
1180c1bc742181ded4930842b46e9507372f0b1b963James DongpDest   RN 2    ;// r2: destination data buffer; saved to ppDest, after which r2 is pointed at pBlk
1190c1bc742181ded4930842b46e9507372f0b1b963James DongpScale  RN 3    ;// r3: pointer to the aan-scale table, laid out as selected by $inscale ("s16" or "s32")
1200c1bc742181ded4930842b46e9507372f0b1b963James Dong
1210c1bc742181ded4930842b46e9507372f0b1b963James Dong
1220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// DCT Inverse Macro
1230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// The DCT code should be parametrized according
1240c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// to the following inputs:
1250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
1260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
1270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
1280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
1290c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
1300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
1310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Inputs:
1320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pSrc   = r0 = Pointer to input data
1330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//               Range is -256 to +255 (9-bit)
1340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Stride = r1 = Stride between output (destination) lines, in bytes
1350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pDest  = r2 = Pointer to output data
1360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
1370c1bc742181ded4930842b46e9507372f0b1b963James Dong
1380c1bc742181ded4930842b46e9507372f0b1b963James Dong
1390c1bc742181ded4930842b46e9507372f0b1b963James Dong
1400c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
1410c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT  $outsize, $inscale, $stride
1420c1bc742181ded4930842b46e9507372f0b1b963James Dong        LCLA    SHIFT
1430c1bc742181ded4930842b46e9507372f0b1b963James Dong
1440c1bc742181ded4930842b46e9507372f0b1b963James Dong
1450c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ARM1136JS
1460c1bc742181ded4930842b46e9507372f0b1b963James Dong
1470c1bc742181ded4930842b46e9507372f0b1b963James Dong;// REGISTER ALLOCATION
1480c1bc742181ded4930842b46e9507372f0b1b963James Dong;// This is hard since we have 8 values, 9 free registers and each
1490c1bc742181ded4930842b46e9507372f0b1b963James Dong;// butterfly requires a temporary register. We also want to
1500c1bc742181ded4930842b46e9507372f0b1b963James Dong;// maintain register order so we can use LDM/STM. The table below
1510c1bc742181ded4930842b46e9507372f0b1b963James Dong;// summarises the register allocation that meets all these criteria.
1520c1bc742181ded4930842b46e9507372f0b1b963James Dong;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
1530c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
1540c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r1  a01     g0  h0
1550c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r4  b01 f0  g1  h1  i0
1560c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r5  a23 f1  g2      i1
1570c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r6  b23 f2  g3  h2  i2
1580c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r7  a45 f3      h3  i3
1590c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r8  b45 f4  g4  h4  i4
1600c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r9  a67 f5  g5  h5  i5
1610c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r10 b67 f6  g6  h6  i6
1620c1bc742181ded4930842b46e9507372f0b1b963James Dong;// r11     f7  g7  h7  i7
1630c1bc742181ded4930842b46e9507372f0b1b963James Dong;//
1640c1bc742181ded4930842b46e9507372f0b1b963James Dongra01    RN 1
1650c1bc742181ded4930842b46e9507372f0b1b963James Dongrb01    RN 4
1660c1bc742181ded4930842b46e9507372f0b1b963James Dongra23    RN 5
1670c1bc742181ded4930842b46e9507372f0b1b963James Dongrb23    RN 6
1680c1bc742181ded4930842b46e9507372f0b1b963James Dongra45    RN 7
1690c1bc742181ded4930842b46e9507372f0b1b963James Dongrb45    RN 8
1700c1bc742181ded4930842b46e9507372f0b1b963James Dongra67    RN 9
1710c1bc742181ded4930842b46e9507372f0b1b963James Dongrb67    RN 10
1720c1bc742181ded4930842b46e9507372f0b1b963James Dongrtmp    RN 11
1730c1bc742181ded4930842b46e9507372f0b1b963James DongcsPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
1740c1bc742181ded4930842b46e9507372f0b1b963James DongLoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
1750c1bc742181ded4930842b46e9507372f0b1b963James Dong;// Transpose allocation
1760c1bc742181ded4930842b46e9507372f0b1b963James Dongxft     RN ra01
1770c1bc742181ded4930842b46e9507372f0b1b963James Dongxf0     RN rb01
1780c1bc742181ded4930842b46e9507372f0b1b963James Dongxf1     RN ra23
1790c1bc742181ded4930842b46e9507372f0b1b963James Dongxf2     RN rb23
1800c1bc742181ded4930842b46e9507372f0b1b963James Dongxf3     RN ra45
1810c1bc742181ded4930842b46e9507372f0b1b963James Dongxf4     RN rb45
1820c1bc742181ded4930842b46e9507372f0b1b963James Dongxf5     RN ra67
1830c1bc742181ded4930842b46e9507372f0b1b963James Dongxf6     RN rb67
1840c1bc742181ded4930842b46e9507372f0b1b963James Dongxf7     RN rtmp
1850c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 1 allocation
1860c1bc742181ded4930842b46e9507372f0b1b963James Dongxg0     RN xft
1870c1bc742181ded4930842b46e9507372f0b1b963James Dongxg1     RN xf0
1880c1bc742181ded4930842b46e9507372f0b1b963James Dongxg2     RN xf1
1890c1bc742181ded4930842b46e9507372f0b1b963James Dongxg3     RN xf2
1900c1bc742181ded4930842b46e9507372f0b1b963James Dongxgt     RN xf3
1910c1bc742181ded4930842b46e9507372f0b1b963James Dongxg4     RN xf4
1920c1bc742181ded4930842b46e9507372f0b1b963James Dongxg5     RN xf5
1930c1bc742181ded4930842b46e9507372f0b1b963James Dongxg6     RN xf6
1940c1bc742181ded4930842b46e9507372f0b1b963James Dongxg7     RN xf7
1950c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 2 allocation
1960c1bc742181ded4930842b46e9507372f0b1b963James Dongxh0     RN xg0
1970c1bc742181ded4930842b46e9507372f0b1b963James Dongxh1     RN xg1
1980c1bc742181ded4930842b46e9507372f0b1b963James Dongxht     RN xg2
1990c1bc742181ded4930842b46e9507372f0b1b963James Dongxh2     RN xg3
2000c1bc742181ded4930842b46e9507372f0b1b963James Dongxh3     RN xgt
2010c1bc742181ded4930842b46e9507372f0b1b963James Dongxh4     RN xg4
2020c1bc742181ded4930842b46e9507372f0b1b963James Dongxh5     RN xg5
2030c1bc742181ded4930842b46e9507372f0b1b963James Dongxh6     RN xg6
2040c1bc742181ded4930842b46e9507372f0b1b963James Dongxh7     RN xg7
2050c1bc742181ded4930842b46e9507372f0b1b963James Dong;// IStage 3,4 allocation
2060c1bc742181ded4930842b46e9507372f0b1b963James Dongxit     RN xh0
2070c1bc742181ded4930842b46e9507372f0b1b963James Dongxi0     RN xh1
2080c1bc742181ded4930842b46e9507372f0b1b963James Dongxi1     RN xht
2090c1bc742181ded4930842b46e9507372f0b1b963James Dongxi2     RN xh2
2100c1bc742181ded4930842b46e9507372f0b1b963James Dongxi3     RN xh3
2110c1bc742181ded4930842b46e9507372f0b1b963James Dongxi4     RN xh4
2120c1bc742181ded4930842b46e9507372f0b1b963James Dongxi5     RN xh5
2130c1bc742181ded4930842b46e9507372f0b1b963James Dongxi6     RN xh6
2140c1bc742181ded4930842b46e9507372f0b1b963James Dongxi7     RN xh7
2150c1bc742181ded4930842b46e9507372f0b1b963James Dong
2160c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_STR   pDest,  ppDest
2170c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
2180c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_STR   Stride, pStride
2190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2200c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_ADR   pDest,  pBlk
2210c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     csPiBy8, =0x30fc7642
2220c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     LoopRR2, =0x00005a82
2230c1bc742181ded4930842b46e9507372f0b1b963James Dong
2240c1bc742181ded4930842b46e9507372f0b1b963James Dongv6_idct_col$_F
2250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load even values
2260c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi4, [pSrc], #4  ;// j0
2270c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi5, [pSrc, #4*16-4]  ;// j4
2280c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi6, [pSrc, #2*16-4]  ;// j2
2290c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi7, [pSrc, #6*16-4]  ;// j6
2300c1bc742181ded4930842b46e9507372f0b1b963James Dong
2310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale Even Values
2320c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16" ;// 16x16 mul
2330c1bc742181ded4930842b46e9507372f0b1b963James DongSHIFT       SETA    12
2340c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale], #4
2350c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #4*16-4]
2360c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #2*16-4]
2370c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xit, #1<<(SHIFT-1)
2380c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi3, xi0, xi4, xit
2390c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi4, xi0, xi4, xit
2400c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi0, xi1, xi5, xit
2410c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi5, xi1, xi5, xit
2420c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi3, xi3, ASR #SHIFT
2430c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
2440c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #6*16-4]
2450c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi1, xi2, xi6, xit
2460c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi6, xi2, xi6, xit
2470c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2480c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
2490c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi2, xi3, xi7, xit
2500c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi7, xi3, xi7, xit
2510c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi1, xi1, ASR #SHIFT
2520c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
2530c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2540c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
2550c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2560c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32" ;// 32x16 mul
2570c1bc742181ded4930842b46e9507372f0b1b963James DongSHIFT       SETA    (12+8-16)
2580c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xit, #1<<(SHIFT-1)
2590c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale], #8
2600c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #0*32+4-8]
2610c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #4*32-8]
2620c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #4*32+4-8]
2630c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi0, xi0, xi4, xit
2640c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi1, xi1, xi4, xit
2650c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi2, xi2, xi5, xit
2660c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi3, xi3, xi5, xit
2670c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2680c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
2690c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2700c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
2710c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi0, [pScale, #2*32-8]
2720c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi1, [pScale, #2*32+4-8]
2730c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi2, [pScale, #6*32-8]
2740c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi3, [pScale, #6*32+4-8]
2750c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi0, xi0, xi6, xit
2760c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi1, xi1, xi6, xit
2770c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi2, xi2, xi7, xit
2780c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi3, xi3, xi7, xit
2790c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi0, xi0, ASR #SHIFT
2800c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
2810c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi2, xi2, ASR #SHIFT
2820c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
2830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2840c1bc742181ded4930842b46e9507372f0b1b963James Dong
2850c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load odd values
2860c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #1*16-4]      ;// j1
2870c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #7*16-4]      ;// j7
2880c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi2, [pSrc, #5*16-4]      ;// j5
2890c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi3, [pSrc, #3*16-4]      ;// j3
2900c1bc742181ded4930842b46e9507372f0b1b963James Dong
2910c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF  {TRUE}
2920c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// shortcut if odd values 0
2930c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQ     xi0, #0
2940c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi1, #0
2950c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi2, #0
2960c1bc742181ded4930842b46e9507372f0b1b963James Dong            TEQEQ   xi3, #0
2970c1bc742181ded4930842b46e9507372f0b1b963James Dong            BEQ     v6OddZero$_F
2980c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
2990c1bc742181ded4930842b46e9507372f0b1b963James Dong
3000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store scaled even values
3010c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest, {xi4, xi5, xi6, xi7}
3020c1bc742181ded4930842b46e9507372f0b1b963James Dong
3030c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale odd values
3040c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16"
3050c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Perform AAN Scale
3060c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #1*16-4]
3070c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #7*16-4]
3080c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #5*16-4]
3090c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi7, xi0, xi4, xit
3100c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi0, xi0, xi4, xit
3110c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi4, xi1, xi5, xit
3120c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi1, xi1, xi5, xit
3130c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi7, xi7, ASR #SHIFT
3140c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
3150c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #3*16-4]
3160c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi5, xi2, xi6, xit
3170c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi2, xi2, xi6, xit
3180c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3190c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
3200c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLABB  xi6, xi3, xi7, xit
3210c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLATT  xi3, xi3, xi7, xit
3220c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi5, xi5, ASR #SHIFT
3230c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
3240c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3250c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
3260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3270c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32" ;// 32x16 mul
3280c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #1*32-8]
3290c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #1*32+4-8]
3300c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #7*32-8]
3310c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #7*32+4-8]
3320c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi4, xi4, xi0, xit
3330c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi5, xi5, xi0, xit
3340c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi6, xi6, xi1, xit
3350c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi7, xi7, xi1, xit
3360c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3370c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
3380c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3390c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
3400c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi4, [pScale, #5*32-8]
3410c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi5, [pScale, #5*32+4-8]
3420c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi6, [pScale, #3*32-8]
3430c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR     xi7, [pScale, #3*32+4-8]
3440c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi4, xi4, xi2, xit
3450c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi5, xi5, xi2, xit
3460c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWB  xi6, xi6, xi3, xit
3470c1bc742181ded4930842b46e9507372f0b1b963James Dong            SMLAWT  xi7, xi7, xi3, xit
3480c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi4, xi4, ASR #SHIFT
3490c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
3500c1bc742181ded4930842b46e9507372f0b1b963James Dong            MOV     xi6, xi6, ASR #SHIFT
3510c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
3520c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
3530c1bc742181ded4930842b46e9507372f0b1b963James Dong
3540c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
3550c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi6, xi0, xi1           ;// j1-j7
3560c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
3570c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi4, xi2, xi3           ;// j5-j3
3580c1bc742181ded4930842b46e9507372f0b1b963James Dong
3590c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
3600c1bc742181ded4930842b46e9507372f0b1b963James Dong
3610c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
3620c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
3630c1bc742181ded4930842b46e9507372f0b1b963James Dong
3640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
3650c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
3660c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
3670c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
3680c1bc742181ded4930842b46e9507372f0b1b963James Dong
3690c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi1, xi3, LoopRR2
3700c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi3, xi3, LoopRR2
3710c1bc742181ded4930842b46e9507372f0b1b963James Dong
3720c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
3730c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
3740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
3750c1bc742181ded4930842b46e9507372f0b1b963James Dong
3760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0,xi1,xi2,xi3 now free
3770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows 2to3 x1/2
3780c1bc742181ded4930842b46e9507372f0b1b963James Dong
3790c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi3, xi3, LSL #1
3800c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
3810c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
3820c1bc742181ded4930842b46e9507372f0b1b963James Dong
3830c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2, rows4to7
3840c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg6, xh6, xh7
3850c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg5, xh5, xg6
3860c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg4, xh4, xg5
3870c1bc742181ded4930842b46e9507372f0b1b963James Dong
3880c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
3890c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
3900c1bc742181ded4930842b46e9507372f0b1b963James Dong
3910c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2
3920c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2
3930c1bc742181ded4930842b46e9507372f0b1b963James Dong
3940c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
3950c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
3960c1bc742181ded4930842b46e9507372f0b1b963James Dong
3970c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
3980c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
3990c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDRD    xi0, [pDest]            ;// j0, j4 scaled
4000c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
4010c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
4020c1bc742181ded4930842b46e9507372f0b1b963James Dong
4030c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi0, xi1
4040c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi0, xi1
4050c1bc742181ded4930842b46e9507372f0b1b963James Dong
4060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
4070c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
4080c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
4090c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
4100c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
4110c1bc742181ded4930842b46e9507372f0b1b963James Dong
4120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
4130c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf3, xg3, xg4
4140c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf4, xg3, xg4
4150c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf2, xg2, xg5
4160c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf5, xg2, xg5
4170c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf1, xg1, xg6
4180c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf6, xg1, xg6
4190c1bc742181ded4930842b46e9507372f0b1b963James Dong        SADD16  xf0, xg0, xg7
4200c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xf7, xg0, xg7
4210c1bc742181ded4930842b46e9507372f0b1b963James Dong
4220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose, store and loop
4230c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra01, xf0, xf1, LSL #16
4240c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb01, xf1, xf0, ASR #16
4250c1bc742181ded4930842b46e9507372f0b1b963James Dong
4260c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra23, xf2, xf3, LSL #16
4270c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb23, xf3, xf2, ASR #16
4280c1bc742181ded4930842b46e9507372f0b1b963James Dong
4290c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra45, xf4, xf5, LSL #16
4300c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb45, xf5, xf4, ASR #16
4310c1bc742181ded4930842b46e9507372f0b1b963James Dong
4320c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra67, xf6, xf7, LSL #16
4330c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {ra01, ra23, ra45, ra67}
4340c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb67, xf7, xf6, ASR #16
4350c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {rb01, rb23, rb45, rb67}
4360c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_col$_F
4370c1bc742181ded4930842b46e9507372f0b1b963James Dong
4380c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB     pSrc, pDest, #(64*2)
4390c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   pDest, ppDest
4400c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
4410c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_LDR   pScale, pStride
4420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
4430c1bc742181ded4930842b46e9507372f0b1b963James Dong        B       v6_idct_row$_F
4440c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// ---------------------------------------------------------------------
;// v6OddZero$_F : shortened column pass.
;//
;// Runs the same even-half arithmetic as the general column path
;// (v6_idct_col$_F) and then copies xg0..xg3 straight into the outputs
;// instead of doing the SADD16/SSUB16 butterflies.  That matches the
;// general path when xg4..xg7 are zero.
;// NOTE(review): presumably reached only when the odd inputs of both
;// lanes are zero - the branch to this label is not in this part of the
;// file; confirm against the caller.
;//
;// Inputs as used here (they are loaded before this label):
;//   xi4, xi5  take the place of j0, j4 in the general path
;//   xi6, xi7  j2, j6
;//   LoopRR2   low halfword = multiplier, top bits = loop counter
;// Every data register holds two 16-bit lanes (a = low, b = high).
;// ---------------------------------------------------------------------
4450c1bc742181ded4930842b46e9507372f0b1b963James Dongv6OddZero$_F
4460c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
4470c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
4480c1bc742181ded4930842b46e9507372f0b1b963James Dong
4490c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2       ;// lane a of (j2-j6) times low16(LoopRR2)
4500c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2       ;// lane b of (j2-j6) times low16(LoopRR2)
4510c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Repack both 32-bit products as (product >> 15) halfwords:
        ;// lane b is pre-shifted left by 1 so its top halfword is bits [30:15],
        ;// lane a is shifted right by 15 inside the PKHTB.
4520c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
4530c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
4540c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
4550c1bc742181ded4930842b46e9507372f0b1b963James Dong
4560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
4570c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
4580c1bc742181ded4930842b46e9507372f0b1b963James Dong
4590c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi4, xi5
4600c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi4, xi5
4610c1bc742181ded4930842b46e9507372f0b1b963James Dong
4620c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
        ;// xh3 is not written under that name in this path; it is presumably
        ;// the register written as xi3 above (alias defined elsewhere) - confirm.
4630c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
4640c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
4650c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
4660c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
4670c1bc742181ded4930842b46e9507372f0b1b963James Dong
4680c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// Copies replace the add/sub pairs of the general path: each output
        ;// pair (f3,f4), (f2,f5), (f1,f6), (f0,f7) receives the same value.
        ;// The order of the MOVs is kept as written because the xf/xg names
        ;// may share registers.
4690c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf3, xg3
4700c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf4, xg3
4710c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf2, xg2
4720c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf5, xg2
4730c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf1, xg1
4740c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf6, xg1
4750c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf0, xg0
4760c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV  xf7, xg0
4770c1bc742181ded4930842b46e9507372f0b1b963James Dong
4780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose
        ;// raNM = low halfwords (lane a) of xfN and xfM, xfN in the bottom;
        ;// rbNM = high halfwords (lane b) of the same pair.
4790c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra01, xf0, xf1, LSL #16
4800c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb01, xf1, xf0, ASR #16
4810c1bc742181ded4930842b46e9507372f0b1b963James Dong
4820c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra23, xf2, xf3, LSL #16
4830c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb23, xf3, xf2, ASR #16
4840c1bc742181ded4930842b46e9507372f0b1b963James Dong
4850c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra45, xf4, xf5, LSL #16
4860c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb45, xf5, xf4, ASR #16
4870c1bc742181ded4930842b46e9507372f0b1b963James Dong
4880c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   ra67, xf6, xf7, LSL #16
4890c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   rb67, xf7, xf6, ASR #16
4900c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Store lane a then lane b, 16 bytes each (pDest advances 32 bytes).
        ;// The ADDS sits between the stores; STMIA does not change the flags,
        ;// so the carry it produces is still valid at the BCC below.
4910c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {ra01, ra23, ra45, ra67}
4920c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
4930c1bc742181ded4930842b46e9507372f0b1b963James Dong        STMIA   pDest!, {rb01, rb23, rb45, rb67}
4940c1bc742181ded4930842b46e9507372f0b1b963James Dong
4950c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_col$_F          ;// no carry out of the counter yet: next pair
        ;// Column pass finished: point pSrc 128 bytes (64 halfwords) below the
        ;// final pDest, reload the output pointer (and the stride when the
        ;// macro was instantiated with stride "s"), then fall through into the
        ;// row pass.  M_LDR is a macro defined elsewhere; presumably it loads
        ;// the named local - confirm.
4960c1bc742181ded4930842b46e9507372f0b1b963James Dong        SUB     pSrc, pDest, #(64*2)
4970c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_LDR   pDest, ppDest
4980c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
4990c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_LDR   pScale, pStride
5000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
5010c1bc742181ded4930842b46e9507372f0b1b963James Dong
5020c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// ---------------------------------------------------------------------
;// v6_idct_row$_F : second 1D pass over the intermediate buffer.
;//
;// Each iteration reads one 32-bit word (two 16-bit lanes, a = low,
;// b = high) from each of the eight 16-byte rows at pSrc, runs the 1D
;// IDCT on both lanes at once, saturates and packs the result according
;// to $outsize and stores two output lines at pDest.  pDest steps by
;// pScale when $stride is "s", otherwise by the constant #($stride).
;// pSrc advances by 4 bytes per iteration.  The loop counter lives in the
;// top bits of LoopRR2; the carry from its ADDS terminates the loop, so
;// nothing between that ADDS and the final BCC may change the C flag.
;// Per the load comments, row 7 is held scaled by 4 and row 6 by 2 and
;// they are brought back to scale here with rounding halving adds.
;// ---------------------------------------------------------------------
5030c1bc742181ded4930842b46e9507372f0b1b963James Dongv6_idct_row$_F
5040c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows4to7 x1/4
5050c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xit, =0x00010001        ;// rounding constant
5060c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #1*16]      ;// j1
5070c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
5080c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi2, [pSrc, #5*16]      ;// j5
5090c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi3, [pSrc, #3*16]      ;// j3
5100c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// SHADD16 with 0x00010001 gives (x+1)>>1 in each lane: a rounded halve.
5110c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// 2*j7
5120c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// j7
5130c1bc742181ded4930842b46e9507372f0b1b963James Dong
5140c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
5150c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi6, xi0, xi1           ;// j1-j7
5160c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
5170c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi4, xi2, xi3           ;// j5-j3
5180c1bc742181ded4930842b46e9507372f0b1b963James Dong
5190c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
5200c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Regroup by lane so each register holds the (i6,i4) pair of one lane,
        ;// ready for the dual 16x16 multiplies against csPiBy8.
5210c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
5220c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
5230c1bc742181ded4930842b46e9507372f0b1b963James Dong
5240c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
5250c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
5260c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
5270c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
5280c1bc742181ded4930842b46e9507372f0b1b963James Dong
5290c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi1, xi3, LoopRR2       ;// lane a times low16(LoopRR2)
5300c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi3, xi3, LoopRR2       ;// lane b times low16(LoopRR2)
5310c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Keep the top halfword of each 32-bit rotation result, one lane each.
5320c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
5330c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
5340c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
5350c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Repack both products as (product >> 15) halfwords: lane b is shifted
        ;// left by 1 so its top halfword is bits [30:15]; lane a is shifted
        ;// right by 15 inside the PKHTB.
5360c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi3, xi3, LSL #1
5370c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
5380c1bc742181ded4930842b46e9507372f0b1b963James Dong
5390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0,xi1,xi2,xi3 now free
5400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3, rows 2to3 x1/2
5410c1bc742181ded4930842b46e9507372f0b1b963James Dong
5420c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc, #2*16]      ;// j2
5430c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
5440c1bc742181ded4930842b46e9507372f0b1b963James Dong
5450c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2, rows4to7
5460c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg6, xh6, xh7
5470c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg5, xh5, xg6
5480c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xg4, xh4, xg5
5490c1bc742181ded4930842b46e9507372f0b1b963James Dong
5500c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi1, xi1, xit           ;// j6
5510c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
5520c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
5530c1bc742181ded4930842b46e9507372f0b1b963James Dong
5540c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULBB  xi0, xi2, LoopRR2
5550c1bc742181ded4930842b46e9507372f0b1b963James Dong        SMULTB  xi2, xi2, LoopRR2
5560c1bc742181ded4930842b46e9507372f0b1b963James Dong
5570c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV     xi2, xi2, LSL #1
5580c1bc742181ded4930842b46e9507372f0b1b963James Dong
5590c1bc742181ded4930842b46e9507372f0b1b963James Dong        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
5600c1bc742181ded4930842b46e9507372f0b1b963James Dong
5610c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// xi0, xi1 now free
5620c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
5630c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi1, [pSrc, #4*16]      ;// j4
5640c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR     xi0, [pSrc], #4         ;// j0
5650c1bc742181ded4930842b46e9507372f0b1b963James Dong
5660c1bc742181ded4930842b46e9507372f0b1b963James Dong        SSUB16  xh2, xh2, xi3
5670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
5680c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// Add the DC rounding bias (4) to each halfword of j0.  The add must
        ;// be per halfword: a 32-bit ADD of 0x00040004 lets a carry out of the
        ;// low halfword (low j0 in -4..-1) leak into the high halfword, which
        ;// then receives +5 instead of +4.
        ;// xit is not read again before it is reloaded at the loop top, so it
        ;// is reused to hold the shifted constant.  MOV (no S) leaves the flags
        ;// alone and SADD16 only updates the GE bits, so the carry from the
        ;// ADDS above still reaches the BCC at the end of the loop.
        ;// NOTE(review): xit is left as 0x00040004 on loop exit; nothing in
        ;// view reads it after the loop - confirm for the code that follows.
        MOV     xit, xit, LSL #2        ;// 0x00040004
        SADD16  xi0, xi0, xit           ;// ensure correct round
5700c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xh0, xi0, xi1           ;// of DC result
5710c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xh1, xi0, xi1
5720c1bc742181ded4930842b46e9507372f0b1b963James Dong
5730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
5740c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg2, xh1, xh2
5750c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg1, xh1, xh2
5760c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xg3, xh0, xh3
5770c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xg0, xh0, xh3
5780c1bc742181ded4930842b46e9507372f0b1b963James Dong
5790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
        ;// Halving butterflies here (the column pass uses SADD16/SSUB16).
5800c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf3, xg3, xg4
5810c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf4, xg3, xg4
5820c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf2, xg2, xg5
5830c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf5, xg2, xg5
5840c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf1, xg1, xg6
5850c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf6, xg1, xg6
5860c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHADD16 xf0, xg0, xg7
5870c1bc742181ded4930842b46e9507372f0b1b963James Dong        SHSUB16 xf7, xg0, xg7
5880c1bc742181ded4930842b46e9507372f0b1b963James Dong
5890c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Saturate
        ;// u8: clamp each halfword to 0..255.  s9: clamp each halfword to
        ;// -256..255.  Any other $outsize (s16 below) is stored unclamped.
5900c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
5910c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf0, #8, xf0
5920c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf1, #8, xf1
5930c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf2, #8, xf2
5940c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf3, #8, xf3
5950c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf4, #8, xf4
5960c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf5, #8, xf5
5970c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf6, #8, xf6
5980c1bc742181ded4930842b46e9507372f0b1b963James Dong            USAT16  xf7, #8, xf7
5990c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6000c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9")
6010c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf0, #9, xf0
6020c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf1, #9, xf1
6030c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf2, #9, xf2
6040c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf3, #9, xf3
6050c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf4, #9, xf4
6060c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf5, #9, xf5
6070c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf6, #9, xf6
6080c1bc742181ded4930842b46e9507372f0b1b963James Dong            SSAT16  xf7, #9, xf7
6090c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6100c1bc742181ded4930842b46e9507372f0b1b963James Dong
6110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose to Row, Pack and store
6120c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
            ;// After USAT16 the upper byte of every halfword is zero, so the
            ;// shifted ORR interleaves two results per lane without overlap.
            ;// The PKH pairs then gather lane a into ra* and lane b into rb*;
            ;// each output line is 8 bytes.
6130c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
6140c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
6150c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
6160c1bc742181ded4930842b46e9507372f0b1b963James Dong            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
6170c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra01, xf0, xf2, LSL #16
6180c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb01, xf2, xf0, ASR #16
6190c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra23, xf4, xf6, LSL #16
6200c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb23, xf6, xf4, ASR #16
6210c1bc742181ded4930842b46e9507372f0b1b963James Dong            STMIA   pDest, {ra01, ra23}
6220c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF "$stride"="s"
6230c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6240c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23}
6250c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6260c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
6270c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6280c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23}
6290c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6300c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
6310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6320c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
            ;// 16-bit output: raNM takes the low halfwords (lane a) of xfN and
            ;// xfM, rbNM the high halfwords (lane b); each line is 16 bytes.
6330c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra01, xf0, xf1, LSL #16
6340c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb01, xf1, xf0, ASR #16
6350c1bc742181ded4930842b46e9507372f0b1b963James Dong
6360c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra23, xf2, xf3, LSL #16
6370c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb23, xf3, xf2, ASR #16
6380c1bc742181ded4930842b46e9507372f0b1b963James Dong
6390c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra45, xf4, xf5, LSL #16
6400c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb45, xf5, xf4, ASR #16
6410c1bc742181ded4930842b46e9507372f0b1b963James Dong
6420c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHBT   ra67, xf6, xf7, LSL #16
6430c1bc742181ded4930842b46e9507372f0b1b963James Dong            PKHTB   rb67, xf7, xf6, ASR #16
6440c1bc742181ded4930842b46e9507372f0b1b963James Dong
6450c1bc742181ded4930842b46e9507372f0b1b963James Dong            STMIA   pDest, {ra01, ra23, ra45, ra67}
6460c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF "$stride"="s"
6470c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6480c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23, rb45, rb67}
6490c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, pScale
6500c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
6510c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6520c1bc742181ded4930842b46e9507372f0b1b963James Dong                STMIA   pDest, {rb01, rb23, rb45, rb67}
6530c1bc742181ded4930842b46e9507372f0b1b963James Dong                ADD     pDest, pDest, #($stride)
6540c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
6550c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
6560c1bc742181ded4930842b46e9507372f0b1b963James Dong
        ;// C was set by the ADDS on LoopRR2 above; loop while it is clear.
6570c1bc742181ded4930842b46e9507372f0b1b963James Dong        BCC     v6_idct_row$_F
6580c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF ;// ARM1136JS
6590c1bc742181ded4930842b46e9507372f0b1b963James Dong
6600c1bc742181ded4930842b46e9507372f0b1b963James Dong
6610c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF CortexA8
6620c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// NEON register allocation for the j-stage values.
;// Src0..Src7 and Tmp are plain register numbers (7..15).  qXjN names Q
;// register number SrcN viewed as signed 16-bit lanes.  dXjNlo / dXjNhi
;// name D registers 2*SrcN and 2*SrcN+1, which are the low and high
;// halves of that same Q register.  The "t" names are built on Tmp.
6630c1bc742181ded4930842b46e9507372f0b1b963James DongSrc0            EQU  7
6640c1bc742181ded4930842b46e9507372f0b1b963James DongSrc1            EQU  8
6650c1bc742181ded4930842b46e9507372f0b1b963James DongSrc2            EQU  9
6660c1bc742181ded4930842b46e9507372f0b1b963James DongSrc3            EQU  10
6670c1bc742181ded4930842b46e9507372f0b1b963James DongSrc4            EQU  11
6680c1bc742181ded4930842b46e9507372f0b1b963James DongSrc5            EQU  12
6690c1bc742181ded4930842b46e9507372f0b1b963James DongSrc6            EQU  13
6700c1bc742181ded4930842b46e9507372f0b1b963James DongSrc7            EQU  14
6710c1bc742181ded4930842b46e9507372f0b1b963James DongTmp             EQU  15
6720c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// Whole Q registers, signed 16-bit lanes.
6730c1bc742181ded4930842b46e9507372f0b1b963James DongqXj0            QN Src0.S16
6740c1bc742181ded4930842b46e9507372f0b1b963James DongqXj1            QN Src1.S16
6750c1bc742181ded4930842b46e9507372f0b1b963James DongqXj2            QN Src2.S16
6760c1bc742181ded4930842b46e9507372f0b1b963James DongqXj3            QN Src3.S16
6770c1bc742181ded4930842b46e9507372f0b1b963James DongqXj4            QN Src4.S16
6780c1bc742181ded4930842b46e9507372f0b1b963James DongqXj5            QN Src5.S16
6790c1bc742181ded4930842b46e9507372f0b1b963James DongqXj6            QN Src6.S16
6800c1bc742181ded4930842b46e9507372f0b1b963James DongqXj7            QN Src7.S16
6810c1bc742181ded4930842b46e9507372f0b1b963James DongqXjt            QN Tmp.S16
6820c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// D-register halves of the Q registers above (lo = D(2n), hi = D(2n+1)).
6830c1bc742181ded4930842b46e9507372f0b1b963James DongdXj0lo          DN (Src0*2).S16
6840c1bc742181ded4930842b46e9507372f0b1b963James DongdXj0hi          DN (Src0*2+1).S16
6850c1bc742181ded4930842b46e9507372f0b1b963James DongdXj1lo          DN (Src1*2).S16
6860c1bc742181ded4930842b46e9507372f0b1b963James DongdXj1hi          DN (Src1*2+1).S16
6870c1bc742181ded4930842b46e9507372f0b1b963James DongdXj2lo          DN (Src2*2).S16
6880c1bc742181ded4930842b46e9507372f0b1b963James DongdXj2hi          DN (Src2*2+1).S16
6890c1bc742181ded4930842b46e9507372f0b1b963James DongdXj3lo          DN (Src3*2).S16
6900c1bc742181ded4930842b46e9507372f0b1b963James DongdXj3hi          DN (Src3*2+1).S16
6910c1bc742181ded4930842b46e9507372f0b1b963James DongdXj4lo          DN (Src4*2).S16
6920c1bc742181ded4930842b46e9507372f0b1b963James DongdXj4hi          DN (Src4*2+1).S16
6930c1bc742181ded4930842b46e9507372f0b1b963James DongdXj5lo          DN (Src5*2).S16
6940c1bc742181ded4930842b46e9507372f0b1b963James DongdXj5hi          DN (Src5*2+1).S16
6950c1bc742181ded4930842b46e9507372f0b1b963James DongdXj6lo          DN (Src6*2).S16
6960c1bc742181ded4930842b46e9507372f0b1b963James DongdXj6hi          DN (Src6*2+1).S16
6970c1bc742181ded4930842b46e9507372f0b1b963James DongdXj7lo          DN (Src7*2).S16
6980c1bc742181ded4930842b46e9507372f0b1b963James DongdXj7hi          DN (Src7*2+1).S16
6990c1bc742181ded4930842b46e9507372f0b1b963James DongdXjtlo          DN (Tmp*2).S16
7000c1bc742181ded4930842b46e9507372f0b1b963James DongdXjthi          DN (Tmp*2+1).S16
7010c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// i-stage names.  No new registers are allocated: every qXi* / dXi* is a
;// second name for an existing j-stage register, paired as follows:
;//   i0=j0  i1=j4  i2=j2  i3=j7  i4=j5  i5=jt  i6=j1  i7=j6  it=j3
;// Writing an i-stage name therefore overwrites the j-stage value that
;// shares its register.
7020c1bc742181ded4930842b46e9507372f0b1b963James DongqXi0            QN qXj0
7030c1bc742181ded4930842b46e9507372f0b1b963James DongqXi1            QN qXj4
7040c1bc742181ded4930842b46e9507372f0b1b963James DongqXi2            QN qXj2
7050c1bc742181ded4930842b46e9507372f0b1b963James DongqXi3            QN qXj7
7060c1bc742181ded4930842b46e9507372f0b1b963James DongqXi4            QN qXj5
7070c1bc742181ded4930842b46e9507372f0b1b963James DongqXi5            QN qXjt
7080c1bc742181ded4930842b46e9507372f0b1b963James DongqXi6            QN qXj1
7090c1bc742181ded4930842b46e9507372f0b1b963James DongqXi7            QN qXj6
7100c1bc742181ded4930842b46e9507372f0b1b963James DongqXit            QN qXj3
7110c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// D-register halves, following the same pairing as the Q names above.
7120c1bc742181ded4930842b46e9507372f0b1b963James DongdXi0lo          DN dXj0lo
7130c1bc742181ded4930842b46e9507372f0b1b963James DongdXi0hi          DN dXj0hi
7140c1bc742181ded4930842b46e9507372f0b1b963James DongdXi1lo          DN dXj4lo
7150c1bc742181ded4930842b46e9507372f0b1b963James DongdXi1hi          DN dXj4hi
7160c1bc742181ded4930842b46e9507372f0b1b963James DongdXi2lo          DN dXj2lo
7170c1bc742181ded4930842b46e9507372f0b1b963James DongdXi2hi          DN dXj2hi
7180c1bc742181ded4930842b46e9507372f0b1b963James DongdXi3lo          DN dXj7lo
7190c1bc742181ded4930842b46e9507372f0b1b963James DongdXi3hi          DN dXj7hi
7200c1bc742181ded4930842b46e9507372f0b1b963James DongdXi4lo          DN dXj5lo
7210c1bc742181ded4930842b46e9507372f0b1b963James DongdXi4hi          DN dXj5hi
7220c1bc742181ded4930842b46e9507372f0b1b963James DongdXi5lo          DN dXjtlo
7230c1bc742181ded4930842b46e9507372f0b1b963James DongdXi5hi          DN dXjthi
7240c1bc742181ded4930842b46e9507372f0b1b963James DongdXi6lo          DN dXj1lo
7250c1bc742181ded4930842b46e9507372f0b1b963James DongdXi6hi          DN dXj1hi
7260c1bc742181ded4930842b46e9507372f0b1b963James DongdXi7lo          DN dXj6lo
7270c1bc742181ded4930842b46e9507372f0b1b963James DongdXi7hi          DN dXj6hi
7280c1bc742181ded4930842b46e9507372f0b1b963James DongdXitlo          DN dXj3lo
7290c1bc742181ded4930842b46e9507372f0b1b963James DongdXithi          DN dXj3hi
7300c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// h-stage names.  Each qXh* / dXh* is another name for an i-stage
;// register, paired as follows:
;//   h0=it  h1=i0  h2=i2  h3=i3  h4=i7  h5=i5  h6=i4  h7=i1  ht=i6
;// Writing an h-stage name overwrites the i-stage value in that register.
7310c1bc742181ded4930842b46e9507372f0b1b963James DongqXh0            QN qXit
7320c1bc742181ded4930842b46e9507372f0b1b963James DongqXh1            QN qXi0
7330c1bc742181ded4930842b46e9507372f0b1b963James DongqXh2            QN qXi2
7340c1bc742181ded4930842b46e9507372f0b1b963James DongqXh3            QN qXi3
7350c1bc742181ded4930842b46e9507372f0b1b963James DongqXh4            QN qXi7
7360c1bc742181ded4930842b46e9507372f0b1b963James DongqXh5            QN qXi5
7370c1bc742181ded4930842b46e9507372f0b1b963James DongqXh6            QN qXi4
7380c1bc742181ded4930842b46e9507372f0b1b963James DongqXh7            QN qXi1
7390c1bc742181ded4930842b46e9507372f0b1b963James DongqXht            QN qXi6
7400c1bc742181ded4930842b46e9507372f0b1b963James Dong
;// D-register halves, following the same pairing as the Q names above.
7410c1bc742181ded4930842b46e9507372f0b1b963James DongdXh0lo          DN dXitlo
7420c1bc742181ded4930842b46e9507372f0b1b963James DongdXh0hi          DN dXithi
7430c1bc742181ded4930842b46e9507372f0b1b963James DongdXh1lo          DN dXi0lo
7440c1bc742181ded4930842b46e9507372f0b1b963James DongdXh1hi          DN dXi0hi
7450c1bc742181ded4930842b46e9507372f0b1b963James DongdXh2lo          DN dXi2lo
7460c1bc742181ded4930842b46e9507372f0b1b963James DongdXh2hi          DN dXi2hi
7470c1bc742181ded4930842b46e9507372f0b1b963James DongdXh3lo          DN dXi3lo
7480c1bc742181ded4930842b46e9507372f0b1b963James DongdXh3hi          DN dXi3hi
7490c1bc742181ded4930842b46e9507372f0b1b963James DongdXh4lo          DN dXi7lo
7500c1bc742181ded4930842b46e9507372f0b1b963James DongdXh4hi          DN dXi7hi
7510c1bc742181ded4930842b46e9507372f0b1b963James DongdXh5lo          DN dXi5lo
7520c1bc742181ded4930842b46e9507372f0b1b963James DongdXh5hi          DN dXi5hi
7530c1bc742181ded4930842b46e9507372f0b1b963James DongdXh6lo          DN dXi4lo
7540c1bc742181ded4930842b46e9507372f0b1b963James DongdXh6hi          DN dXi4hi
7550c1bc742181ded4930842b46e9507372f0b1b963James DongdXh7lo          DN dXi1lo
7560c1bc742181ded4930842b46e9507372f0b1b963James DongdXh7hi          DN dXi1hi
7570c1bc742181ded4930842b46e9507372f0b1b963James DongdXhtlo          DN dXi6lo
7580c1bc742181ded4930842b46e9507372f0b1b963James DongdXhthi          DN dXi6hi
7590c1bc742181ded4930842b46e9507372f0b1b963James Dong
7600c1bc742181ded4930842b46e9507372f0b1b963James DongqXg0            QN qXh2
7610c1bc742181ded4930842b46e9507372f0b1b963James DongqXg1            QN qXht
7620c1bc742181ded4930842b46e9507372f0b1b963James DongqXg2            QN qXh1
7630c1bc742181ded4930842b46e9507372f0b1b963James DongqXg3            QN qXh0
7640c1bc742181ded4930842b46e9507372f0b1b963James DongqXg4            QN qXh4
7650c1bc742181ded4930842b46e9507372f0b1b963James DongqXg5            QN qXh5
7660c1bc742181ded4930842b46e9507372f0b1b963James DongqXg6            QN qXh6
7670c1bc742181ded4930842b46e9507372f0b1b963James DongqXg7            QN qXh7
7680c1bc742181ded4930842b46e9507372f0b1b963James DongqXgt            QN qXh3
7690c1bc742181ded4930842b46e9507372f0b1b963James Dong
7700c1bc742181ded4930842b46e9507372f0b1b963James DongqXf0            QN qXg6
7710c1bc742181ded4930842b46e9507372f0b1b963James DongqXf1            QN qXg5
7720c1bc742181ded4930842b46e9507372f0b1b963James DongqXf2            QN qXg4
7730c1bc742181ded4930842b46e9507372f0b1b963James DongqXf3            QN qXgt
7740c1bc742181ded4930842b46e9507372f0b1b963James DongqXf4            QN qXg3
7750c1bc742181ded4930842b46e9507372f0b1b963James DongqXf5            QN qXg2
7760c1bc742181ded4930842b46e9507372f0b1b963James DongqXf6            QN qXg1
7770c1bc742181ded4930842b46e9507372f0b1b963James DongqXf7            QN qXg0
7780c1bc742181ded4930842b46e9507372f0b1b963James DongqXft            QN qXg7
7790c1bc742181ded4930842b46e9507372f0b1b963James Dong
7800c1bc742181ded4930842b46e9507372f0b1b963James Dong
7810c1bc742181ded4930842b46e9507372f0b1b963James DongqXt0            QN 1.S32        ;// Q1/Q2 as 32-bit lanes: accumulators for the widening multiplies
7820c1bc742181ded4930842b46e9507372f0b1b963James DongqXt1            QN 2.S32
7830c1bc742181ded4930842b46e9507372f0b1b963James DongqT0lo           QN 1.S32        ;// qT0lo/qT0hi name the same Q1/Q2 as qXt0/qXt1
7840c1bc742181ded4930842b46e9507372f0b1b963James DongqT0hi           QN 2.S32
7850c1bc742181ded4930842b46e9507372f0b1b963James DongqT1lo           QN 3.S32
7860c1bc742181ded4930842b46e9507372f0b1b963James DongqT1hi           QN 4.S32
7870c1bc742181ded4930842b46e9507372f0b1b963James DongqScalelo        QN 5.S32        ;// used to read post scale values
7880c1bc742181ded4930842b46e9507372f0b1b963James DongqScalehi        QN 6.S32
7890c1bc742181ded4930842b46e9507372f0b1b963James DongqTemp0          QN 5.S32        ;// qTemp0/qTemp1 name the same Q5/Q6 as qScalelo/qScalehi
7900c1bc742181ded4930842b46e9507372f0b1b963James DongqTemp1          QN 6.S32
7910c1bc742181ded4930842b46e9507372f0b1b963James Dong
7920c1bc742181ded4930842b46e9507372f0b1b963James Dong
7930c1bc742181ded4930842b46e9507372f0b1b963James DongScale1          EQU 6           ;// Q register number behind qScale1
7940c1bc742181ded4930842b46e9507372f0b1b963James DongScale2          EQU 15          ;// Q register number behind qScale2
7950c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1         QN Scale1.S16   ;// Q6 as 16-bit lanes (same register as qScalehi/qTemp1)
7960c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2         QN Scale2.S16   ;// Q15 as 16-bit lanes
7970c1bc742181ded4930842b46e9507372f0b1b963James DongdScale1lo       DN (Scale1*2).S16       ;// D12: low half of Q6
7980c1bc742181ded4930842b46e9507372f0b1b963James DongdScale1hi       DN (Scale1*2+1).S16     ;// D13: high half of Q6
7990c1bc742181ded4930842b46e9507372f0b1b963James DongdScale2lo       DN (Scale2*2).S16       ;// D30: low half of Q15
8000c1bc742181ded4930842b46e9507372f0b1b963James DongdScale2hi       DN (Scale2*2+1).S16     ;// D31: high half of Q15
8010c1bc742181ded4930842b46e9507372f0b1b963James Dong
8020c1bc742181ded4930842b46e9507372f0b1b963James DongdCoefs          DN 0.S16        ;// Scale coefficients in D0, format {[0] [C] [S] [InvSqrt2]} (lane 3 down to lane 0)
8030c1bc742181ded4930842b46e9507372f0b1b963James DongInvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
8040c1bc742181ded4930842b46e9507372f0b1b963James DongS               DN dCoefs[1]    ;// Sin(PI/8) in Q15
8050c1bc742181ded4930842b46e9507372f0b1b963James DongC               DN dCoefs[2]    ;// Cos(PI/8) in Q15
8060c1bc742181ded4930842b46e9507372f0b1b963James Dong
8070c1bc742181ded4930842b46e9507372f0b1b963James DongpTemp           RN 12           ;// r12, used as a scratch core register for constants
8080c1bc742181ded4930842b46e9507372f0b1b963James Dong
8090c1bc742181ded4930842b46e9507372f0b1b963James Dong
8100c1bc742181ded4930842b46e9507372f0b1b963James Dong        IMPORT  armCOMM_IDCTCoef            ;// external symbol; presumably the table behind dCoefs, read by the prescale macros - confirm
8110c1bc742181ded4930842b46e9507372f0b1b963James Dong
8120c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj0,qXj1}, [pSrc @64]!        ;// load two Q registers, post-incrementing pSrc (64-bit alignment hint)
8130c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj2,qXj3}, [pSrc @64]!
8140c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj4,qXj5}, [pSrc @64]!
8150c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        {qXj6,qXj7}, [pSrc @64]!        ;// eight Q registers (128 bytes) loaded in total
8160c1bc742181ded4930842b46e9507372f0b1b963James Dong
8170c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Load PreScale and multiply with Src
8180c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4
8190c1bc742181ded4930842b46e9507372f0b1b963James Dong
8200c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s16"                         ;// 16X16 Mul
8210c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_IDCT_PRESCALE16
8220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
8230c1bc742181ded4930842b46e9507372f0b1b963James Dong
8240c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$inscale"="s32"                         ;// 32X32 Mul
8250c1bc742181ded4930842b46e9507372f0b1b963James Dong            M_IDCT_PRESCALE32
8260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
8270c1bc742181ded4930842b46e9507372f0b1b963James Dong
8280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 3
8290c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
8300c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
8310c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
8320c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
8330c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
8340c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
8350c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXh2, qXi2, qXi3                ;// h2 = i2-i3 (in place); h3 is i3's register, left untouched
8360c1bc742181ded4930842b46e9507372f0b1b963James Dong
8370c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dXi4lo, C                 ;// c*i4
8380c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
8390c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dXi4hi, C                 ;// same for the high halves
8400c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt1, dXi6hi, S
8410c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh4lo, qXt0, #16               ;// h4 = (c*i4+s*i6)>>16, narrowed
8420c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh4hi, qXt1, #16
8430c1bc742181ded4930842b46e9507372f0b1b963James Dong
8440c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dXi6lo, C                 ;// c*i6
8450c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt0, dXi4lo, S                 ;// c*i6 - s*i4
8460c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dXi6hi, C                 ;// same for the high halves
8470c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt1, dXi4hi, S
8480c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh6lo, qXt0, #16               ;// h6 = (c*i6-s*i4)>>16, narrowed
8490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dXh6hi, qXt1, #16
8500c1bc742181ded4930842b46e9507372f0b1b963James Dong
8510c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2
8520c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg6, qXh6, qXh7        ;// g6 = h6 - h7 (g7 shares h7's register)
8530c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg5, qXh5, qXg6        ;// g5 = h5 - g6
8540c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXg4, qXh4, qXg5        ;// g4 = h4 - g5
8550c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
8560c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
8570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
8580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
8590c1bc742181ded4930842b46e9507372f0b1b963James Dong
8600c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
8610c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf3, qXg3, qXg4        ;// f3 = g3 + g4 (written to the spare register qXgt)
8620c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf4, qXg3, qXg4        ;// f4 = g3 - g4 (overwrites g3)
8630c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf2, qXg2, qXg5        ;// f2 = g2 + g5 (reuses g4's register)
8640c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf5, qXg2, qXg5        ;// f5 = g2 - g5 (overwrites g2)
8650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf1, qXg1, qXg6        ;// f1 = g1 + g6 (reuses g5's register)
8660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf6, qXg1, qXg6        ;// f6 = g1 - g6 (overwrites g1)
8670c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qXf0, qXg0, qXg7        ;// f0 = g0 + g7 (reuses g6's register)
8680c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXf7, qXg0, qXg7        ;// f7 = g0 - g7 (overwrites g0)
8690c1bc742181ded4930842b46e9507372f0b1b963James Dong
8700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Transpose, store and loop
8710c1bc742181ded4930842b46e9507372f0b1b963James DongXTR0            EQU Src5        ;// XTRn: Q register number, presumably of row n (qXfn); Src*/Tmp are defined outside this section
8720c1bc742181ded4930842b46e9507372f0b1b963James DongXTR1            EQU Tmp
8730c1bc742181ded4930842b46e9507372f0b1b963James DongXTR2            EQU Src6
8740c1bc742181ded4930842b46e9507372f0b1b963James DongXTR3            EQU Src7
8750c1bc742181ded4930842b46e9507372f0b1b963James DongXTR4            EQU Src3
8760c1bc742181ded4930842b46e9507372f0b1b963James DongXTR5            EQU Src0
8770c1bc742181ded4930842b46e9507372f0b1b963James DongXTR6            EQU Src1
8780c1bc742181ded4930842b46e9507372f0b1b963James DongXTR7            EQU Src2
8790c1bc742181ded4930842b46e9507372f0b1b963James DongXTRt            EQU Src4
8800c1bc742181ded4930842b46e9507372f0b1b963James Dong
8810c1bc742181ded4930842b46e9507372f0b1b963James DongqA0             QN  XTR0.S32  ;// for transpose: Q registers XTR0..XTR7 viewed as 32-bit lanes
8820c1bc742181ded4930842b46e9507372f0b1b963James DongqA1             QN  XTR1.S32
8830c1bc742181ded4930842b46e9507372f0b1b963James DongqA2             QN  XTR2.S32
8840c1bc742181ded4930842b46e9507372f0b1b963James DongqA3             QN  XTR3.S32
8850c1bc742181ded4930842b46e9507372f0b1b963James DongqA4             QN  XTR4.S32
8860c1bc742181ded4930842b46e9507372f0b1b963James DongqA5             QN  XTR5.S32
8870c1bc742181ded4930842b46e9507372f0b1b963James DongqA6             QN  XTR6.S32
8880c1bc742181ded4930842b46e9507372f0b1b963James DongqA7             QN  XTR7.S32
8890c1bc742181ded4930842b46e9507372f0b1b963James Dong
8900c1bc742181ded4930842b46e9507372f0b1b963James DongdB0             DN  XTR0*2+1      ;// for using VSWP: high D halves of Q registers XTR0..XTR3
8910c1bc742181ded4930842b46e9507372f0b1b963James DongdB1             DN  XTR1*2+1
8920c1bc742181ded4930842b46e9507372f0b1b963James DongdB2             DN  XTR2*2+1
8930c1bc742181ded4930842b46e9507372f0b1b963James DongdB3             DN  XTR3*2+1
8940c1bc742181ded4930842b46e9507372f0b1b963James DongdB4             DN  XTR4*2        ;// low D halves of Q registers XTR4..XTR7
8950c1bc742181ded4930842b46e9507372f0b1b963James DongdB5             DN  XTR5*2
8960c1bc742181ded4930842b46e9507372f0b1b963James DongdB6             DN  XTR6*2
8970c1bc742181ded4930842b46e9507372f0b1b963James DongdB7             DN  XTR7*2
8980c1bc742181ded4930842b46e9507372f0b1b963James Dong
8990c1bc742181ded4930842b46e9507372f0b1b963James Dong
9000c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf0, qXf1              ;// step 1: VTRN on adjacent row pairs (element size from the qXf type, declared outside this section)
9010c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf2, qXf3
9020c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf4, qXf5
9030c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qXf6, qXf7
9040c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA0, qA2                ;// step 2: 32-bit VTRN between registers two apart
9050c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA1, qA3
9060c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA4, qA6
9070c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qA5, qA7
9080c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB0, dB4                ;// step 3: swap 64-bit halves between registers four apart
9090c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB1, dB5
9100c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB2, dB6
9110c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dB3, dB7
9120c1bc742181ded4930842b46e9507372f0b1b963James Dong
9130c1bc742181ded4930842b46e9507372f0b1b963James Dong
9140c1bc742181ded4930842b46e9507372f0b1b963James DongqYj0            QN qXf0         ;// second-pass inputs j0..j7 reuse the first-pass output registers
9150c1bc742181ded4930842b46e9507372f0b1b963James DongqYj1            QN qXf1
9160c1bc742181ded4930842b46e9507372f0b1b963James DongqYj2            QN qXf2
9170c1bc742181ded4930842b46e9507372f0b1b963James DongqYj3            QN qXf3
9180c1bc742181ded4930842b46e9507372f0b1b963James DongqYj4            QN qXf4
9190c1bc742181ded4930842b46e9507372f0b1b963James DongqYj5            QN qXf5
9200c1bc742181ded4930842b46e9507372f0b1b963James DongqYj6            QN qXf6
9210c1bc742181ded4930842b46e9507372f0b1b963James DongqYj7            QN qXf7
9220c1bc742181ded4930842b46e9507372f0b1b963James DongqYjt            QN qXft
9230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// D halves by register number: XTRn*2 is the low half, XTRn*2+1 the high half of Q register XTRn
9240c1bc742181ded4930842b46e9507372f0b1b963James DongdYj0lo          DN (XTR0*2).S16
9250c1bc742181ded4930842b46e9507372f0b1b963James DongdYj0hi          DN (XTR0*2+1).S16
9260c1bc742181ded4930842b46e9507372f0b1b963James DongdYj1lo          DN (XTR1*2).S16
9270c1bc742181ded4930842b46e9507372f0b1b963James DongdYj1hi          DN (XTR1*2+1).S16
9280c1bc742181ded4930842b46e9507372f0b1b963James DongdYj2lo          DN (XTR2*2).S16
9290c1bc742181ded4930842b46e9507372f0b1b963James DongdYj2hi          DN (XTR2*2+1).S16
9300c1bc742181ded4930842b46e9507372f0b1b963James DongdYj3lo          DN (XTR3*2).S16
9310c1bc742181ded4930842b46e9507372f0b1b963James DongdYj3hi          DN (XTR3*2+1).S16
9320c1bc742181ded4930842b46e9507372f0b1b963James DongdYj4lo          DN (XTR4*2).S16
9330c1bc742181ded4930842b46e9507372f0b1b963James DongdYj4hi          DN (XTR4*2+1).S16
9340c1bc742181ded4930842b46e9507372f0b1b963James DongdYj5lo          DN (XTR5*2).S16
9350c1bc742181ded4930842b46e9507372f0b1b963James DongdYj5hi          DN (XTR5*2+1).S16
9360c1bc742181ded4930842b46e9507372f0b1b963James DongdYj6lo          DN (XTR6*2).S16
9370c1bc742181ded4930842b46e9507372f0b1b963James DongdYj6hi          DN (XTR6*2+1).S16
9380c1bc742181ded4930842b46e9507372f0b1b963James DongdYj7lo          DN (XTR7*2).S16
9390c1bc742181ded4930842b46e9507372f0b1b963James DongdYj7hi          DN (XTR7*2+1).S16
9400c1bc742181ded4930842b46e9507372f0b1b963James DongdYjtlo          DN (XTRt*2).S16
9410c1bc742181ded4930842b46e9507372f0b1b963James DongdYjthi          DN (XTRt*2+1).S16
9420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// stage i names, aliased onto stage j registers
9430c1bc742181ded4930842b46e9507372f0b1b963James DongqYi0            QN qYj0
9440c1bc742181ded4930842b46e9507372f0b1b963James DongqYi1            QN qYj4
9450c1bc742181ded4930842b46e9507372f0b1b963James DongqYi2            QN qYj2
9460c1bc742181ded4930842b46e9507372f0b1b963James DongqYi3            QN qYj7
9470c1bc742181ded4930842b46e9507372f0b1b963James DongqYi4            QN qYj5
9480c1bc742181ded4930842b46e9507372f0b1b963James DongqYi5            QN qYjt         ;// i5 lands in stage j's spare ("t") register
9490c1bc742181ded4930842b46e9507372f0b1b963James DongqYi6            QN qYj1
9500c1bc742181ded4930842b46e9507372f0b1b963James DongqYi7            QN qYj6
9510c1bc742181ded4930842b46e9507372f0b1b963James DongqYit            QN qYj3
9520c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// D-register (low/high half) names for the stage i registers
9530c1bc742181ded4930842b46e9507372f0b1b963James DongdYi0lo          DN dYj0lo
9540c1bc742181ded4930842b46e9507372f0b1b963James DongdYi0hi          DN dYj0hi
9550c1bc742181ded4930842b46e9507372f0b1b963James DongdYi1lo          DN dYj4lo
9560c1bc742181ded4930842b46e9507372f0b1b963James DongdYi1hi          DN dYj4hi
9570c1bc742181ded4930842b46e9507372f0b1b963James DongdYi2lo          DN dYj2lo
9580c1bc742181ded4930842b46e9507372f0b1b963James DongdYi2hi          DN dYj2hi
9590c1bc742181ded4930842b46e9507372f0b1b963James DongdYi3lo          DN dYj7lo
9600c1bc742181ded4930842b46e9507372f0b1b963James DongdYi3hi          DN dYj7hi
9610c1bc742181ded4930842b46e9507372f0b1b963James DongdYi4lo          DN dYj5lo
9620c1bc742181ded4930842b46e9507372f0b1b963James DongdYi4hi          DN dYj5hi
9630c1bc742181ded4930842b46e9507372f0b1b963James DongdYi5lo          DN dYjtlo
9640c1bc742181ded4930842b46e9507372f0b1b963James DongdYi5hi          DN dYjthi
9650c1bc742181ded4930842b46e9507372f0b1b963James DongdYi6lo          DN dYj1lo
9660c1bc742181ded4930842b46e9507372f0b1b963James DongdYi6hi          DN dYj1hi
9670c1bc742181ded4930842b46e9507372f0b1b963James DongdYi7lo          DN dYj6lo
9680c1bc742181ded4930842b46e9507372f0b1b963James DongdYi7hi          DN dYj6hi
9690c1bc742181ded4930842b46e9507372f0b1b963James DongdYitlo          DN dYj3lo
9700c1bc742181ded4930842b46e9507372f0b1b963James DongdYithi          DN dYj3hi
9710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// stage h names, aliased onto stage i registers
9720c1bc742181ded4930842b46e9507372f0b1b963James DongqYh0            QN qYit
9730c1bc742181ded4930842b46e9507372f0b1b963James DongqYh1            QN qYi0
9740c1bc742181ded4930842b46e9507372f0b1b963James DongqYh2            QN qYi2
9750c1bc742181ded4930842b46e9507372f0b1b963James DongqYh3            QN qYi3         ;// h3 shares i3's register
9760c1bc742181ded4930842b46e9507372f0b1b963James DongqYh4            QN qYi7
9770c1bc742181ded4930842b46e9507372f0b1b963James DongqYh5            QN qYi5
9780c1bc742181ded4930842b46e9507372f0b1b963James DongqYh6            QN qYi4
9790c1bc742181ded4930842b46e9507372f0b1b963James DongqYh7            QN qYi1
9800c1bc742181ded4930842b46e9507372f0b1b963James DongqYht            QN qYi6
9810c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// D-register (low/high half) names for the stage h registers
9820c1bc742181ded4930842b46e9507372f0b1b963James DongdYh0lo          DN dYitlo
9830c1bc742181ded4930842b46e9507372f0b1b963James DongdYh0hi          DN dYithi
9840c1bc742181ded4930842b46e9507372f0b1b963James DongdYh1lo          DN dYi0lo
9850c1bc742181ded4930842b46e9507372f0b1b963James DongdYh1hi          DN dYi0hi
9860c1bc742181ded4930842b46e9507372f0b1b963James DongdYh2lo          DN dYi2lo
9870c1bc742181ded4930842b46e9507372f0b1b963James DongdYh2hi          DN dYi2hi
9880c1bc742181ded4930842b46e9507372f0b1b963James DongdYh3lo          DN dYi3lo
9890c1bc742181ded4930842b46e9507372f0b1b963James DongdYh3hi          DN dYi3hi
9900c1bc742181ded4930842b46e9507372f0b1b963James DongdYh4lo          DN dYi7lo
9910c1bc742181ded4930842b46e9507372f0b1b963James DongdYh4hi          DN dYi7hi
9920c1bc742181ded4930842b46e9507372f0b1b963James DongdYh5lo          DN dYi5lo
9930c1bc742181ded4930842b46e9507372f0b1b963James DongdYh5hi          DN dYi5hi
9940c1bc742181ded4930842b46e9507372f0b1b963James DongdYh6lo          DN dYi4lo
9950c1bc742181ded4930842b46e9507372f0b1b963James DongdYh6hi          DN dYi4hi
9960c1bc742181ded4930842b46e9507372f0b1b963James DongdYh7lo          DN dYi1lo
9970c1bc742181ded4930842b46e9507372f0b1b963James DongdYh7hi          DN dYi1hi
9980c1bc742181ded4930842b46e9507372f0b1b963James DongdYhtlo          DN dYi6lo
9990c1bc742181ded4930842b46e9507372f0b1b963James DongdYhthi          DN dYi6hi
10000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// stage g names, aliased onto stage h registers
10010c1bc742181ded4930842b46e9507372f0b1b963James DongqYg0            QN qYh2
10020c1bc742181ded4930842b46e9507372f0b1b963James DongqYg1            QN qYht
10030c1bc742181ded4930842b46e9507372f0b1b963James DongqYg2            QN qYh1
10040c1bc742181ded4930842b46e9507372f0b1b963James DongqYg3            QN qYh0
10050c1bc742181ded4930842b46e9507372f0b1b963James DongqYg4            QN qYh4
10060c1bc742181ded4930842b46e9507372f0b1b963James DongqYg5            QN qYh5
10070c1bc742181ded4930842b46e9507372f0b1b963James DongqYg6            QN qYh6
10080c1bc742181ded4930842b46e9507372f0b1b963James DongqYg7            QN qYh7         ;// g7 shares h7's register
10090c1bc742181ded4930842b46e9507372f0b1b963James DongqYgt            QN qYh3
10100c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// stage f names, aliased onto stage g registers
10110c1bc742181ded4930842b46e9507372f0b1b963James DongqYf0            QN qYg6
10120c1bc742181ded4930842b46e9507372f0b1b963James DongqYf1            QN qYg5
10130c1bc742181ded4930842b46e9507372f0b1b963James DongqYf2            QN qYg4
10140c1bc742181ded4930842b46e9507372f0b1b963James DongqYf3            QN qYgt
10150c1bc742181ded4930842b46e9507372f0b1b963James DongqYf4            QN qYg3
10160c1bc742181ded4930842b46e9507372f0b1b963James DongqYf5            QN qYg2
10170c1bc742181ded4930842b46e9507372f0b1b963James DongqYf6            QN qYg1
10180c1bc742181ded4930842b46e9507372f0b1b963James DongqYf7            QN qYg0
10190c1bc742181ded4930842b46e9507372f0b1b963James DongqYft            QN qYg7
10200c1bc742181ded4930842b46e9507372f0b1b963James Dong
10210c1bc742181ded4930842b46e9507372f0b1b963James Dong        VRSHR       qYj7, qYj7, #2          ;// j7 >>= 2 with rounding
10220c1bc742181ded4930842b46e9507372f0b1b963James Dong        VRSHR       qYj6, qYj6, #1          ;// j6 >>= 1 with rounding
10230c1bc742181ded4930842b46e9507372f0b1b963James Dong
10240c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
10250c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
10260c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
10270c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
10280c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
10290c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
10300c1bc742181ded4930842b46e9507372f0b1b963James Dong
10310c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
10320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 4,3 rows 0to1 x 1/2
10330c1bc742181ded4930842b46e9507372f0b1b963James Dong
10340c1bc742181ded4930842b46e9507372f0b1b963James Dong        MOV         pTemp, #0x4             ;// ensure correct round
10350c1bc742181ded4930842b46e9507372f0b1b963James Dong        VDUP        qScale1, pTemp           ;// of DC result
10360c1bc742181ded4930842b46e9507372f0b1b963James Dong        VADD        qYi0, qYi0, qScale1     ;// add the bias 4 to every lane of i0
10370c1bc742181ded4930842b46e9507372f0b1b963James Dong
10380c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
10390c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
10400c1bc742181ded4930842b46e9507372f0b1b963James Dong
10410c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
10420c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
10430c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYh2, qYi2, qYi3        ;// h2 = i2-i3 (in place); h3 is i3's register, left untouched
10440c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
10450c1bc742181ded4930842b46e9507372f0b1b963James Dong
10460c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dYi4lo, C         ;// c*i4
10470c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
10480c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dYi4hi, C         ;// same for the high halves
10490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLAL       qXt1, dYi6hi, S
10500c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh4lo, qXt0, #16       ;// h4
10510c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh4hi, qXt1, #16
10520c1bc742181ded4930842b46e9507372f0b1b963James Dong
10530c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt0, dYi6lo, C         ;// c*i6
10540c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt0, dYi4lo, S         ;// c*i6 - s*i4
10550c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qXt1, dYi6hi, C         ;// same for the high halves
10560c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMLSL       qXt1, dYi4hi, S
10570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh6lo, qXt0, #16       ;// h6
10580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSHRN       dYh6hi, qXt1, #16
10590c1bc742181ded4930842b46e9507372f0b1b963James Dong
10600c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg6, qYh6, qYh7        ;// g6 = h6 - h7 (g7 shares h7's register)
10610c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg5, qYh5, qYg6        ;// g5 = h5 - g6
10620c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qYg4, qYh4, qYg5        ;// g4 = h4 - g5
10630c1bc742181ded4930842b46e9507372f0b1b963James Dong
10640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 2 rows 0to3 x 1/2
10650c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
10660c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
10670c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
10680c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
10690c1bc742181ded4930842b46e9507372f0b1b963James Dong
10700c1bc742181ded4930842b46e9507372f0b1b963James Dong
10710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// IStage 1 all rows
10720c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf3, qYg3, qYg4       ;// halving add/sub in this pass (the first pass uses plain VADD/VSUB)
10730c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf4, qYg3, qYg4
10740c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf2, qYg2, qYg5
10750c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf5, qYg2, qYg5
10760c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf1, qYg1, qYg6
10770c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf6, qYg1, qYg6
10780c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD        qYf0, qYg0, qYg7
10790c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHSUB        qYf7, qYg0, qYg7
10800c1bc742181ded4930842b46e9507372f0b1b963James Dong
10810c1bc742181ded4930842b46e9507372f0b1b963James DongYTR0            EQU Src0        ;// YTRn: Q register number, presumably of output row n (qYfn); Src*/Tmp are defined outside this section
10820c1bc742181ded4930842b46e9507372f0b1b963James DongYTR1            EQU Src4
10830c1bc742181ded4930842b46e9507372f0b1b963James DongYTR2            EQU Src1
10840c1bc742181ded4930842b46e9507372f0b1b963James DongYTR3            EQU Src2
10850c1bc742181ded4930842b46e9507372f0b1b963James DongYTR4            EQU Src7
10860c1bc742181ded4930842b46e9507372f0b1b963James DongYTR5            EQU Src5
10870c1bc742181ded4930842b46e9507372f0b1b963James DongYTR6            EQU Tmp
10880c1bc742181ded4930842b46e9507372f0b1b963James DongYTR7            EQU Src6
10890c1bc742181ded4930842b46e9507372f0b1b963James DongYTRt            EQU Src3
10900c1bc742181ded4930842b46e9507372f0b1b963James Dong
10910c1bc742181ded4930842b46e9507372f0b1b963James DongqC0             QN  YTR0.S32                ;// for transpose: Q registers YTR0..YTR7 viewed as 32-bit lanes
10920c1bc742181ded4930842b46e9507372f0b1b963James DongqC1             QN  YTR1.S32
10930c1bc742181ded4930842b46e9507372f0b1b963James DongqC2             QN  YTR2.S32
10940c1bc742181ded4930842b46e9507372f0b1b963James DongqC3             QN  YTR3.S32
10950c1bc742181ded4930842b46e9507372f0b1b963James DongqC4             QN  YTR4.S32
10960c1bc742181ded4930842b46e9507372f0b1b963James DongqC5             QN  YTR5.S32
10970c1bc742181ded4930842b46e9507372f0b1b963James DongqC6             QN  YTR6.S32
10980c1bc742181ded4930842b46e9507372f0b1b963James DongqC7             QN  YTR7.S32
10990c1bc742181ded4930842b46e9507372f0b1b963James Dong
11000c1bc742181ded4930842b46e9507372f0b1b963James DongdD0             DN  YTR0*2+1                ;// for using VSWP: high D halves of Q registers YTR0..YTR3
11010c1bc742181ded4930842b46e9507372f0b1b963James DongdD1             DN  YTR1*2+1
11020c1bc742181ded4930842b46e9507372f0b1b963James DongdD2             DN  YTR2*2+1
11030c1bc742181ded4930842b46e9507372f0b1b963James DongdD3             DN  YTR3*2+1
11040c1bc742181ded4930842b46e9507372f0b1b963James DongdD4             DN  YTR4*2                  ;// low D halves of Q registers YTR4..YTR7
11050c1bc742181ded4930842b46e9507372f0b1b963James DongdD5             DN  YTR5*2
11060c1bc742181ded4930842b46e9507372f0b1b963James DongdD6             DN  YTR6*2
11070c1bc742181ded4930842b46e9507372f0b1b963James DongdD7             DN  YTR7*2
11080c1bc742181ded4930842b46e9507372f0b1b963James Dong
11090c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf0, qYf1              ;// step 1: VTRN on adjacent row pairs
11100c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf2, qYf3
11110c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf4, qYf5
11120c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qYf6, qYf7
11130c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC0, qC2                ;// step 2: 32-bit VTRN between registers two apart
11140c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC1, qC3
11150c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC4, qC6
11160c1bc742181ded4930842b46e9507372f0b1b963James Dong        VTRN        qC5, qC7
11170c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD0, dD4                ;// step 3: swap 64-bit halves between registers four apart
11180c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD1, dD5
11190c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD2, dD6
11200c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSWP        dD3, dD7
11210c1bc742181ded4930842b46e9507372f0b1b963James Dong
11220c1bc742181ded4930842b46e9507372f0b1b963James Dong
11230c1bc742181ded4930842b46e9507372f0b1b963James DongdYf0U8          DN YTR0*2.U8    ;// low D half of Q register YTRn, typed as unsigned bytes
11240c1bc742181ded4930842b46e9507372f0b1b963James DongdYf1U8          DN YTR1*2.U8
11250c1bc742181ded4930842b46e9507372f0b1b963James DongdYf2U8          DN YTR2*2.U8
11260c1bc742181ded4930842b46e9507372f0b1b963James DongdYf3U8          DN YTR3*2.U8
11270c1bc742181ded4930842b46e9507372f0b1b963James DongdYf4U8          DN YTR4*2.U8
11280c1bc742181ded4930842b46e9507372f0b1b963James DongdYf5U8          DN YTR5*2.U8
11290c1bc742181ded4930842b46e9507372f0b1b963James DongdYf6U8          DN YTR6*2.U8
11300c1bc742181ded4930842b46e9507372f0b1b963James DongdYf7U8          DN YTR7*2.U8
11310c1bc742181ded4930842b46e9507372f0b1b963James Dong
11320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
11330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Do saturation if outsize is other than S16
11340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
11350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// NOTE(review): VQMOVN keeps signedness; signed-to-unsigned saturation is VQMOVUN - confirm what the typed aliases assemble to
11360c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="u8")
11370c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Output range [0-255]
11380c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf0U8, qYf0          ;// saturating narrow of each row into its 8-bit D view
11390c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf1U8, qYf1
11400c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf2U8, qYf2
11410c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf3U8, qYf3
11420c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf4U8, qYf4
11430c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf5U8, qYf5
11440c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf6U8, qYf6
11450c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQMOVN            dYf7U8, qYf7
11460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
11470c1bc742181ded4930842b46e9507372f0b1b963James Dong
11480c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF ("$outsize"="s9")
11490c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Output range [-256 to +255]
11500c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf0, qYf0, #16-9
11510c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf1, qYf1, #16-9
11520c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf2, qYf2, #16-9
11530c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf3, qYf3, #16-9
11540c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf4, qYf4, #16-9
11550c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf5, qYf5, #16-9
11560c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf6, qYf6, #16-9
11570c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQSHL            qYf7, qYf7, #16-9
11580c1bc742181ded4930842b46e9507372f0b1b963James Dong
11590c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf0, qYf0, #16-9
11600c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf1, qYf1, #16-9
11610c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf2, qYf2, #16-9
11620c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf3, qYf3, #16-9
11630c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf4, qYf4, #16-9
11640c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf5, qYf5, #16-9
11650c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf6, qYf6, #16-9
11660c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHR             qYf7, qYf7, #16-9
11670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
11680c1bc742181ded4930842b46e9507372f0b1b963James Dong
11690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Store output depending on the Stride size
11700c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$stride"="s"
11710c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf0, [pDest @64], Stride
11720c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf1, [pDest @64], Stride
11730c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf2, [pDest @64], Stride
11740c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf3, [pDest @64], Stride
11750c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf4, [pDest @64], Stride
11760c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf5, [pDest @64], Stride
11770c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf6, [pDest @64], Stride
11780c1bc742181ded4930842b46e9507372f0b1b963James Dong            VST1        qYf7, [pDest @64]
11790c1bc742181ded4930842b46e9507372f0b1b963James Dong        ELSE
11800c1bc742181ded4930842b46e9507372f0b1b963James Dong            IF ("$outsize"="u8")
11810c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf0U8, [pDest @64], #8
11820c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf1U8, [pDest @64], #8
11830c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf2U8, [pDest @64], #8
11840c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf3U8, [pDest @64], #8
11850c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf4U8, [pDest @64], #8
11860c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf5U8, [pDest @64], #8
11870c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf6U8, [pDest @64], #8
11880c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        dYf7U8, [pDest @64]
11890c1bc742181ded4930842b46e9507372f0b1b963James Dong            ELSE
11900c1bc742181ded4930842b46e9507372f0b1b963James Dong                ;// ("$outsize"="s9") or ("$outsize"="s16")
11910c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf0, [pDest @64], #16
11920c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf1, [pDest @64], #16
11930c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf2, [pDest @64], #16
11940c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf3, [pDest @64], #16
11950c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf4, [pDest @64], #16
11960c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf5, [pDest @64], #16
11970c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf6, [pDest @64], #16
11980c1bc742181ded4930842b46e9507372f0b1b963James Dong                VST1        qYf7, [pDest @64]
11990c1bc742181ded4930842b46e9507372f0b1b963James Dong            ENDIF
12000c1bc742181ded4930842b46e9507372f0b1b963James Dong
12010c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
12020c1bc742181ded4930842b46e9507372f0b1b963James Dong
12030c1bc742181ded4930842b46e9507372f0b1b963James Dong
12040c1bc742181ded4930842b46e9507372f0b1b963James Dong
12050c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF ;// CortexA8
12060c1bc742181ded4930842b46e9507372f0b1b963James Dong
12070c1bc742181ded4930842b46e9507372f0b1b963James Dong
12080c1bc742181ded4930842b46e9507372f0b1b963James Dong
12090c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12100c1bc742181ded4930842b46e9507372f0b1b963James Dong
12110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale TWO input rows with TWO rows of 16-bit scale values
12120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
12140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// input (eight values per row) with two rows of scale values. It also
12150c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// loads the next two rows of scale values from pScale when $LastRow is "0".
12160c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12170c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
12180c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12190c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAlo           - Input D register with first four S16 values of row n
12200c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAhi           - Input D register with next four S16 values of row n
12210c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBlo           - Input D register with first four S16 values of row n+1
12220c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBhi           - Input D register with next four S16 values of row n+1
12230c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to next row of scale values
12240c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT0lo           - Temporary scratch register (holds $dAlo * dScale1lo)
12250c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT0hi           - Temporary scratch register (holds $dAhi * dScale1hi)
12260c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT1lo           - Temporary scratch register (holds $dBlo * dScale2lo)
12270c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qT1hi           - Temporary scratch register (holds $dBhi * dScale2hi)
12280c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale1lo       - Scale values of row n, applied to $dAlo
12290c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale1hi       - Scale values of row n, applied to $dAhi
12300c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale2lo       - Scale values of row n+1, applied to $dBlo
12310c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dScale2hi       - Scale values of row n+1, applied to $dBhi
12320c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12330c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Flag
12340c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12350c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $LastRow        - "0": more rows follow, so load the next two scale rows; any other value: skip the load
12360c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12370c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
12380c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12390c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAlo           - Scaled output values (first four S16 of row n)
12400c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dAhi           - Scaled output values (next four S16 of row n)
12410c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
12420c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
12430c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qScale1         - Scale values for row n+2 (reloaded only when $LastRow is "0")
12440c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qScale2         - Scale values for row n+3 (reloaded only when $LastRow is "0")
12450c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Advanced by 32 bytes (two rows) when $LastRow is "0", otherwise unchanged
12460c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12470c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
12480c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
12490c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT0lo, $dAlo, dScale1lo     ;// Widening multiply: row n, first four values
12500c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT0hi, $dAhi, dScale1hi     ;// Widening multiply: row n, next four values
12510c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT1lo, $dBlo, dScale2lo     ;// Widening multiply: row n+1, first four values
12520c1bc742181ded4930842b46e9507372f0b1b963James Dong        VMULL       qT1hi, $dBhi, dScale2hi     ;// Widening multiply: row n+1, next four values
12530c1bc742181ded4930842b46e9507372f0b1b963James Dong        IF "$LastRow"="0"
12540c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2 (presumably qScale1 aliases dScale1lo/hi, already consumed above - TODO confirm)
12550c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
12560c1bc742181ded4930842b46e9507372f0b1b963James Dong        ENDIF
12570c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dAlo, qT0lo, #12         ;// Rounding shift right by 12, saturate and narrow back into row n
12580c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dAhi, qT0hi, #12
12590c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dBlo, qT1lo, #12         ;// Same for row n+1
12600c1bc742181ded4930842b46e9507372f0b1b963James Dong        VQRSHRN       $dBhi, qT1hi, #12
12610c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12620c1bc742181ded4930842b46e9507372f0b1b963James Dong
12630c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale 8x8 block input values with 16 bit scale values
12640c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12650c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used to pre-scale a block of 8x8 input values.
12660c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// It also performs the first-stage transformations of the IDCT.
12670c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12680c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
12690c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12700c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnlo          - n th input D register with first four S16 values
12710c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnhi          - n th input D register with next four S16 values
12720c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXjn            - n th input Q register with eight S16 values
12730c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to scale values (advanced by 128 bytes, i.e. eight 16-byte rows, on exit)
12740c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12750c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
12760c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12770c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXin            - n th output Q register with eight S16 output values of 1st stage (qXi2-qXi7 are written explicitly here)
12780c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
12790c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
12800c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_PRESCALE16
12810c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
12820c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
12830c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
12840c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
12850c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
12860c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7; last pair, so no further scale load
12870c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
12880c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
12890c1bc742181ded4930842b46e9507372f0b1b963James Dong        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (overwrites pSrc)
12900c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
12910c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
12920c1bc742181ded4930842b46e9507372f0b1b963James Dong        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
12930c1bc742181ded4930842b46e9507372f0b1b963James Dong        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
12940c1bc742181ded4930842b46e9507372f0b1b963James Dong        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
12950c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
12960c1bc742181ded4930842b46e9507372f0b1b963James Dong
12970c1bc742181ded4930842b46e9507372f0b1b963James Dong
12980c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Scale 8x8 block input values with 32 bit scale values
12990c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13000c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// This macro is used to pre-scale a block of 8x8 input values.
13010c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// It also performs the first-stage transformations of the IDCT.
13020c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13030c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Input Registers:
13040c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13050c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnlo          - n th input D register with first four S16 values
13060c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXjnhi          - n th input D register with next four S16 values
13070c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// qXjn            - n th input Q register with eight S16 values
13080c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// pScale          - Pointer to 32bit scale values in Q23 format (left pointing at scale row 4 on exit)
13090c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13100c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// Output Registers:
13110c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13120c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
13130c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
13140c1bc742181ded4930842b46e9507372f0b1b963James Dong        ;//
13150c1bc742181ded4930842b46e9507372f0b1b963James Dong        MACRO
13160c1bc742181ded4930842b46e9507372f0b1b963James Dong        M_IDCT_PRESCALE32
13170c1bc742181ded4930842b46e9507372f0b1b963James DongqScale0lo       QN 0.S32        ;// Scale registers: only two Q-register pairs (Q0/Q1 and Q2/Q3) are used for all eight rows
13180c1bc742181ded4930842b46e9507372f0b1b963James DongqScale0hi       QN 1.S32
13190c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1lo       QN 2.S32
13200c1bc742181ded4930842b46e9507372f0b1b963James DongqScale1hi       QN 3.S32
13210c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2lo       QN qScale1lo    ;// Rows 2, 3 and 4 reuse the row 1 scale registers
13220c1bc742181ded4930842b46e9507372f0b1b963James DongqScale2hi       QN qScale1hi
13230c1bc742181ded4930842b46e9507372f0b1b963James DongqScale3lo       QN qScale1lo
13240c1bc742181ded4930842b46e9507372f0b1b963James DongqScale3hi       QN qScale1hi
13250c1bc742181ded4930842b46e9507372f0b1b963James DongqScale4lo       QN qScale1lo
13260c1bc742181ded4930842b46e9507372f0b1b963James DongqScale4hi       QN qScale1hi
13270c1bc742181ded4930842b46e9507372f0b1b963James DongqScale5lo       QN qScale0lo    ;// Rows 5, 6 and 7 reuse the row 0 scale registers
13280c1bc742181ded4930842b46e9507372f0b1b963James DongqScale5hi       QN qScale0hi
13290c1bc742181ded4930842b46e9507372f0b1b963James DongqScale6lo       QN qScale0lo
13300c1bc742181ded4930842b46e9507372f0b1b963James DongqScale6hi       QN qScale0hi
13310c1bc742181ded4930842b46e9507372f0b1b963James DongqScale7lo       QN qScale0lo
13320c1bc742181ded4930842b46e9507372f0b1b963James DongqScale7hi       QN qScale0hi
13330c1bc742181ded4930842b46e9507372f0b1b963James Dong
13340c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc0lo         QN 4.S32        ;// Widened (32-bit) source rows: two register pairs shared by all rows
13350c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc0hi         QN 5.S32
13360c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc1lo         QN 6.S32
13370c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc1hi         QN Src4.S32     ;// Register number Src4 is defined outside this macro; presumably it backs qXj4, which is why row 4 is reloaded from memory below - TODO confirm
13380c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc2lo         QN qSrc0lo      ;// Rows 2, 3, 4 and 7 reuse the row 0 source registers
13390c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc2hi         QN qSrc0hi
13400c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc3lo         QN qSrc0lo
13410c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc3hi         QN qSrc0hi
13420c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc4lo         QN qSrc0lo
13430c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc4hi         QN qSrc0hi
13440c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc5lo         QN qSrc1lo      ;// Rows 5 and 6 reuse the row 1 source registers
13450c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc5hi         QN qSrc1hi
13460c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc6lo         QN qSrc1lo
13470c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc6hi         QN qSrc1hi
13480c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc7lo         QN qSrc0lo
13490c1bc742181ded4930842b46e9507372f0b1b963James DongqSrc7hi         QN qSrc0hi
13500c1bc742181ded4930842b46e9507372f0b1b963James Dong
13510c1bc742181ded4930842b46e9507372f0b1b963James DongqRes17lo        QN qScale0lo    ;// Butterfly results reuse the row 0 scale registers (scale already consumed when they are written)
13520c1bc742181ded4930842b46e9507372f0b1b963James DongqRes17hi        QN qScale0hi
13530c1bc742181ded4930842b46e9507372f0b1b963James DongqRes26lo        QN qScale0lo
13540c1bc742181ded4930842b46e9507372f0b1b963James DongqRes26hi        QN qScale0hi
13550c1bc742181ded4930842b46e9507372f0b1b963James DongqRes53lo        QN qScale0lo
13560c1bc742181ded4930842b46e9507372f0b1b963James DongqRes53hi        QN qScale0hi
13570c1bc742181ded4930842b46e9507372f0b1b963James Dong
13580c1bc742181ded4930842b46e9507372f0b1b963James Dong            ADD         pTemp, pScale, #4*8*7           ;// Address of  pScale[7] (7 rows * 8 values * 4 bytes)
13590c1bc742181ded4930842b46e9507372f0b1b963James Dong
13600c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 0
13610c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale0lo, qScale0hi}, [pScale]!       ;// Load scale row 0, advance pScale to row 1
13620c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc0lo, dXj0lo, #(12-1)                ;// Widen row 0 to 32 bits, shifted left by 11
13630c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc0hi, dXj0hi, #(12-1)
13640c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale1lo, qScale1hi}, [pScale]!       ;// Load scale row 1, advance pScale to row 2
13650c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo             ;// Row 0 * scale 0 (saturating rounding doubling multiply, high half)
13660c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
13670c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale7lo, qScale7hi}, [pTemp]!        ;// Load scale row 7 over the consumed row 0 scale; pTemp advances 32 bytes
13680c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc1lo, dXj1lo, #(12-1)                ;// Widen row 1
13690c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc1hi, dXj1hi, #(12-1)                ;// Writes register Src4 (see qSrc1hi alias)
13700c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
13710c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi0hi, qSrc0hi
13720c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc7lo, dXj7lo, #(12-1)                ;// Widen row 7 into the row 0 source registers (row 0 already narrowed out)
13730c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc7hi, dXj7hi, #(12-1)
13740c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pTemp, pTemp, #((16*2)+(4*8*1))         ;// Undo the 32-byte writeback and step back one row: pTemp = pScale[6]
13750c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo             ;// Row 1 * scale 1
13760c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
13770c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo             ;// Row 7 * scale 7
13780c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
13790c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale2lo, qScale2hi}, [pScale]!       ;// Load scale row 2 over the consumed row 1 scale
13800c1bc742181ded4930842b46e9507372f0b1b963James Dong
13810c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 1 & 7
13820c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
13830c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
13840c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi5lo, qRes17lo                ;// Output i5
13850c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi5hi, qRes17hi
13860c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
13870c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
13880c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi6lo, qRes17lo                ;// Output i6
13890c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi6hi, qRes17hi
13900c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc2lo, dXj2lo, #(12-1)                ;// Widen row 2
13910c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc2hi, dXj2hi, #(12-1)
13920c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale6lo, qScale6hi}, [pTemp]!        ;// Load scale row 6; pTemp advances 32 bytes
13930c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc6lo, dXj6lo, #(12-1)                ;// Widen row 6
13940c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc6hi, dXj6hi, #(12-1)
13950c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pTemp, pTemp, #((16*2)+(4*8*1))         ;// Undo the writeback and step back one row: pTemp = pScale[5]
13960c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo             ;// Row 2 * scale 2
13970c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
13980c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo             ;// Row 6 * scale 6
13990c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
14000c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale3lo, qScale3hi}, [pScale]!       ;// Load scale row 3; pScale now points at row 4
14010c1bc742181ded4930842b46e9507372f0b1b963James Dong
14020c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 2 & 6
14030c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
14040c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
14050c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi3lo, qRes26lo                ;// Output i3
14060c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi3hi, qRes26hi
14070c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
14080c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
14090c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi2lo, qRes26lo                ;// Output i2
14100c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi2hi, qRes26hi
14110c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc3lo, dXj3lo, #(12-1)                ;// Widen row 3
14120c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc3hi, dXj3hi, #(12-1)
14130c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale5lo, qScale5hi}, [pTemp]!        ;// Load scale row 5 (pTemp is not used again in this macro)
14140c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc5lo, dXj5lo, #(12-1)                ;// Widen row 5
14150c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc5hi, dXj5hi, #(12-1)
14160c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo             ;// Row 3 * scale 3
14170c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
14180c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo             ;// Row 5 * scale 5
14190c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
14200c1bc742181ded4930842b46e9507372f0b1b963James Dong
14210c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 3 & 5
14220c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
14230c1bc742181ded4930842b46e9507372f0b1b963James Dong            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
14240c1bc742181ded4930842b46e9507372f0b1b963James Dong            SUB         pSrc, pSrc, #16*2*2                     ;// Step pSrc back 64 bytes; presumably to row 4 of the input block - TODO confirm against caller
14250c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi7lo, qRes53lo                ;// Output i7
14260c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi7hi, qRes53hi
14270c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
14280c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
14290c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        qXj4, [pSrc @64]                        ;// Reload row 4 from memory; must follow the last read of qSrc5hi above if Src4 backs qXj4
14300c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi4lo, qRes53lo                ;// Output i4
14310c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi4hi, qRes53hi
14320c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc4lo, dXj4lo, #(12-1)                ;// Widen row 4
14330c1bc742181ded4930842b46e9507372f0b1b963James Dong            VSHLL       qSrc4hi, dXj4hi, #(12-1)
14340c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLD1        {qScale4lo, qScale4hi}, [pScale]        ;// Load scale row 4 (no writeback: pScale stays at row 4)
14350c1bc742181ded4930842b46e9507372f0b1b963James Dong            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants (overwrites pSrc)
14360c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo             ;// Row 4 * scale 4
14370c1bc742181ded4930842b46e9507372f0b1b963James Dong            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
14380c1bc742181ded4930842b46e9507372f0b1b963James Dong            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
14390c1bc742181ded4930842b46e9507372f0b1b963James Dong            ;// Row 4
14400c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
14410c1bc742181ded4930842b46e9507372f0b1b963James Dong            VMOVN       dXi1hi, qSrc4hi
14420c1bc742181ded4930842b46e9507372f0b1b963James Dong
14430c1bc742181ded4930842b46e9507372f0b1b963James Dong        MEND
14440c1bc742181ded4930842b46e9507372f0b1b963James Dong
14450c1bc742181ded4930842b46e9507372f0b1b963James Dong        END
1446