armCOMM_IDCT_s.h revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;//
2;// This confidential and proprietary software may be used only as
3;// authorised by a licensing agreement from ARM Limited
4;//   (C) COPYRIGHT 2004 ARM Limited
5;//       ALL RIGHTS RESERVED
6;// The entire notice above must be reproduced on all authorised
7;// copies and copies may only be made to the extent permitted
8;// by a licensing agreement from ARM Limited.
9;//
10;// IDCT_s.s
11;//
12;// Inverse DCT module
13;//
14;//
15;// ALGORITHM DESCRIPTION
16;//
17;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
18;// column and then a 1D IDCT for each row.
19;//
20;// The 8-point 1D IDCT is defined by
21;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
22;//
23;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
24;//   c(u,x) = cos( (2x+1)*u*pi/16 )
25;//
26;// We compute the 8-point 1D IDCT using the reverse of
27;// the Arai-Agui-Nakajima flow graph which we split into
28;// 5 stages named in reverse order to identify with the
29;// forward DCT. Direct inversion of the forward formulae
30;// in file FDCT_s.s gives:
31;//
32;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
33;//             [ A(0) = 2*sqrt(2)
34;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
35;//
36;// IStage 4:   i0 = j0             i1 = j4
37;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
38;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
39;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
40;//
41;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
42;//             h2 = (i2*sqrt2)-i3  h3 = i3
43;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
44;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
45;//             [ The above two lines rotate by -(pi/8) ]
46;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
47;//
48;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
49;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
50;//             g7 = h7             g6 = h6 - h7
51;//             g5 = h5 - g6        g4 = h4 - g5
52;//
53;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
54;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
55;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
56;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
57;//
58;// Note that most coefficients are halved 3 times during the
59;// above calculation. We can rescale the algorithm dividing
60;// the input by 8 to remove the halvings.
61;//
62;// IStage 5:   j(u) = T(u)*A(u)/8
63;//
64;// IStage 4:   i0 = j0             i1 = j4
65;//             i3 = j2 + j6        i2 = j2 - j6
66;//             i7 = j5 + j3        i4 = j5 - j3
67;//             i5 = j1 + j7        i6 = j1 - j7
68;//
69;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
70;//             h2 = (i2*sqrt2)-i3  h3 = i3
71;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
72;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
73;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
74;//
75;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
76;//             g1 = h1 + h2        g2 = h1 - h2
77;//             g7 = h7             g6 = h6 - h7
78;//             g5 = h5 - g6        g4 = h4 - g5
79;//
80;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
81;//             f1 = g1 + g6        f6 = g1 - g6
82;//             f2 = g2 + g5        f5 = g2 - g5
83;//             f3 = g3 + g4        f4 = g3 - g4
84;//
85;// Note:
86;// 1. The scaling by A(u)/8 can often be combined with inverse
87;//    quantization. The column and row scalings can be combined.
88;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
89;//    to the above code but is otherwise identical.
90;// 3. The rotation by -pi/8 can be performed using three multiplies
91;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
92;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
93;// 4. If |T(u)|<=1 then from the IDCT definition,
94;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
95;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
96;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
97;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
98;//            = (approx)2.64
99;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
100;//    The table below shows input patterns generating the maximum
101;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
102;//    InputPattern      Max |f(x)|
103;//      PPPPPPPP        |f0| =  2.64
104;//      PPPMMMMM        |f1| =  2.64
105;//      PPMMMPPP        |f2| =  2.64
106;//      PPMMPPMM        |f3| =  2.64
107;//      PMMPPMMP        |f4| =  2.64
108;//      PMMPMMPM        |f5| =  2.64
109;//      PMPPMPMP        |f6| =  2.64
110;//      PMPMPMPM        |f7| =  2.64
111;//   Note that this input pattern is the transpose of the
112;//   corresponding max input pattern for the FDCT.
113
114;// Arguments
115
116pSrc    RN 0    ;// r0: source data buffer
117Stride  RN 1    ;// r1: destination stride in bytes (ARM1136JS path saves it only when $stride="s", then reuses r1)
118pDest   RN 2    ;// r2: destination data buffer
119pScale  RN 3    ;// r3: pointer to scaling table (ARM1136JS row pass reuses r3 to hold the saved stride)
120
121
122        ;// DCT Inverse Macro
123        ;// The DCT code should be parametrized according
124        ;// to the following inputs:
125        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
126        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
127        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
128        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
129        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
130        ;//
131        ;// Inputs:
132        ;// pSrc   = r0 = Pointer to input data
133        ;//               Range is -256 to +255 (9-bit)
134        ;// Stride = r1 = Stride between output lines, in bytes
135        ;// pDest  = r2 = Pointer to output data
136        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
137
138
139
140        MACRO
141        M_IDCT  $outsize, $inscale, $stride
142        LCLA    SHIFT
143
144
145        IF ARM1136JS
146
147;// REGISTER ALLOCATION
148;// This is hard since we have 8 values, 9 free registers and each
149;// butterfly requires a temporary register. We also want to
150;// maintain register order so we can use LDM/STM. The table below
151;// summarises the register allocation that meets all these criteria.
152;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
153;//
154;// r1  a01     g0  h0
155;// r4  b01 f0  g1  h1  i0
156;// r5  a23 f1  g2      i1
157;// r6  b23 f2  g3  h2  i2
158;// r7  a45 f3      h3  i3
159;// r8  b45 f4  g4  h4  i4
160;// r9  a67 f5  g5  h5  i5
161;// r10 b67 f6  g6  h6  i6
162;// r11     f7  g7  h7  i7
163;//
164ra01    RN 1
165rb01    RN 4
166ra23    RN 5
167rb23    RN 6
168ra45    RN 7
169rb45    RN 8
170ra67    RN 9
171rb67    RN 10
172rtmp    RN 11
173csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
174LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
175;// Transpose allocation
176xft     RN ra01
177xf0     RN rb01
178xf1     RN ra23
179xf2     RN rb23
180xf3     RN ra45
181xf4     RN rb45
182xf5     RN ra67
183xf6     RN rb67
184xf7     RN rtmp
185;// IStage 1 allocation
186xg0     RN xft
187xg1     RN xf0
188xg2     RN xf1
189xg3     RN xf2
190xgt     RN xf3
191xg4     RN xf4
192xg5     RN xf5
193xg6     RN xf6
194xg7     RN xf7
195;// IStage 2 allocation
196xh0     RN xg0
197xh1     RN xg1
198xht     RN xg2
199xh2     RN xg3
200xh3     RN xgt
201xh4     RN xg4
202xh5     RN xg5
203xh6     RN xg6
204xh7     RN xg7
205;// IStage 3,4 allocation
206xit     RN xh0
207xi0     RN xh1
208xi1     RN xht
209xi2     RN xh2
210xi3     RN xh3
211xi4     RN xh4
212xi5     RN xh5
213xi6     RN xh6
214xi7     RN xh7
215
;// Prologue of the ARM1136JS path. M_STR / M_ADR are macros defined outside
;// this chunk; from their use here they appear to save a register into a
;// named local and to form the address of a named local respectively
;// (NOTE(review): confirm against the macro definitions).
216        M_STR   pDest,  ppDest          ;// keep caller's pDest; reloaded with M_LDR after the column pass
217        IF "$stride"="s"
218            M_STR   Stride, pStride     ;// keep run-time stride; r1 is reused as ra01/xft below
219        ENDIF
220        M_ADR   pDest,  pBlk            ;// column pass stores its transposed output starting at pBlk
221        LDR     csPiBy8, =0x30fc7642    ;// [ sin(pi/8)@Q15 , cos(pi/8)@Q15 ]
222        LDR     LoopRR2, =0x00005a82    ;// [ loop counter = 0 , (1/sqrt(2))@Q15 ]
223
;// Column pass (ARM1136JS). Each iteration handles two adjacent columns,
;// packed as two signed 16-bit halfwords per 32-bit register:
;//   - rows are read from pSrc at 16-byte spacing; pSrc advances by 4 bytes
;//   - inputs are scaled by the table at pScale (format chosen by $inscale)
;//   - if all odd-row words are zero, control goes to v6OddZero$_F
;//   - otherwise IStage 4..1 run and the result is stored transposed,
;//     8 words (32 bytes) per iteration, through pDest
;// The loop counter lives in the top bits of LoopRR2: ADDS #2<<29 carries
;// out on the 4th iteration, and BCC repeats while carry is clear. None of
;// the instructions between the ADDS and the BCC is a flag-setting form.
224v6_idct_col$_F
225        ;// Load even values
226        LDR     xi4, [pSrc], #4  ;// j0
227        LDR     xi5, [pSrc, #4*16-4]  ;// j4
228        LDR     xi6, [pSrc, #2*16-4]  ;// j2
229        LDR     xi7, [pSrc, #6*16-4]  ;// j6
230
231        ;// Scale Even Values
232        IF "$inscale"="s16" ;// 16x16 mul
233SHIFT       SETA    12
234            LDR     xi0, [pScale], #4
235            LDR     xi1, [pScale, #4*16-4]
236            LDR     xi2, [pScale, #2*16-4]
237            MOV     xit, #1<<(SHIFT-1)      ;// rounding term added before the >>SHIFT
238            SMLABB  xi3, xi0, xi4, xit
239            SMLATT  xi4, xi0, xi4, xit
240            SMLABB  xi0, xi1, xi5, xit
241            SMLATT  xi5, xi1, xi5, xit
242            MOV     xi3, xi3, ASR #SHIFT
243            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
244            LDR     xi3, [pScale, #6*16-4]
245            SMLABB  xi1, xi2, xi6, xit
246            SMLATT  xi6, xi2, xi6, xit
247            MOV     xi0, xi0, ASR #SHIFT
248            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
249            SMLABB  xi2, xi3, xi7, xit
250            SMLATT  xi7, xi3, xi7, xit
251            MOV     xi1, xi1, ASR #SHIFT
252            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
253            MOV     xi2, xi2, ASR #SHIFT
254            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
255        ENDIF
256        IF "$inscale"="s32" ;// 32x16 mul
257SHIFT       SETA    (12+8-16)
258            MOV     xit, #1<<(SHIFT-1)      ;// rounding term added before the >>SHIFT
259            LDR     xi0, [pScale], #8
260            LDR     xi1, [pScale, #0*32+4-8]
261            LDR     xi2, [pScale, #4*32-8]
262            LDR     xi3, [pScale, #4*32+4-8]
263            SMLAWB  xi0, xi0, xi4, xit
264            SMLAWT  xi1, xi1, xi4, xit
265            SMLAWB  xi2, xi2, xi5, xit
266            SMLAWT  xi3, xi3, xi5, xit
267            MOV     xi0, xi0, ASR #SHIFT
268            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
269            MOV     xi2, xi2, ASR #SHIFT
270            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
271            LDR     xi0, [pScale, #2*32-8]
272            LDR     xi1, [pScale, #2*32+4-8]
273            LDR     xi2, [pScale, #6*32-8]
274            LDR     xi3, [pScale, #6*32+4-8]
275            SMLAWB  xi0, xi0, xi6, xit
276            SMLAWT  xi1, xi1, xi6, xit
277            SMLAWB  xi2, xi2, xi7, xit
278            SMLAWT  xi3, xi3, xi7, xit
279            MOV     xi0, xi0, ASR #SHIFT
280            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
281            MOV     xi2, xi2, ASR #SHIFT
282            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
283        ENDIF
284
285        ;// Load odd values
286        LDR     xi0, [pSrc, #1*16-4]      ;// j1
287        LDR     xi1, [pSrc, #7*16-4]      ;// j7
288        LDR     xi2, [pSrc, #5*16-4]      ;// j5
289        LDR     xi3, [pSrc, #3*16-4]      ;// j3
290
291        IF  {TRUE}
292            ;// shortcut if odd values 0
293            TEQ     xi0, #0
294            TEQEQ   xi1, #0
295            TEQEQ   xi2, #0
296            TEQEQ   xi3, #0
297            BEQ     v6OddZero$_F
298        ENDIF
299
300        ;// Store scaled even values
301        STMIA   pDest, {xi4, xi5, xi6, xi7}
302
303        ;// Scale odd values
304        IF "$inscale"="s16"
305            ;// Perform AAN Scale
306            LDR     xi4, [pScale, #1*16-4]
307            LDR     xi5, [pScale, #7*16-4]
308            LDR     xi6, [pScale, #5*16-4]
309            SMLABB  xi7, xi0, xi4, xit
310            SMLATT  xi0, xi0, xi4, xit
311            SMLABB  xi4, xi1, xi5, xit
312            SMLATT  xi1, xi1, xi5, xit
313            MOV     xi7, xi7, ASR #SHIFT
314            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
315            LDR     xi7, [pScale, #3*16-4]
316            SMLABB  xi5, xi2, xi6, xit
317            SMLATT  xi2, xi2, xi6, xit
318            MOV     xi4, xi4, ASR #SHIFT
319            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
320            SMLABB  xi6, xi3, xi7, xit
321            SMLATT  xi3, xi3, xi7, xit
322            MOV     xi5, xi5, ASR #SHIFT
323            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
324            MOV     xi6, xi6, ASR #SHIFT
325            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
326        ENDIF
327        IF "$inscale"="s32" ;// 32x16 mul
328            LDR     xi4, [pScale, #1*32-8]
329            LDR     xi5, [pScale, #1*32+4-8]
330            LDR     xi6, [pScale, #7*32-8]
331            LDR     xi7, [pScale, #7*32+4-8]
332            SMLAWB  xi4, xi4, xi0, xit
333            SMLAWT  xi5, xi5, xi0, xit
334            SMLAWB  xi6, xi6, xi1, xit
335            SMLAWT  xi7, xi7, xi1, xit
336            MOV     xi4, xi4, ASR #SHIFT
337            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
338            MOV     xi6, xi6, ASR #SHIFT
339            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
340            LDR     xi4, [pScale, #5*32-8]
341            LDR     xi5, [pScale, #5*32+4-8]
342            LDR     xi6, [pScale, #3*32-8]
343            LDR     xi7, [pScale, #3*32+4-8]
344            SMLAWB  xi4, xi4, xi2, xit
345            SMLAWT  xi5, xi5, xi2, xit
346            SMLAWB  xi6, xi6, xi3, xit
347            SMLAWT  xi7, xi7, xi3, xit
348            MOV     xi4, xi4, ASR #SHIFT
349            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
350            MOV     xi6, xi6, ASR #SHIFT
351            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
352        ENDIF
353
354        LDR     xit, =0x00010001        ;// rounding constant
355        SADD16 xi5, xi0, xi1           ;// j1+j7 (not yet halved)
356        SHADD16 xi5, xi5, xit           ;// (j1+j7+1)/2, i.e. halved with rounding
357
358        SSUB16  xi6, xi0, xi1           ;// j1-j7
359        SADD16 xi7, xi2, xi3           ;// j5+j3 (not yet halved)
360        SHADD16 xi7, xi7, xit           ;// (j5+j3+1)/2, i.e. halved with rounding
361
362        SSUB16  xi4, xi2, xi3           ;// j5-j3
363
364        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
365
366        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
367        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
368
369        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
370        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
371        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
372        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
373
374        SMULBB  xi1, xi3, LoopRR2
375        SMULTB  xi3, xi3, LoopRR2
376
377        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
378        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
379        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
380
381        ;// xi0,xi1,xi2,xi3 now free
382        ;// IStage 4,3, rows 2to3 x1/2
383
384        MOV     xi3, xi3, LSL #1
385        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
386        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
387
388        ;// IStage 2, rows4to7
389        SSUB16  xg6, xh6, xh7
390        SSUB16  xg5, xh5, xg6
391        SSUB16  xg4, xh4, xg5
392
393        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
394
395        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
396
397        SMULBB  xi0, xi2, LoopRR2
398        SMULTB  xi2, xi2, LoopRR2
399
400        MOV     xi2, xi2, LSL #1
401        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
402
403        ;// xi0, xi1 now free
404        ;// IStage 4,3 rows 0to1 x 1/2
405        LDRD    xi0, [pDest]            ;// j0, j4 scaled
406        SSUB16  xh2, xh2, xi3
407        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
408
409        SHADD16 xh0, xi0, xi1
410        SHSUB16 xh1, xi0, xi1
411
412        ;// IStage 2 rows 0to3 x 1/2
413        SHSUB16 xg2, xh1, xh2
414        SHADD16 xg1, xh1, xh2
415        SHSUB16 xg3, xh0, xh3
416        SHADD16 xg0, xh0, xh3
417
418        ;// IStage 1 all rows
419        SADD16  xf3, xg3, xg4
420        SSUB16  xf4, xg3, xg4
421        SADD16  xf2, xg2, xg5
422        SSUB16  xf5, xg2, xg5
423        SADD16  xf1, xg1, xg6
424        SSUB16  xf6, xg1, xg6
425        SADD16  xf0, xg0, xg7
426        SSUB16  xf7, xg0, xg7
427
428        ;// Transpose, store and loop
429        PKHBT   ra01, xf0, xf1, LSL #16
430        PKHTB   rb01, xf1, xf0, ASR #16
431
432        PKHBT   ra23, xf2, xf3, LSL #16
433        PKHTB   rb23, xf3, xf2, ASR #16
434
435        PKHBT   ra45, xf4, xf5, LSL #16
436        PKHTB   rb45, xf5, xf4, ASR #16
437
438        PKHBT   ra67, xf6, xf7, LSL #16
439        STMIA   pDest!, {ra01, ra23, ra45, ra67}
440        PKHTB   rb67, xf7, xf6, ASR #16
441        STMIA   pDest!, {rb01, rb23, rb45, rb67}
442        BCC     v6_idct_col$_F          ;// repeat until the ADDS above carries out (4th pass)
443
444        SUB     pSrc, pDest, #(64*2)    ;// row pass reads back the 128 bytes just written
445        M_LDR   pDest, ppDest           ;// reload the pDest saved with M_STR in the prologue
446        IF "$stride"="s"
447            M_LDR   pScale, pStride     ;// pScale (r3) now carries the saved stride
448        ENDIF
449        B       v6_idct_row$_F
450
;// Column-pass shortcut, entered from v6_idct_col$_F when all four odd-row
;// input words (j1, j7, j5, j3) are zero. On entry xi4..xi7 hold the scaled
;// even inputs j0, j4, j2, j6 in registers; on this path they are never
;// stored to pDest. With the odd half zero, IStage 1 reduces to
;// f(k) = f(7-k) = g(k), so it is done with plain MOVs.
;// The block ends by looping back to v6_idct_col$_F or, after the 4th pass,
;// setting up pSrc/pDest (and pScale = stride) for the code that follows.
451v6OddZero$_F
452        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
453        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
454
455        SMULBB  xi0, xi2, LoopRR2
456        SMULTB  xi2, xi2, LoopRR2
457
458        MOV     xi2, xi2, LSL #1
459        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
460        SSUB16  xh2, xh2, xi3
461
462        ;// xi0, xi1 now free
463        ;// IStage 4,3 rows 0to1 x 1/2
464
465        SHADD16 xh0, xi4, xi5
466        SHSUB16 xh1, xi4, xi5
467
468        ;// IStage 2 rows 0to3 x 1/2
469        SHSUB16 xg2, xh1, xh2
470        SHADD16 xg1, xh1, xh2
471        SHSUB16 xg3, xh0, xh3
472        SHADD16 xg0, xh0, xh3
473
474        ;// IStage 1 all rows
475        ;// Order matters: each xgN shares a register with a lower-numbered xf
476        ;// (xg3=xf2, xg2=xf1, xg1=xf0, xg0=xft), so copy from f3/f4 down to f0/f7.
475        MOV  xf3, xg3
476        MOV  xf4, xg3
477        MOV  xf2, xg2
478        MOV  xf5, xg2
479        MOV  xf1, xg1
480        MOV  xf6, xg1
481        MOV  xf0, xg0
482        MOV  xf7, xg0
483
484        ;// Transpose
485        PKHBT   ra01, xf0, xf1, LSL #16
486        PKHTB   rb01, xf1, xf0, ASR #16
487
488        PKHBT   ra23, xf2, xf3, LSL #16
489        PKHTB   rb23, xf3, xf2, ASR #16
490
491        PKHBT   ra45, xf4, xf5, LSL #16
492        PKHTB   rb45, xf5, xf4, ASR #16
493
494        PKHBT   ra67, xf6, xf7, LSL #16
495        PKHTB   rb67, xf7, xf6, ASR #16
496
497        STMIA   pDest!, {ra01, ra23, ra45, ra67}
498        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
499        STMIA   pDest!, {rb01, rb23, rb45, rb67}
500
501        BCC     v6_idct_col$_F          ;// repeat until the ADDS above carries out (4th pass)
502        SUB     pSrc, pDest, #(64*2)    ;// point pSrc at the 128 bytes just written
503        M_LDR   pDest, ppDest           ;// reload the pDest saved with M_STR in the prologue
504        IF "$stride"="s"
505            M_LDR   pScale, pStride     ;// pScale (r3) now carries the saved stride
506        ENDIF
507
508
;// Row pass (ARM1136JS). Reads the column-pass output through pSrc (lines at
;// 16-byte spacing, pSrc advances 4 bytes per iteration) and produces two
;// output lines per iteration, written at pDest and pDest+stride.
;// The stride is either the register pScale (when $stride="s") or the
;// assemble-time immediate $stride. Output format depends on $outsize:
;//   "u8"  : USAT16 #8 then packed to bytes, 2 words per line
;//   "s9"  : SSAT16 #9, 16-bit values, 4 words per line
;//   "s16" : no saturation, 16-bit values, 4 words per line
;// Loop control matches the column pass: ADDS #2<<29 on LoopRR2 carries out
;// on the 4th iteration and the final BCC repeats while carry is clear.
509v6_idct_row$_F
510        ;// IStage 4,3, rows4to7 x1/4
511        LDR     xit, =0x00010001        ;// rounding constant
512        LDR     xi0, [pSrc, #1*16]      ;// j1
513        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
514        LDR     xi2, [pSrc, #5*16]      ;// j5
515        LDR     xi3, [pSrc, #3*16]      ;// j3
516
517        SHADD16 xi1, xi1, xit           ;// 2*j7
518        SHADD16 xi1, xi1, xit           ;// j7
519
520        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
521        SSUB16  xi6, xi0, xi1           ;// j1-j7
522        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
523        SSUB16  xi4, xi2, xi3           ;// j5-j3
524
525        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
526
527        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
528        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
529
530        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
531        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
532        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
533        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
534
535        SMULBB  xi1, xi3, LoopRR2
536        SMULTB  xi3, xi3, LoopRR2
537
538        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
539        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
540        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
541
542        MOV     xi3, xi3, LSL #1
543        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
544
545        ;// xi0,xi1,xi2,xi3 now free
546        ;// IStage 4,3, rows 2to3 x1/2
547
548        LDR     xi0, [pSrc, #2*16]      ;// j2
549        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
550
551        ;// IStage 2, rows4to7
552        SSUB16  xg6, xh6, xh7
553        SSUB16  xg5, xh5, xg6
554        SSUB16  xg4, xh4, xg5
555
556        SHADD16 xi1, xi1, xit           ;// j6
557        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
558        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
559
560        SMULBB  xi0, xi2, LoopRR2
561        SMULTB  xi2, xi2, LoopRR2
562
563        MOV     xi2, xi2, LSL #1
564
565        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
566
567        ;// xi0, xi1 now free
568        ;// IStage 4,3 rows 0to1 x 1/2
569        LDR     xi1, [pSrc, #4*16]      ;// j4
570        LDR     xi0, [pSrc], #4         ;// j0
571
572        SSUB16  xh2, xh2, xi3
573        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
574
575        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round (adds 4 to each halfword of j0)
576        SHADD16 xh0, xi0, xi1           ;// of DC result
577        SHSUB16 xh1, xi0, xi1
578
579        ;// IStage 2 rows 0to3 x 1/2
580        SHSUB16 xg2, xh1, xh2
581        SHADD16 xg1, xh1, xh2
582        SHSUB16 xg3, xh0, xh3
583        SHADD16 xg0, xh0, xh3
584
585        ;// IStage 1 all rows
586        SHADD16 xf3, xg3, xg4
587        SHSUB16 xf4, xg3, xg4
588        SHADD16 xf2, xg2, xg5
589        SHSUB16 xf5, xg2, xg5
590        SHADD16 xf1, xg1, xg6
591        SHSUB16 xf6, xg1, xg6
592        SHADD16 xf0, xg0, xg7
593        SHSUB16 xf7, xg0, xg7
594
595        ;// Saturate
596        IF ("$outsize"="u8")
597            USAT16  xf0, #8, xf0
598            USAT16  xf1, #8, xf1
599            USAT16  xf2, #8, xf2
600            USAT16  xf3, #8, xf3
601            USAT16  xf4, #8, xf4
602            USAT16  xf5, #8, xf5
603            USAT16  xf6, #8, xf6
604            USAT16  xf7, #8, xf7
605        ENDIF
606        IF ("$outsize"="s9")
607            SSAT16  xf0, #9, xf0
608            SSAT16  xf1, #9, xf1
609            SSAT16  xf2, #9, xf2
610            SSAT16  xf3, #9, xf3
611            SSAT16  xf4, #9, xf4
612            SSAT16  xf5, #9, xf5
613            SSAT16  xf6, #9, xf6
614            SSAT16  xf7, #9, xf7
615        ENDIF
616
617        ;// Transpose to Row, Pack and store
618        IF ("$outsize"="u8")
619            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
620            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
621            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
622            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
623            PKHBT   ra01, xf0, xf2, LSL #16
624            PKHTB   rb01, xf2, xf0, ASR #16
625            PKHBT   ra23, xf4, xf6, LSL #16
626            PKHTB   rb23, xf6, xf4, ASR #16
627            STMIA   pDest, {ra01, ra23}
628            IF "$stride"="s"
629                ADD     pDest, pDest, pScale   ;// pScale holds the run-time stride here
630                STMIA   pDest, {rb01, rb23}
631                ADD     pDest, pDest, pScale
632            ELSE
633                ADD     pDest, pDest, #($stride)
634                STMIA   pDest, {rb01, rb23}
635                ADD     pDest, pDest, #($stride)
636            ENDIF
637        ENDIF
638        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
639            PKHBT   ra01, xf0, xf1, LSL #16
640            PKHTB   rb01, xf1, xf0, ASR #16
641
642            PKHBT   ra23, xf2, xf3, LSL #16
643            PKHTB   rb23, xf3, xf2, ASR #16
644
645            PKHBT   ra45, xf4, xf5, LSL #16
646            PKHTB   rb45, xf5, xf4, ASR #16
647
648            PKHBT   ra67, xf6, xf7, LSL #16
649            PKHTB   rb67, xf7, xf6, ASR #16
650
651            STMIA   pDest, {ra01, ra23, ra45, ra67}
652            IF "$stride"="s"
653                ADD     pDest, pDest, pScale   ;// pScale holds the run-time stride here
654                STMIA   pDest, {rb01, rb23, rb45, rb67}
655                ADD     pDest, pDest, pScale
656            ELSE
657                ADD     pDest, pDest, #($stride)
658                STMIA   pDest, {rb01, rb23, rb45, rb67}
659                ADD     pDest, pDest, #($stride)
660            ENDIF
661        ENDIF
662
663        BCC     v6_idct_row$_F          ;// repeat until the ADDS above carries out (4th pass)
664        ENDIF ;// ARM1136JS
665
666
667        IF CortexA8
668
669Src0            EQU  7
670Src1            EQU  8
671Src2            EQU  9
672Src3            EQU  10
673Src4            EQU  11
674Src5            EQU  12
675Src6            EQU  13
676Src7            EQU  14
677Tmp             EQU  15
678
679qXj0            QN Src0.S16
680qXj1            QN Src1.S16
681qXj2            QN Src2.S16
682qXj3            QN Src3.S16
683qXj4            QN Src4.S16
684qXj5            QN Src5.S16
685qXj6            QN Src6.S16
686qXj7            QN Src7.S16
687qXjt            QN Tmp.S16
688
689dXj0lo          DN (Src0*2).S16
690dXj0hi          DN (Src0*2+1).S16
691dXj1lo          DN (Src1*2).S16
692dXj1hi          DN (Src1*2+1).S16
693dXj2lo          DN (Src2*2).S16
694dXj2hi          DN (Src2*2+1).S16
695dXj3lo          DN (Src3*2).S16
696dXj3hi          DN (Src3*2+1).S16
697dXj4lo          DN (Src4*2).S16
698dXj4hi          DN (Src4*2+1).S16
699dXj5lo          DN (Src5*2).S16
700dXj5hi          DN (Src5*2+1).S16
701dXj6lo          DN (Src6*2).S16
702dXj6hi          DN (Src6*2+1).S16
703dXj7lo          DN (Src7*2).S16
704dXj7hi          DN (Src7*2+1).S16
705dXjtlo          DN (Tmp*2).S16
706dXjthi          DN (Tmp*2+1).S16
707
708qXi0            QN qXj0
709qXi1            QN qXj4
710qXi2            QN qXj2
711qXi3            QN qXj7
712qXi4            QN qXj5
713qXi5            QN qXjt
714qXi6            QN qXj1
715qXi7            QN qXj6
716qXit            QN qXj3
717
718dXi0lo          DN dXj0lo
719dXi0hi          DN dXj0hi
720dXi1lo          DN dXj4lo
721dXi1hi          DN dXj4hi
722dXi2lo          DN dXj2lo
723dXi2hi          DN dXj2hi
724dXi3lo          DN dXj7lo
725dXi3hi          DN dXj7hi
726dXi4lo          DN dXj5lo
727dXi4hi          DN dXj5hi
728dXi5lo          DN dXjtlo
729dXi5hi          DN dXjthi
730dXi6lo          DN dXj1lo
731dXi6hi          DN dXj1hi
732dXi7lo          DN dXj6lo
733dXi7hi          DN dXj6hi
734dXitlo          DN dXj3lo
735dXithi          DN dXj3hi
736
737qXh0            QN qXit
738qXh1            QN qXi0
739qXh2            QN qXi2
740qXh3            QN qXi3
741qXh4            QN qXi7
742qXh5            QN qXi5
743qXh6            QN qXi4
744qXh7            QN qXi1
745qXht            QN qXi6
746
747dXh0lo          DN dXitlo
748dXh0hi          DN dXithi
749dXh1lo          DN dXi0lo
750dXh1hi          DN dXi0hi
751dXh2lo          DN dXi2lo
752dXh2hi          DN dXi2hi
753dXh3lo          DN dXi3lo
754dXh3hi          DN dXi3hi
755dXh4lo          DN dXi7lo
756dXh4hi          DN dXi7hi
757dXh5lo          DN dXi5lo
758dXh5hi          DN dXi5hi
759dXh6lo          DN dXi4lo
760dXh6hi          DN dXi4hi
761dXh7lo          DN dXi1lo
762dXh7hi          DN dXi1hi
763dXhtlo          DN dXi6lo
764dXhthi          DN dXi6hi
765
766qXg0            QN qXh2
767qXg1            QN qXht
768qXg2            QN qXh1
769qXg3            QN qXh0
770qXg4            QN qXh4
771qXg5            QN qXh5
772qXg6            QN qXh6
773qXg7            QN qXh7
774qXgt            QN qXh3
775
776qXf0            QN qXg6
777qXf1            QN qXg5
778qXf2            QN qXg4
779qXf3            QN qXgt
780qXf4            QN qXg3
781qXf5            QN qXg2
782qXf6            QN qXg1
783qXf7            QN qXg0
784qXft            QN qXg7
785
786
787qXt0            QN 1.S32
788qXt1            QN 2.S32
789qT0lo           QN 1.S32
790qT0hi           QN 2.S32
791qT1lo           QN 3.S32
792qT1hi           QN 4.S32
793qScalelo        QN 5.S32        ;// used to read post scale values
794qScalehi        QN 6.S32
795qTemp0          QN 5.S32
796qTemp1          QN 6.S32
797
798
799Scale1          EQU 6
800Scale2          EQU 15
801qScale1         QN Scale1.S16
802qScale2         QN Scale2.S16
803dScale1lo       DN (Scale1*2).S16
804dScale1hi       DN (Scale1*2+1).S16
805dScale2lo       DN (Scale2*2).S16
806dScale2hi       DN (Scale2*2+1).S16
807
808dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
809InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
810S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
811C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
812
813pTemp           RN 12
814
815
816        IMPORT  armCOMM_IDCTCoef
817
;// Cortex-A8 (NEON) first 1-D pass. The four VLD1s load 128 bytes from pSrc
;// into qXj0..qXj7 (eight S16 lanes each), advancing pSrc with writeback.
;// Inputs are then prescaled by M_IDCT_PRESCALE16 / M_IDCT_PRESCALE32
;// (macros defined outside this chunk) and IStage 3, 2 and 1 are applied
;// lane-wise across whole Q registers, leaving the results in qXf0..qXf7.
;// InvSqrt2, S and C are lanes of dCoefs; this block does not load dCoefs,
;// so it is presumably set up by the prescale macro (NOTE(review): confirm).
818        VLD1        {qXj0,qXj1}, [pSrc @64]!
819        VLD1        {qXj2,qXj3}, [pSrc @64]!
820        VLD1        {qXj4,qXj5}, [pSrc @64]!
821        VLD1        {qXj6,qXj7}, [pSrc @64]!
822
823        ;// Load PreScale and multiply with Src
824        ;// IStage 4
825
826        IF "$inscale"="s16"                         ;// 16X16 Mul
827            M_IDCT_PRESCALE16
828        ENDIF
829
830        IF "$inscale"="s32"                         ;// 32X32 Mul
831            M_IDCT_PRESCALE32
832        ENDIF
833
834        ;// IStage 3
835        VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
836        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
837        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
838        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
839        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
840        VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
841        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
842
843        VMULL       qXt0, dXi4lo, C                 ;// c*i4
844        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
845        VMULL       qXt1, dXi4hi, C
846        VMLAL       qXt1, dXi6hi, S
847        VSHRN       dXh4lo, qXt0, #16               ;// h4
848        VSHRN       dXh4hi, qXt1, #16
849
850        VMULL       qXt0, dXi6lo, C                 ;// c*i6
851        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
852        VMULL       qXt1, dXi6hi, C
853        VMLSL       qXt1, dXi4hi, S
854        VSHRN       dXh6lo, qXt0, #16               ;// h6
855        VSHRN       dXh6hi, qXt1, #16
856
857        ;// IStage 2
858        VSUB        qXg6, qXh6, qXh7
859        VSUB        qXg5, qXh5, qXg6
860        VSUB        qXg4, qXh4, qXg5
861        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
862        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
863        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
864        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
865
866        ;// IStage 1 all rows
867        VADD        qXf3, qXg3, qXg4
868        VSUB        qXf4, qXg3, qXg4
869        VADD        qXf2, qXg2, qXg5
870        VSUB        qXf5, qXg2, qXg5
871        VADD        qXf1, qXg1, qXg6
872        VSUB        qXf6, qXg1, qXg6
873        VADD        qXf0, qXg0, qXg7
874        VSUB        qXf7, qXg0, qXg7
875
876        ;// Transpose, store and loop
877XTR0            EQU Src5
878XTR1            EQU Tmp
879XTR2            EQU Src6
880XTR3            EQU Src7
881XTR4            EQU Src3
882XTR5            EQU Src0
883XTR6            EQU Src1
884XTR7            EQU Src2
885XTRt            EQU Src4
886
887qA0             QN  XTR0.S32  ;// for XTRpose
888qA1             QN  XTR1.S32
889qA2             QN  XTR2.S32
890qA3             QN  XTR3.S32
891qA4             QN  XTR4.S32
892qA5             QN  XTR5.S32
893qA6             QN  XTR6.S32
894qA7             QN  XTR7.S32
895
896dB0             DN  XTR0*2+1      ;// for using VSWP
897dB1             DN  XTR1*2+1
898dB2             DN  XTR2*2+1
899dB3             DN  XTR3*2+1
900dB4             DN  XTR4*2
901dB5             DN  XTR5*2
902dB6             DN  XTR6*2
903dB7             DN  XTR7*2
904
905
906        VTRN        qXf0, qXf1
907        VTRN        qXf2, qXf3
908        VTRN        qXf4, qXf5
909        VTRN        qXf6, qXf7
910        VTRN        qA0, qA2
911        VTRN        qA1, qA3
912        VTRN        qA4, qA6
913        VTRN        qA5, qA7
914        VSWP        dB0, dB4
915        VSWP        dB1, dB5
916        VSWP        dB2, dB6
917        VSWP        dB3, dB7
918
919
920qYj0            QN qXf0
921qYj1            QN qXf1
922qYj2            QN qXf2
923qYj3            QN qXf3
924qYj4            QN qXf4
925qYj5            QN qXf5
926qYj6            QN qXf6
927qYj7            QN qXf7
928qYjt            QN qXft
929
930dYj0lo          DN (XTR0*2).S16
931dYj0hi          DN (XTR0*2+1).S16
932dYj1lo          DN (XTR1*2).S16
933dYj1hi          DN (XTR1*2+1).S16
934dYj2lo          DN (XTR2*2).S16
935dYj2hi          DN (XTR2*2+1).S16
936dYj3lo          DN (XTR3*2).S16
937dYj3hi          DN (XTR3*2+1).S16
938dYj4lo          DN (XTR4*2).S16
939dYj4hi          DN (XTR4*2+1).S16
940dYj5lo          DN (XTR5*2).S16
941dYj5hi          DN (XTR5*2+1).S16
942dYj6lo          DN (XTR6*2).S16
943dYj6hi          DN (XTR6*2+1).S16
944dYj7lo          DN (XTR7*2).S16
945dYj7hi          DN (XTR7*2+1).S16
946dYjtlo          DN (XTRt*2).S16
947dYjthi          DN (XTRt*2+1).S16
948
949qYi0            QN qYj0
950qYi1            QN qYj4
951qYi2            QN qYj2
952qYi3            QN qYj7
953qYi4            QN qYj5
954qYi5            QN qYjt
955qYi6            QN qYj1
956qYi7            QN qYj6
957qYit            QN qYj3
958
959dYi0lo          DN dYj0lo
960dYi0hi          DN dYj0hi
961dYi1lo          DN dYj4lo
962dYi1hi          DN dYj4hi
963dYi2lo          DN dYj2lo
964dYi2hi          DN dYj2hi
965dYi3lo          DN dYj7lo
966dYi3hi          DN dYj7hi
967dYi4lo          DN dYj5lo
968dYi4hi          DN dYj5hi
969dYi5lo          DN dYjtlo
970dYi5hi          DN dYjthi
971dYi6lo          DN dYj1lo
972dYi6hi          DN dYj1hi
973dYi7lo          DN dYj6lo
974dYi7hi          DN dYj6hi
975dYitlo          DN dYj3lo
976dYithi          DN dYj3hi
977
978qYh0            QN qYit
979qYh1            QN qYi0
980qYh2            QN qYi2
981qYh3            QN qYi3
982qYh4            QN qYi7
983qYh5            QN qYi5
984qYh6            QN qYi4
985qYh7            QN qYi1
986qYht            QN qYi6
987
988dYh0lo          DN dYitlo
989dYh0hi          DN dYithi
990dYh1lo          DN dYi0lo
991dYh1hi          DN dYi0hi
992dYh2lo          DN dYi2lo
993dYh2hi          DN dYi2hi
994dYh3lo          DN dYi3lo
995dYh3hi          DN dYi3hi
996dYh4lo          DN dYi7lo
997dYh4hi          DN dYi7hi
998dYh5lo          DN dYi5lo
999dYh5hi          DN dYi5hi
1000dYh6lo          DN dYi4lo
1001dYh6hi          DN dYi4hi
1002dYh7lo          DN dYi1lo
1003dYh7hi          DN dYi1hi
1004dYhtlo          DN dYi6lo
1005dYhthi          DN dYi6hi
1006
1007qYg0            QN qYh2
1008qYg1            QN qYht
1009qYg2            QN qYh1
1010qYg3            QN qYh0
1011qYg4            QN qYh4
1012qYg5            QN qYh5
1013qYg6            QN qYh6
1014qYg7            QN qYh7
1015qYgt            QN qYh3
1016
1017qYf0            QN qYg6
1018qYf1            QN qYg5
1019qYf2            QN qYg4
1020qYf3            QN qYgt
1021qYf4            QN qYg3
1022qYf5            QN qYg2
1023qYf6            QN qYg1
1024qYf7            QN qYg0
1025qYft            QN qYg7
1026
1027        VRSHR       qYj7, qYj7, #2
1028        VRSHR       qYj6, qYj6, #1
1029
1030        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
1031        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
1032        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
1033        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
1034        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
1035        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
1036
1037        VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
1038        ;// IStage 4,3 rows 0to1 x 1/2
1039
1040        MOV         pTemp, #0x4             ;// ensure correct round
1041        VDUP        qScale1, pTemp           ;// of DC result
1042        VADD        qYi0, qYi0, qScale1
1043
1044        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
1045        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
1046
1047        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
1048        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
1049        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
1050        VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
1051
1052        VMULL       qXt0, dYi4lo, C         ;// c*i4
1053        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
1054        VMULL       qXt1, dYi4hi, C
1055        VMLAL       qXt1, dYi6hi, S
1056        VSHRN       dYh4lo, qXt0, #16       ;// h4
1057        VSHRN       dYh4hi, qXt1, #16
1058
1059        VMULL       qXt0, dYi6lo, C         ;// c*i6
1060        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
1061        VMULL       qXt1, dYi6hi, C
1062        VMLSL       qXt1, dYi4hi, S
1063        VSHRN       dYh6lo, qXt0, #16       ;// h6
1064        VSHRN       dYh6hi, qXt1, #16
1065
1066        VSUB        qYg6, qYh6, qYh7
1067        VSUB        qYg5, qYh5, qYg6
1068        VSUB        qYg4, qYh4, qYg5
1069
1070        ;// IStage 2 rows 0to3 x 1/2
1071        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
1072        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
1073        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
1074        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
1075
1076
1077        ;// IStage 1 all rows
1078        VHADD        qYf3, qYg3, qYg4
1079        VHSUB        qYf4, qYg3, qYg4
1080        VHADD        qYf2, qYg2, qYg5
1081        VHSUB        qYf5, qYg2, qYg5
1082        VHADD        qYf1, qYg1, qYg6
1083        VHSUB        qYf6, qYg1, qYg6
1084        VHADD        qYf0, qYg0, qYg7
1085        VHSUB        qYf7, qYg0, qYg7
1086
1087YTR0            EQU Src0
1088YTR1            EQU Src4
1089YTR2            EQU Src1
1090YTR3            EQU Src2
1091YTR4            EQU Src7
1092YTR5            EQU Src5
1093YTR6            EQU Tmp
1094YTR7            EQU Src6
1095YTRt            EQU Src3
1096
1097qC0             QN  YTR0.S32                ;// for YTRpose
1098qC1             QN  YTR1.S32
1099qC2             QN  YTR2.S32
1100qC3             QN  YTR3.S32
1101qC4             QN  YTR4.S32
1102qC5             QN  YTR5.S32
1103qC6             QN  YTR6.S32
1104qC7             QN  YTR7.S32
1105
1106dD0             DN  YTR0*2+1                ;// for using VSWP
1107dD1             DN  YTR1*2+1
1108dD2             DN  YTR2*2+1
1109dD3             DN  YTR3*2+1
1110dD4             DN  YTR4*2
1111dD5             DN  YTR5*2
1112dD6             DN  YTR6*2
1113dD7             DN  YTR7*2
1114
1115        VTRN        qYf0, qYf1
1116        VTRN        qYf2, qYf3
1117        VTRN        qYf4, qYf5
1118        VTRN        qYf6, qYf7
1119        VTRN        qC0, qC2
1120        VTRN        qC1, qC3
1121        VTRN        qC4, qC6
1122        VTRN        qC5, qC7
1123        VSWP        dD0, dD4
1124        VSWP        dD1, dD5
1125        VSWP        dD2, dD6
1126        VSWP        dD3, dD7
1127
1128
1129dYf0U8          DN YTR0*2.U8
1130dYf1U8          DN YTR1*2.U8
1131dYf2U8          DN YTR2*2.U8
1132dYf3U8          DN YTR3*2.U8
1133dYf4U8          DN YTR4*2.U8
1134dYf5U8          DN YTR5*2.U8
1135dYf6U8          DN YTR6*2.U8
1136dYf7U8          DN YTR7*2.U8
1137
1138        ;//
1139        ;// Do saturation if outsize is other than S16
1140        ;//
1141
1142        IF ("$outsize"="u8")
1143            ;// Output range [0-255]
1144            VQMOVN            dYf0U8, qYf0
1145            VQMOVN            dYf1U8, qYf1
1146            VQMOVN            dYf2U8, qYf2
1147            VQMOVN            dYf3U8, qYf3
1148            VQMOVN            dYf4U8, qYf4
1149            VQMOVN            dYf5U8, qYf5
1150            VQMOVN            dYf6U8, qYf6
1151            VQMOVN            dYf7U8, qYf7
1152        ENDIF
1153
1154        IF ("$outsize"="s9")
1155            ;// Output range [-256 to +255]
1156            VQSHL            qYf0, qYf0, #16-9
1157            VQSHL            qYf1, qYf1, #16-9
1158            VQSHL            qYf2, qYf2, #16-9
1159            VQSHL            qYf3, qYf3, #16-9
1160            VQSHL            qYf4, qYf4, #16-9
1161            VQSHL            qYf5, qYf5, #16-9
1162            VQSHL            qYf6, qYf6, #16-9
1163            VQSHL            qYf7, qYf7, #16-9
1164
1165            VSHR             qYf0, qYf0, #16-9
1166            VSHR             qYf1, qYf1, #16-9
1167            VSHR             qYf2, qYf2, #16-9
1168            VSHR             qYf3, qYf3, #16-9
1169            VSHR             qYf4, qYf4, #16-9
1170            VSHR             qYf5, qYf5, #16-9
1171            VSHR             qYf6, qYf6, #16-9
1172            VSHR             qYf7, qYf7, #16-9
1173        ENDIF
1174
1175        ;// Store output depending on the Stride size
1176        IF "$stride"="s"
1177            VST1        qYf0, [pDest @64], Stride
1178            VST1        qYf1, [pDest @64], Stride
1179            VST1        qYf2, [pDest @64], Stride
1180            VST1        qYf3, [pDest @64], Stride
1181            VST1        qYf4, [pDest @64], Stride
1182            VST1        qYf5, [pDest @64], Stride
1183            VST1        qYf6, [pDest @64], Stride
1184            VST1        qYf7, [pDest @64]
1185        ELSE
1186            IF ("$outsize"="u8")
1187                VST1        dYf0U8, [pDest @64], #8
1188                VST1        dYf1U8, [pDest @64], #8
1189                VST1        dYf2U8, [pDest @64], #8
1190                VST1        dYf3U8, [pDest @64], #8
1191                VST1        dYf4U8, [pDest @64], #8
1192                VST1        dYf5U8, [pDest @64], #8
1193                VST1        dYf6U8, [pDest @64], #8
1194                VST1        dYf7U8, [pDest @64]
1195            ELSE
1196                ;// ("$outsize"="s9") or ("$outsize"="s16")
1197                VST1        qYf0, [pDest @64], #16
1198                VST1        qYf1, [pDest @64], #16
1199                VST1        qYf2, [pDest @64], #16
1200                VST1        qYf3, [pDest @64], #16
1201                VST1        qYf4, [pDest @64], #16
1202                VST1        qYf5, [pDest @64], #16
1203                VST1        qYf6, [pDest @64], #16
1204                VST1        qYf7, [pDest @64]
1205            ENDIF
1206
1207        ENDIF
1208
1209
1210
1211        ENDIF ;// CortexA8
1212
1213
1214
1215        MEND
1216
1217        ;// Scale TWO input rows with TWO rows of 16 bit scale values
1218        ;//
1219        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
1220        ;// input (eight S16 values per row) with two rows of scale values. It also
1221        ;// loads the next two rows of scale values from pScale when $LastRow is 0.
1222        ;//
        ;// Each input value is multiplied by its scale value into a 32-bit
        ;// product (VMULL) and narrowed back to 16 bits with a saturating,
        ;// rounding shift right by 12 (VQRSHRN #12). The result overwrites
        ;// the input register.
        ;//
1223        ;// Input Registers:
1224        ;//
1225        ;// $dAlo           - Input D register with first four S16 values of row n
1226        ;// $dAhi           - Input D register with next four S16 values of row n
1227        ;// $dBlo           - Input D register with first four S16 values of row n+1
1228        ;// $dBhi           - Input D register with next four S16 values of row n+1
1229        ;// pScale          - Pointer to the scale values for row n+2 (read only when $LastRow is 0)
1230        ;// qT0lo           - Temporary scratch register (product of $dAlo)
1231        ;// qT0hi           - Temporary scratch register (product of $dAhi)
1232        ;// qT1lo           - Temporary scratch register (product of $dBlo)
1233        ;// qT1hi           - Temporary scratch register (product of $dBhi)
1234        ;// dScale1lo       - Scale values for the first four values of row n
1235        ;// dScale1hi       - Scale values for the next four values of row n
1236        ;// dScale2lo       - Scale values for the first four values of row n+1
1237        ;// dScale2hi       - Scale values for the next four values of row n+1
1238        ;//
1239        ;// Input Flag
1240        ;//
1241        ;// $LastRow        - 0: load the next two rows of scale values from pScale
        ;//                   any other value: skip the load (last pair of rows)
1242        ;//
1243        ;// Output Registers:
1244        ;//
1245        ;// $dAlo           - Scaled output values (first four S16 of row n)
1246        ;// $dAhi           - Scaled output values (next four S16 of row n)
1247        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
1248        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
1249        ;// qScale1         - Scale values for row n+2 (only when $LastRow is 0)
1250        ;// qScale2         - Scale values for row n+3 (only when $LastRow is 0)
1251        ;// pScale          - Advanced by 32 bytes (two rows) when $LastRow is 0
1252        ;//
1253        MACRO
1254        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
1255        VMULL       qT0lo, $dAlo, dScale1lo      ;// row n,   first four: 32-bit products
1256        VMULL       qT0hi, $dAhi, dScale1hi      ;// row n,   next four
1257        VMULL       qT1lo, $dBlo, dScale2lo      ;// row n+1, first four
1258        VMULL       qT1hi, $dBhi, dScale2hi      ;// row n+1, next four
        ;// The current scale values have been consumed by the multiplies above,
        ;// so the scale registers can be refilled here.
        ;// NOTE(review): presumably dScale1lo/hi and dScale2lo/hi are the halves
        ;// of qScale1 and qScale2 - confirm against their definitions.
1259        IF "$LastRow"="0"
1260            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
1261            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
1262        ENDIF
1263        VQRSHRN       $dAlo, qT0lo, #12          ;// round, shift right by 12, saturate to 16 bits
1264        VQRSHRN       $dAhi, qT0hi, #12
1265        VQRSHRN       $dBlo, qT1lo, #12
1266        VQRSHRN       $dBhi, qT1hi, #12
1267        MEND
1268
1269        ;// Scale 8x8 block input values with 16 bit scale values
1270        ;//
1271        ;// This macro pre-scales an 8x8 block of input, two rows at a time,
1272        ;// using M_IDCT_SCALE16. It also performs the first stage of the IDCT:
        ;// the sum/difference butterflies on the row pairs (1,7), (2,6) and (5,3).
        ;// Sums are halved (VHADD); differences are left un-halved (VSUB).
        ;// It finally loads the inverse AAN constants into dCoefs.
1273        ;//
1274        ;// Input Registers:
1275        ;//
1276        ;// dXjnlo          - n th input D register with first four S16 values
1277        ;// dXjnhi          - n th input D register with next four S16 values
1278        ;// qXjn            - n th input Q register with eight S16 values
1279        ;// pScale          - Pointer to scale values (eight rows of eight 16 bit values)
1280        ;//
1281        ;// Output Registers:
1282        ;//
1283        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
        ;//                   (qXi2..qXi7 are written here; NOTE(review): qXi0/qXi1 are
        ;//                   presumably aliases of the scaled qXj0/qXj4 - confirm)
        ;// dCoefs          - Loaded from armCOMM_IDCTCoef
        ;// pSrc            - Overwritten with the address of armCOMM_IDCTCoef
        ;// pScale          - Advanced by 128 bytes (all eight scale rows consumed)
1284        ;//
1285        MACRO
1286        M_IDCT_PRESCALE16
1287        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
1288        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
1289        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1, load scale for row 2 & 3
1290        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3, load scale for row 4 & 5
1291        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5, load scale for row 6 & 7
1292        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, no further scale load
        ;// First-stage butterflies, interleaved with the constant-table load
1293        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
1294        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
1295        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
1296        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
1297        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
1298        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
1299        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
1300        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
1301        MEND
1302
1303
1304        ;// Scale 8x8 block input values with 32 bit scale values
1305        ;//
1306        ;// This macro pre-scales an 8x8 block of input with 32 bit scale values.
1307        ;// It also performs the first stage of the IDCT: the sum/difference
        ;// butterflies on the row pairs (1,7), (2,6) and (5,3), computed at 32 bit
        ;// precision before narrowing. Sums are halved (VHADD); differences are
        ;// left un-halved (VSUB). It finally loads the inverse AAN constants
        ;// into dCoefs.
        ;//
        ;// Per value: the S16 input is widened and shifted left by 11 (VSHLL),
        ;// then multiplied by the scale with VQRDMULH (rounded high half of the
        ;// doubled product), i.e. approximately (x * scale) >> 20 with rounding.
        ;// Results are narrowed with VMOVN, which keeps the low 16 bits of each
        ;// 32 bit lane without saturation.
        ;//
        ;// The scale, source and result names below alias a small set of Q
        ;// registers, so the instruction order in the body is significant: a
        ;// register is only reused after its previous contents are consumed.
1308        ;//
1309        ;// Input Registers:
1310        ;//
1311        ;// dXjnlo          - n th input D register with first four S16 values
1312        ;// dXjnhi          - n th input D register with next four S16 values
1313        ;// qXjn            - n th input Q register with eight S16 values
1314        ;// pScale          - Pointer to 32bit scale values in Q23 format
        ;//                   (eight rows of eight 32 bit values, 32 bytes per row)
        ;// pSrc            - Used to reload row 4 of the input from [pSrc - 64]
        ;//                   NOTE(review): assumes pSrc points 64 bytes past input
        ;//                   row 4 on entry - confirm against the caller
1315        ;//
1316        ;// Output Registers:
1317        ;//
1318        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
1319        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
        ;// dCoefs          - Loaded from armCOMM_IDCTCoef
        ;// pSrc            - Overwritten with the address of armCOMM_IDCTCoef
        ;// pScale          - Advanced by 128 bytes (left pointing at scale row 4)
        ;// pTemp           - Clobbered (left pointing at scale row 6)
1320        ;//
1321        MACRO
1322        M_IDCT_PRESCALE32
        ;// Scale values: two Q-register pairs (Q0:Q1 and Q2:Q3) shared by all rows
1323qScale0lo       QN 0.S32
1324qScale0hi       QN 1.S32
1325qScale1lo       QN 2.S32
1326qScale1hi       QN 3.S32
1327qScale2lo       QN qScale1lo
1328qScale2hi       QN qScale1hi
1329qScale3lo       QN qScale1lo
1330qScale3hi       QN qScale1hi
1331qScale4lo       QN qScale1lo
1332qScale4hi       QN qScale1hi
1333qScale5lo       QN qScale0lo
1334qScale5hi       QN qScale0hi
1335qScale6lo       QN qScale0lo
1336qScale6hi       QN qScale0hi
1337qScale7lo       QN qScale0lo
1338qScale7hi       QN qScale0hi
1339
        ;// Widened (32 bit) source rows: two pairs (Q4:Q5 and Q6:Src4) shared by all rows
1340qSrc0lo         QN 4.S32
1341qSrc0hi         QN 5.S32
1342qSrc1lo         QN 6.S32
1343qSrc1hi         QN Src4.S32
1344qSrc2lo         QN qSrc0lo
1345qSrc2hi         QN qSrc0hi
1346qSrc3lo         QN qSrc0lo
1347qSrc3hi         QN qSrc0hi
1348qSrc4lo         QN qSrc0lo
1349qSrc4hi         QN qSrc0hi
1350qSrc5lo         QN qSrc1lo
1351qSrc5hi         QN qSrc1hi
1352qSrc6lo         QN qSrc1lo
1353qSrc6hi         QN qSrc1hi
1354qSrc7lo         QN qSrc0lo
1355qSrc7hi         QN qSrc0hi
1356
        ;// Butterfly results reuse the Q0:Q1 scale pair once that scale is consumed
1357qRes17lo        QN qScale0lo
1358qRes17hi        QN qScale0hi
1359qRes26lo        QN qScale0lo
1360qRes26hi        QN qScale0hi
1361qRes53lo        QN qScale0lo
1362qRes53hi        QN qScale0hi
1363
1364            ADD         pTemp, pScale, #4*8*7           ;// pTemp -> scale row 7 (8 x 4 bytes per row)
1365
1366            ;// Row 0 (and scaling of rows 1 & 7)
1367            VLD1        {qScale0lo, qScale0hi}, [pScale]!   ;// Scale row 0; pScale -> row 1
1368            VSHLL       qSrc0lo, dXj0lo, #(12-1)            ;// Widen row 0 to 32 bit, shifted left by 11
1369            VSHLL       qSrc0hi, dXj0hi, #(12-1)
1370            VLD1        {qScale1lo, qScale1hi}, [pScale]!   ;// Scale row 1; pScale -> row 2
1371            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo         ;// Scaled row 0
1372            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
1373            VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// Scale row 7 (reuses Q0:Q1); pTemp -> past row 7
1374            VSHLL       qSrc1lo, dXj1lo, #(12-1)
1375            VSHLL       qSrc1hi, dXj1hi, #(12-1)
1376            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
1377            VMOVN       dXi0hi, qSrc0hi
1378            VSHLL       qSrc7lo, dXj7lo, #(12-1)            ;// Row 7 reuses the row 0 registers
1379            VSHLL       qSrc7hi, dXj7hi, #(12-1)
1380            SUB         pTemp, pTemp, #((16*2)+(4*8*1))     ;// Undo writeback and step back one row: pTemp -> row 6
1381            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo         ;// Scaled row 1
1382            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
1383            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo         ;// Scaled row 7
1384            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
1385            VLD1        {qScale2lo, qScale2hi}, [pScale]!   ;// Scale row 2 (reuses Q2:Q3); pScale -> row 3
1386
1387            ;// Row 1 & 7 butterfly (and scaling of rows 2 & 6)
1388            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
1389            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
1390            VMOVN       dXi5lo, qRes17lo                ;// Output i5
1391            VMOVN       dXi5hi, qRes17hi
1392            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
1393            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
1394            VMOVN       dXi6lo, qRes17lo                ;// Output i6
1395            VMOVN       dXi6hi, qRes17hi
1396            VSHLL       qSrc2lo, dXj2lo, #(12-1)
1397            VSHLL       qSrc2hi, dXj2hi, #(12-1)
1398            VLD1        {qScale6lo, qScale6hi}, [pTemp]!    ;// Scale row 6 (reuses Q0:Q1); pTemp -> row 7
1399            VSHLL       qSrc6lo, dXj6lo, #(12-1)
1400            VSHLL       qSrc6hi, dXj6hi, #(12-1)
1401            SUB         pTemp, pTemp, #((16*2)+(4*8*1))     ;// Undo writeback and step back one row: pTemp -> row 5
1402            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo         ;// Scaled row 2
1403            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
1404            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo         ;// Scaled row 6
1405            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
1406            VLD1        {qScale3lo, qScale3hi}, [pScale]!   ;// Scale row 3 (reuses Q2:Q3); pScale -> row 4
1407
1408            ;// Row 2 & 6 butterfly (and scaling of rows 3 & 5)
1409            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
1410            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
1411            VMOVN       dXi3lo, qRes26lo                ;// Output i3
1412            VMOVN       dXi3hi, qRes26hi
1413            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
1414            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
1415            VMOVN       dXi2lo, qRes26lo                ;// Output i2
1416            VMOVN       dXi2hi, qRes26hi
1417            VSHLL       qSrc3lo, dXj3lo, #(12-1)
1418            VSHLL       qSrc3hi, dXj3hi, #(12-1)
1419            VLD1        {qScale5lo, qScale5hi}, [pTemp]!    ;// Scale row 5 (reuses Q0:Q1); pTemp -> row 6
1420            VSHLL       qSrc5lo, dXj5lo, #(12-1)
1421            VSHLL       qSrc5hi, dXj5hi, #(12-1)
1422            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo         ;// Scaled row 3
1423            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
1424            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo         ;// Scaled row 5
1425            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
1426
1427            ;// Row 3 & 5 butterfly (and reload + scaling of row 4)
1428            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
1429            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
1430            SUB         pSrc, pSrc, #16*2*2                 ;// Step pSrc back 64 bytes; NOTE(review): presumably to input row 4 - confirm
1431            VMOVN       dXi7lo, qRes53lo                ;// Output i7
1432            VMOVN       dXi7hi, qRes53hi
1433            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
1434            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
1435            VLD1        qXj4, [pSrc @64]                    ;// Reload row 4 from memory; NOTE(review): presumably its register was used as scratch above - confirm
1436            VMOVN       dXi4lo, qRes53lo                ;// Output i4
1437            VMOVN       dXi4hi, qRes53hi
1438            VSHLL       qSrc4lo, dXj4lo, #(12-1)
1439            VSHLL       qSrc4hi, dXj4hi, #(12-1)
1440            VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// Scale row 4 (reuses Q2:Q3); no writeback
1441            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
1442            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo         ;// Scaled row 4
1443            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
1444            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
1445            ;// Row 4
1446            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
1447            VMOVN       dXi1hi, qSrc4hi
1448
1449        MEND
1450
1451        END
1452