1;//
2;// This confidential and proprietary software may be used only as
3;// authorised by a licensing agreement from ARM Limited
4;//   (C) COPYRIGHT 2004 ARM Limited
5;//       ALL RIGHTS RESERVED
6;// The entire notice above must be reproduced on all authorised
7;// copies and copies may only be made to the extent permitted
8;// by a licensing agreement from ARM Limited.
9;//
10;// IDCT_s.s
11;//
12;// Inverse DCT module
13;//
14;//
15;// ALGORITHM DESCRIPTION
16;//
17;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
18;// column and then a 1D IDCT for each row.
19;//
20;// The 8-point 1D IDCT is defined by
21;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
22;//
23;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
24;//   c(u,x) = cos( (2x+1)*u*pi/16 )
25;//
26;// We compute the 8-point 1D IDCT using the reverse of
27;// the Arai-Agui-Nakajima flow graph which we split into
28;// 5 stages named in reverse order to identify with the
29;// forward DCT. Direct inversion of the forward formulae
30;// in file FDCT_s.s gives:
31;//
32;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
33;//             [ A(0) = 2*sqrt(2)
34;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
35;//
36;// IStage 4:   i0 = j0             i1 = j4
37;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
38;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
39;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
40;//
41;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
42;//             h2 = (i2*sqrt2)-i3  h3 = i3
43;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
44;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
45;//             [ The above two lines rotate by -(pi/8) ]
46;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
47;//
48;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
49;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
50;//             g7 = h7             g6 = h6 - h7
51;//             g5 = h5 - g6        g4 = h4 - g5
52;//
53;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
54;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
55;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
56;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
57;//
58;// Note that most coefficients are halved 3 times during the
59;// above calculation. We can rescale the algorithm dividing
60;// the input by 8 to remove the halvings.
61;//
62;// IStage 5:   j(u) = T(u)*A(u)/8
63;//
64;// IStage 4:   i0 = j0             i1 = j4
65;//             i3 = j2 + j6        i2 = j2 - j6
66;//             i7 = j5 + j3        i4 = j5 - j3
67;//             i5 = j1 + j7        i6 = j1 - j7
68;//
69;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
70;//             h2 = (i2*sqrt2)-i3  h3 = i3
71;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
72;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
73;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
74;//
75;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
76;//             g1 = h1 + h2        g2 = h1 - h2
77;//             g7 = h7             g6 = h6 - h7
78;//             g5 = h5 - g6        g4 = h4 - g5
79;//
80;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
81;//             f1 = g1 + g6        f6 = g1 - g6
82;//             f2 = g2 + g5        f5 = g2 - g5
83;//             f3 = g3 + g4        f4 = g3 - g4
84;//
85;// Note:
86;// 1. The scaling by A(u)/8 can often be combined with inverse
87;//    quantization. The column and row scalings can be combined.
88;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
89;//    to the above code but is otherwise identical.
90;// 3. The rotation by -pi/8 can be performed using three multiplies
91;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
92;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
93;// 4. If |T(u)|<=1 then from the IDCT definition,
94;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
95;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
96;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
97;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
98;//            = (approx)2.64
99;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
100;//    The table below shows input patterns generating the maximum
101;//    value of |f(x)| for input in the range |T(x)|<=1. M=-1, P=+1
102;//    InputPattern      Max |f(x)|
103;//      PPPPPPPP        |f0| =  2.64
104;//      PPPMMMMM        |f1| =  2.64
105;//      PPMMMPPP        |f2| =  2.64
106;//      PPMMPPMM        |f3| =  2.64
107;//      PMMPPMMP        |f4| =  2.64
108;//      PMMPMMPM        |f5| =  2.64
109;//      PMPPMPMP        |f6| =  2.64
110;//      PMPMPMPM        |f7| =  2.64
111;//   Note that this input pattern is the transpose of the
112;//   corresponding max input pattern for the FDCT.
113
114;// Arguments
115
;// Argument register aliases.
;// The four macro inputs arrive in r0-r3 (standard ARM procedure-call
;// argument registers); RN gives each a symbolic name for readability.
;// NOTE(review): Stride is used only to advance pDest between output
;// rows (see the "$stride"="s" paths), i.e. it is a destination stride.
116pSrc    RN 0    ;// source data buffer
117Stride  RN 1    ;// destination stride in bytes
118pDest   RN 2    ;// destination data buffer
119pScale  RN 3    ;// pointer to scaling table
120
121
122        ;// DCT Inverse Macro
123        ;// The DCT code should be parametrized according
124        ;// to the following inputs:
125        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
126        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
127        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
128        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
129        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
130        ;//
131        ;// Inputs:
132        ;// pSrc   = r0 = Pointer to input data
133        ;//               Range is -256 to +255 (9-bit)
134;// Stride = r1 = Stride between destination lines in bytes (used when $stride="s")
135        ;// pDest  = r2 = Pointer to output data
136        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
137
138
139
140        MACRO
141        M_IDCT  $outsize, $inscale, $stride
142        LCLA    SHIFT
143
144
145        IF ARM1136JS
146
147;// REGISTER ALLOCATION
148;// This is hard since we have 8 values, 9 free registers and each
149;// butterfly requires a temporary register. We also want to
150;// maintain register order so we can use LDM/STM. The table below
151;// summarises the register allocation that meets all these criteria.
152;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
153;//
154;// r1  a01     g0  h0
155;// r4  b01 f0  g1  h1  i0
156;// r5  a23 f1  g2      i1
157;// r6  b23 f2  g3  h2  i2
158;// r7  a45 f3      h3  i3
159;// r8  b45 f4  g4  h4  i4
160;// r9  a67 f5  g5  h5  i5
161;// r10 b67 f6  g6  h6  i6
162;// r11     f7  g7  h7  i7
163;//
164ra01    RN 1
165rb01    RN 4
166ra23    RN 5
167rb23    RN 6
168ra45    RN 7
169rb45    RN 8
170ra67    RN 9
171rb67    RN 10
172rtmp    RN 11
173csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
174LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
175;// Transpose allocation
176xft     RN ra01
177xf0     RN rb01
178xf1     RN ra23
179xf2     RN rb23
180xf3     RN ra45
181xf4     RN rb45
182xf5     RN ra67
183xf6     RN rb67
184xf7     RN rtmp
185;// IStage 1 allocation
186xg0     RN xft
187xg1     RN xf0
188xg2     RN xf1
189xg3     RN xf2
190xgt     RN xf3
191xg4     RN xf4
192xg5     RN xf5
193xg6     RN xf6
194xg7     RN xf7
195;// IStage 2 allocation
196xh0     RN xg0
197xh1     RN xg1
198xht     RN xg2
199xh2     RN xg3
200xh3     RN xgt
201xh4     RN xg4
202xh5     RN xg5
203xh6     RN xg6
204xh7     RN xg7
205;// IStage 3,4 allocation
206xit     RN xh0
207xi0     RN xh1
208xi1     RN xht
209xi2     RN xh2
210xi3     RN xh3
211xi4     RN xh4
212xi5     RN xh5
213xi6     RN xh6
214xi7     RN xh7
215
216        M_STR   pDest,  ppDest
217        IF "$stride"="s"
218            M_STR   Stride, pStride
219        ENDIF
220        M_ADR   pDest,  pBlk
221        LDR     csPiBy8, =0x30fc7642
222        LDR     LoopRR2, =0x00005a82
223
224v6_idct_col$_F
225        ;// Load even values
226        LDR     xi4, [pSrc], #4  ;// j0
227        LDR     xi5, [pSrc, #4*16-4]  ;// j4
228        LDR     xi6, [pSrc, #2*16-4]  ;// j2
229        LDR     xi7, [pSrc, #6*16-4]  ;// j6
230
231        ;// Scale Even Values
232        IF "$inscale"="s16" ;// 16x16 mul
233SHIFT       SETA    12
234            LDR     xi0, [pScale], #4
235            LDR     xi1, [pScale, #4*16-4]
236            LDR     xi2, [pScale, #2*16-4]
237            MOV     xit, #1<<(SHIFT-1)
238            SMLABB  xi3, xi0, xi4, xit
239            SMLATT  xi4, xi0, xi4, xit
240            SMLABB  xi0, xi1, xi5, xit
241            SMLATT  xi5, xi1, xi5, xit
242            MOV     xi3, xi3, ASR #SHIFT
243            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
244            LDR     xi3, [pScale, #6*16-4]
245            SMLABB  xi1, xi2, xi6, xit
246            SMLATT  xi6, xi2, xi6, xit
247            MOV     xi0, xi0, ASR #SHIFT
248            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
249            SMLABB  xi2, xi3, xi7, xit
250            SMLATT  xi7, xi3, xi7, xit
251            MOV     xi1, xi1, ASR #SHIFT
252            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
253            MOV     xi2, xi2, ASR #SHIFT
254            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
255        ENDIF
256        IF "$inscale"="s32" ;// 32x16 mul
257SHIFT       SETA    (12+8-16)
258            MOV     xit, #1<<(SHIFT-1)
259            LDR     xi0, [pScale], #8
260            LDR     xi1, [pScale, #0*32+4-8]
261            LDR     xi2, [pScale, #4*32-8]
262            LDR     xi3, [pScale, #4*32+4-8]
263            SMLAWB  xi0, xi0, xi4, xit
264            SMLAWT  xi1, xi1, xi4, xit
265            SMLAWB  xi2, xi2, xi5, xit
266            SMLAWT  xi3, xi3, xi5, xit
267            MOV     xi0, xi0, ASR #SHIFT
268            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
269            MOV     xi2, xi2, ASR #SHIFT
270            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
271            LDR     xi0, [pScale, #2*32-8]
272            LDR     xi1, [pScale, #2*32+4-8]
273            LDR     xi2, [pScale, #6*32-8]
274            LDR     xi3, [pScale, #6*32+4-8]
275            SMLAWB  xi0, xi0, xi6, xit
276            SMLAWT  xi1, xi1, xi6, xit
277            SMLAWB  xi2, xi2, xi7, xit
278            SMLAWT  xi3, xi3, xi7, xit
279            MOV     xi0, xi0, ASR #SHIFT
280            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
281            MOV     xi2, xi2, ASR #SHIFT
282            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
283        ENDIF
284
285        ;// Load odd values
286        LDR     xi0, [pSrc, #1*16-4]      ;// j1
287        LDR     xi1, [pSrc, #7*16-4]      ;// j7
288        LDR     xi2, [pSrc, #5*16-4]      ;// j5
289        LDR     xi3, [pSrc, #3*16-4]      ;// j3
290
291        IF  {TRUE}
292            ;// shortcut if odd values 0
293            TEQ     xi0, #0
294            TEQEQ   xi1, #0
295            TEQEQ   xi2, #0
296            TEQEQ   xi3, #0
297            BEQ     v6OddZero$_F
298        ENDIF
299
300        ;// Store scaled even values
301        STMIA   pDest, {xi4, xi5, xi6, xi7}
302
303        ;// Scale odd values
304        IF "$inscale"="s16"
305            ;// Perform AAN Scale
306            LDR     xi4, [pScale, #1*16-4]
307            LDR     xi5, [pScale, #7*16-4]
308            LDR     xi6, [pScale, #5*16-4]
309            SMLABB  xi7, xi0, xi4, xit
310            SMLATT  xi0, xi0, xi4, xit
311            SMLABB  xi4, xi1, xi5, xit
312            SMLATT  xi1, xi1, xi5, xit
313            MOV     xi7, xi7, ASR #SHIFT
314            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
315            LDR     xi7, [pScale, #3*16-4]
316            SMLABB  xi5, xi2, xi6, xit
317            SMLATT  xi2, xi2, xi6, xit
318            MOV     xi4, xi4, ASR #SHIFT
319            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
320            SMLABB  xi6, xi3, xi7, xit
321            SMLATT  xi3, xi3, xi7, xit
322            MOV     xi5, xi5, ASR #SHIFT
323            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
324            MOV     xi6, xi6, ASR #SHIFT
325            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
326        ENDIF
327        IF "$inscale"="s32" ;// 32x16 mul
328            LDR     xi4, [pScale, #1*32-8]
329            LDR     xi5, [pScale, #1*32+4-8]
330            LDR     xi6, [pScale, #7*32-8]
331            LDR     xi7, [pScale, #7*32+4-8]
332            SMLAWB  xi4, xi4, xi0, xit
333            SMLAWT  xi5, xi5, xi0, xit
334            SMLAWB  xi6, xi6, xi1, xit
335            SMLAWT  xi7, xi7, xi1, xit
336            MOV     xi4, xi4, ASR #SHIFT
337            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
338            MOV     xi6, xi6, ASR #SHIFT
339            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
340            LDR     xi4, [pScale, #5*32-8]
341            LDR     xi5, [pScale, #5*32+4-8]
342            LDR     xi6, [pScale, #3*32-8]
343            LDR     xi7, [pScale, #3*32+4-8]
344            SMLAWB  xi4, xi4, xi2, xit
345            SMLAWT  xi5, xi5, xi2, xit
346            SMLAWB  xi6, xi6, xi3, xit
347            SMLAWT  xi7, xi7, xi3, xit
348            MOV     xi4, xi4, ASR #SHIFT
349            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
350            MOV     xi6, xi6, ASR #SHIFT
351            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
352        ENDIF
353
354        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
355        SSUB16  xi6, xi0, xi1           ;// j1-j7
356        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
357        SSUB16  xi4, xi2, xi3           ;// j5-j3
358
359        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
360
361        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
362        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
363
364        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
365        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
366        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
367        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
368
369        SMULBB  xi1, xi3, LoopRR2
370        SMULTB  xi3, xi3, LoopRR2
371
372        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
373        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
374        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
375
376        ;// xi0,xi1,xi2,xi3 now free
377        ;// IStage 4,3, rows 2to3 x1/2
378
379        MOV     xi3, xi3, LSL #1
380        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
381        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
382
383        ;// IStage 2, rows4to7
384        SSUB16  xg6, xh6, xh7
385        SSUB16  xg5, xh5, xg6
386        SSUB16  xg4, xh4, xg5
387
388        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
389        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
390
391        SMULBB  xi0, xi2, LoopRR2
392        SMULTB  xi2, xi2, LoopRR2
393
394        MOV     xi2, xi2, LSL #1
395        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
396
397        ;// xi0, xi1 now free
398        ;// IStage 4,3 rows 0to1 x 1/2
399        LDRD    xi0, [pDest]            ;// j0, j4 scaled
400        SSUB16  xh2, xh2, xi3
401        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
402
403        SHADD16 xh0, xi0, xi1
404        SHSUB16 xh1, xi0, xi1
405
406        ;// IStage 2 rows 0to3 x 1/2
407        SHSUB16 xg2, xh1, xh2
408        SHADD16 xg1, xh1, xh2
409        SHSUB16 xg3, xh0, xh3
410        SHADD16 xg0, xh0, xh3
411
412        ;// IStage 1 all rows
413        SADD16  xf3, xg3, xg4
414        SSUB16  xf4, xg3, xg4
415        SADD16  xf2, xg2, xg5
416        SSUB16  xf5, xg2, xg5
417        SADD16  xf1, xg1, xg6
418        SSUB16  xf6, xg1, xg6
419        SADD16  xf0, xg0, xg7
420        SSUB16  xf7, xg0, xg7
421
422        ;// Transpose, store and loop
423        PKHBT   ra01, xf0, xf1, LSL #16
424        PKHTB   rb01, xf1, xf0, ASR #16
425
426        PKHBT   ra23, xf2, xf3, LSL #16
427        PKHTB   rb23, xf3, xf2, ASR #16
428
429        PKHBT   ra45, xf4, xf5, LSL #16
430        PKHTB   rb45, xf5, xf4, ASR #16
431
432        PKHBT   ra67, xf6, xf7, LSL #16
433        STMIA   pDest!, {ra01, ra23, ra45, ra67}
434        PKHTB   rb67, xf7, xf6, ASR #16
435        STMIA   pDest!, {rb01, rb23, rb45, rb67}
436        BCC     v6_idct_col$_F
437
438        SUB     pSrc, pDest, #(64*2)
439        M_LDR   pDest, ppDest
440        IF "$stride"="s"
441            M_LDR   pScale, pStride
442        ENDIF
443        B       v6_idct_row$_F
444
445v6OddZero$_F
446        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
447        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
448
449        SMULBB  xi0, xi2, LoopRR2
450        SMULTB  xi2, xi2, LoopRR2
451
452        MOV     xi2, xi2, LSL #1
453        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
454        SSUB16  xh2, xh2, xi3
455
456        ;// xi0, xi1 now free
457        ;// IStage 4,3 rows 0to1 x 1/2
458
459        SHADD16 xh0, xi4, xi5
460        SHSUB16 xh1, xi4, xi5
461
462        ;// IStage 2 rows 0to3 x 1/2
463        SHSUB16 xg2, xh1, xh2
464        SHADD16 xg1, xh1, xh2
465        SHSUB16 xg3, xh0, xh3
466        SHADD16 xg0, xh0, xh3
467
468        ;// IStage 1 all rows
469        MOV  xf3, xg3
470        MOV  xf4, xg3
471        MOV  xf2, xg2
472        MOV  xf5, xg2
473        MOV  xf1, xg1
474        MOV  xf6, xg1
475        MOV  xf0, xg0
476        MOV  xf7, xg0
477
478        ;// Transpose
479        PKHBT   ra01, xf0, xf1, LSL #16
480        PKHTB   rb01, xf1, xf0, ASR #16
481
482        PKHBT   ra23, xf2, xf3, LSL #16
483        PKHTB   rb23, xf3, xf2, ASR #16
484
485        PKHBT   ra45, xf4, xf5, LSL #16
486        PKHTB   rb45, xf5, xf4, ASR #16
487
488        PKHBT   ra67, xf6, xf7, LSL #16
489        PKHTB   rb67, xf7, xf6, ASR #16
490
491        STMIA   pDest!, {ra01, ra23, ra45, ra67}
492        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
493        STMIA   pDest!, {rb01, rb23, rb45, rb67}
494
495        BCC     v6_idct_col$_F
496        SUB     pSrc, pDest, #(64*2)
497        M_LDR   pDest, ppDest
498        IF "$stride"="s"
499            M_LDR   pScale, pStride
500        ENDIF
501
502
503v6_idct_row$_F
504        ;// IStage 4,3, rows4to7 x1/4
505        LDR     xit, =0x00010001        ;// rounding constant
506        LDR     xi0, [pSrc, #1*16]      ;// j1
507        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
508        LDR     xi2, [pSrc, #5*16]      ;// j5
509        LDR     xi3, [pSrc, #3*16]      ;// j3
510
511        SHADD16 xi1, xi1, xit           ;// 2*j7
512        SHADD16 xi1, xi1, xit           ;// j7
513
514        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
515        SSUB16  xi6, xi0, xi1           ;// j1-j7
516        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
517        SSUB16  xi4, xi2, xi3           ;// j5-j3
518
519        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
520
521        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
522        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
523
524        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
525        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
526        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
527        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
528
529        SMULBB  xi1, xi3, LoopRR2
530        SMULTB  xi3, xi3, LoopRR2
531
532        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
533        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
534        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
535
536        MOV     xi3, xi3, LSL #1
537        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
538
539        ;// xi0,xi1,xi2,xi3 now free
540        ;// IStage 4,3, rows 2to3 x1/2
541
542        LDR     xi0, [pSrc, #2*16]      ;// j2
543        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
544
545        ;// IStage 2, rows4to7
546        SSUB16  xg6, xh6, xh7
547        SSUB16  xg5, xh5, xg6
548        SSUB16  xg4, xh4, xg5
549
550        SHADD16 xi1, xi1, xit           ;// j6
551        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
552        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
553
554        SMULBB  xi0, xi2, LoopRR2
555        SMULTB  xi2, xi2, LoopRR2
556
557        MOV     xi2, xi2, LSL #1
558
559        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
560
561        ;// xi0, xi1 now free
562        ;// IStage 4,3 rows 0to1 x 1/2
563        LDR     xi1, [pSrc, #4*16]      ;// j4
564        LDR     xi0, [pSrc], #4         ;// j0
565
566        SSUB16  xh2, xh2, xi3
567        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
568
569        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
570        SHADD16 xh0, xi0, xi1           ;// of DC result
571        SHSUB16 xh1, xi0, xi1
572
573        ;// IStage 2 rows 0to3 x 1/2
574        SHSUB16 xg2, xh1, xh2
575        SHADD16 xg1, xh1, xh2
576        SHSUB16 xg3, xh0, xh3
577        SHADD16 xg0, xh0, xh3
578
579        ;// IStage 1 all rows
580        SHADD16 xf3, xg3, xg4
581        SHSUB16 xf4, xg3, xg4
582        SHADD16 xf2, xg2, xg5
583        SHSUB16 xf5, xg2, xg5
584        SHADD16 xf1, xg1, xg6
585        SHSUB16 xf6, xg1, xg6
586        SHADD16 xf0, xg0, xg7
587        SHSUB16 xf7, xg0, xg7
588
589        ;// Saturate
590        IF ("$outsize"="u8")
591            USAT16  xf0, #8, xf0
592            USAT16  xf1, #8, xf1
593            USAT16  xf2, #8, xf2
594            USAT16  xf3, #8, xf3
595            USAT16  xf4, #8, xf4
596            USAT16  xf5, #8, xf5
597            USAT16  xf6, #8, xf6
598            USAT16  xf7, #8, xf7
599        ENDIF
600        IF ("$outsize"="s9")
601            SSAT16  xf0, #9, xf0
602            SSAT16  xf1, #9, xf1
603            SSAT16  xf2, #9, xf2
604            SSAT16  xf3, #9, xf3
605            SSAT16  xf4, #9, xf4
606            SSAT16  xf5, #9, xf5
607            SSAT16  xf6, #9, xf6
608            SSAT16  xf7, #9, xf7
609        ENDIF
610
611        ;// Transpose to Row, Pack and store
612        IF ("$outsize"="u8")
613            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
614            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
615            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
616            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
617            PKHBT   ra01, xf0, xf2, LSL #16
618            PKHTB   rb01, xf2, xf0, ASR #16
619            PKHBT   ra23, xf4, xf6, LSL #16
620            PKHTB   rb23, xf6, xf4, ASR #16
621            STMIA   pDest, {ra01, ra23}
622            IF "$stride"="s"
623                ADD     pDest, pDest, pScale
624                STMIA   pDest, {rb01, rb23}
625                ADD     pDest, pDest, pScale
626            ELSE
627                ADD     pDest, pDest, #($stride)
628                STMIA   pDest, {rb01, rb23}
629                ADD     pDest, pDest, #($stride)
630            ENDIF
631        ENDIF
632        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
633            PKHBT   ra01, xf0, xf1, LSL #16
634            PKHTB   rb01, xf1, xf0, ASR #16
635
636            PKHBT   ra23, xf2, xf3, LSL #16
637            PKHTB   rb23, xf3, xf2, ASR #16
638
639            PKHBT   ra45, xf4, xf5, LSL #16
640            PKHTB   rb45, xf5, xf4, ASR #16
641
642            PKHBT   ra67, xf6, xf7, LSL #16
643            PKHTB   rb67, xf7, xf6, ASR #16
644
645            STMIA   pDest, {ra01, ra23, ra45, ra67}
646            IF "$stride"="s"
647                ADD     pDest, pDest, pScale
648                STMIA   pDest, {rb01, rb23, rb45, rb67}
649                ADD     pDest, pDest, pScale
650            ELSE
651                ADD     pDest, pDest, #($stride)
652                STMIA   pDest, {rb01, rb23, rb45, rb67}
653                ADD     pDest, pDest, #($stride)
654            ENDIF
655        ENDIF
656
657        BCC     v6_idct_row$_F
658        ENDIF ;// ARM1136JS
659
660
661        IF CortexA8
662
663Src0            EQU  7
664Src1            EQU  8
665Src2            EQU  9
666Src3            EQU  10
667Src4            EQU  11
668Src5            EQU  12
669Src6            EQU  13
670Src7            EQU  14
671Tmp             EQU  15
672
673qXj0            QN Src0.S16
674qXj1            QN Src1.S16
675qXj2            QN Src2.S16
676qXj3            QN Src3.S16
677qXj4            QN Src4.S16
678qXj5            QN Src5.S16
679qXj6            QN Src6.S16
680qXj7            QN Src7.S16
681qXjt            QN Tmp.S16
682
683dXj0lo          DN (Src0*2).S16
684dXj0hi          DN (Src0*2+1).S16
685dXj1lo          DN (Src1*2).S16
686dXj1hi          DN (Src1*2+1).S16
687dXj2lo          DN (Src2*2).S16
688dXj2hi          DN (Src2*2+1).S16
689dXj3lo          DN (Src3*2).S16
690dXj3hi          DN (Src3*2+1).S16
691dXj4lo          DN (Src4*2).S16
692dXj4hi          DN (Src4*2+1).S16
693dXj5lo          DN (Src5*2).S16
694dXj5hi          DN (Src5*2+1).S16
695dXj6lo          DN (Src6*2).S16
696dXj6hi          DN (Src6*2+1).S16
697dXj7lo          DN (Src7*2).S16
698dXj7hi          DN (Src7*2+1).S16
699dXjtlo          DN (Tmp*2).S16
700dXjthi          DN (Tmp*2+1).S16
701
702qXi0            QN qXj0
703qXi1            QN qXj4
704qXi2            QN qXj2
705qXi3            QN qXj7
706qXi4            QN qXj5
707qXi5            QN qXjt
708qXi6            QN qXj1
709qXi7            QN qXj6
710qXit            QN qXj3
711
712dXi0lo          DN dXj0lo
713dXi0hi          DN dXj0hi
714dXi1lo          DN dXj4lo
715dXi1hi          DN dXj4hi
716dXi2lo          DN dXj2lo
717dXi2hi          DN dXj2hi
718dXi3lo          DN dXj7lo
719dXi3hi          DN dXj7hi
720dXi4lo          DN dXj5lo
721dXi4hi          DN dXj5hi
722dXi5lo          DN dXjtlo
723dXi5hi          DN dXjthi
724dXi6lo          DN dXj1lo
725dXi6hi          DN dXj1hi
726dXi7lo          DN dXj6lo
727dXi7hi          DN dXj6hi
728dXitlo          DN dXj3lo
729dXithi          DN dXj3hi
730
731qXh0            QN qXit
732qXh1            QN qXi0
733qXh2            QN qXi2
734qXh3            QN qXi3
735qXh4            QN qXi7
736qXh5            QN qXi5
737qXh6            QN qXi4
738qXh7            QN qXi1
739qXht            QN qXi6
740
741dXh0lo          DN dXitlo
742dXh0hi          DN dXithi
743dXh1lo          DN dXi0lo
744dXh1hi          DN dXi0hi
745dXh2lo          DN dXi2lo
746dXh2hi          DN dXi2hi
747dXh3lo          DN dXi3lo
748dXh3hi          DN dXi3hi
749dXh4lo          DN dXi7lo
750dXh4hi          DN dXi7hi
751dXh5lo          DN dXi5lo
752dXh5hi          DN dXi5hi
753dXh6lo          DN dXi4lo
754dXh6hi          DN dXi4hi
755dXh7lo          DN dXi1lo
756dXh7hi          DN dXi1hi
757dXhtlo          DN dXi6lo
758dXhthi          DN dXi6hi
759
760qXg0            QN qXh2
761qXg1            QN qXht
762qXg2            QN qXh1
763qXg3            QN qXh0
764qXg4            QN qXh4
765qXg5            QN qXh5
766qXg6            QN qXh6
767qXg7            QN qXh7
768qXgt            QN qXh3
769
770qXf0            QN qXg6
771qXf1            QN qXg5
772qXf2            QN qXg4
773qXf3            QN qXgt
774qXf4            QN qXg3
775qXf5            QN qXg2
776qXf6            QN qXg1
777qXf7            QN qXg0
778qXft            QN qXg7
779
780
781qXt0            QN 1.S32
782qXt1            QN 2.S32
783qT0lo           QN 1.S32
784qT0hi           QN 2.S32
785qT1lo           QN 3.S32
786qT1hi           QN 4.S32
787qScalelo        QN 5.S32        ;// used to read post scale values
788qScalehi        QN 6.S32
789qTemp0          QN 5.S32
790qTemp1          QN 6.S32
791
792
793Scale1          EQU 6
794Scale2          EQU 15
795qScale1         QN Scale1.S16
796qScale2         QN Scale2.S16
797dScale1lo       DN (Scale1*2).S16
798dScale1hi       DN (Scale1*2+1).S16
799dScale2lo       DN (Scale2*2).S16
800dScale2hi       DN (Scale2*2+1).S16
801
802dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
803InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
804S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
805C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
806
807pTemp           RN 12
808
809
810        IMPORT  armCOMM_IDCTCoef
811
812        VLD1        {qXj0,qXj1}, [pSrc @64]!
813        VLD1        {qXj2,qXj3}, [pSrc @64]!
814        VLD1        {qXj4,qXj5}, [pSrc @64]!
815        VLD1        {qXj6,qXj7}, [pSrc @64]!
816
817        ;// Load PreScale and multiply with Src
818        ;// IStage 4
819
820        IF "$inscale"="s16"                         ;// 16X16 Mul
821            M_IDCT_PRESCALE16
822        ENDIF
823
824        IF "$inscale"="s32"                         ;// 32X32 Mul
825            M_IDCT_PRESCALE32
826        ENDIF
827
828        ;// IStage 3
829        VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
830        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
831        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
832        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
833        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
834        VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
835        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
836
837        VMULL       qXt0, dXi4lo, C                 ;// c*i4
838        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
839        VMULL       qXt1, dXi4hi, C
840        VMLAL       qXt1, dXi6hi, S
841        VSHRN       dXh4lo, qXt0, #16               ;// h4
842        VSHRN       dXh4hi, qXt1, #16
843
844        VMULL       qXt0, dXi6lo, C                 ;// c*i6
845        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*h6
846        VMULL       qXt1, dXi6hi, C
847        VMLSL       qXt1, dXi4hi, S
848        VSHRN       dXh6lo, qXt0, #16               ;// h6
849        VSHRN       dXh6hi, qXt1, #16
850
851        ;// IStage 2
852        VSUB        qXg6, qXh6, qXh7
853        VSUB        qXg5, qXh5, qXg6
854        VSUB        qXg4, qXh4, qXg5
855        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
856        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
857        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
858        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
859
860        ;// IStage 1 all rows
861        VADD        qXf3, qXg3, qXg4
862        VSUB        qXf4, qXg3, qXg4
863        VADD        qXf2, qXg2, qXg5
864        VSUB        qXf5, qXg2, qXg5
865        VADD        qXf1, qXg1, qXg6
866        VSUB        qXf6, qXg1, qXg6
867        VADD        qXf0, qXg0, qXg7
868        VSUB        qXf7, qXg0, qXg7
869
870        ;// Transpose, store and loop
871XTR0            EQU Src5
872XTR1            EQU Tmp
873XTR2            EQU Src6
874XTR3            EQU Src7
875XTR4            EQU Src3
876XTR5            EQU Src0
877XTR6            EQU Src1
878XTR7            EQU Src2
879XTRt            EQU Src4
880
881qA0             QN  XTR0.S32  ;// for XTRpose
882qA1             QN  XTR1.S32
883qA2             QN  XTR2.S32
884qA3             QN  XTR3.S32
885qA4             QN  XTR4.S32
886qA5             QN  XTR5.S32
887qA6             QN  XTR6.S32
888qA7             QN  XTR7.S32
889
890dB0             DN  XTR0*2+1      ;// for using VSWP
891dB1             DN  XTR1*2+1
892dB2             DN  XTR2*2+1
893dB3             DN  XTR3*2+1
894dB4             DN  XTR4*2
895dB5             DN  XTR5*2
896dB6             DN  XTR6*2
897dB7             DN  XTR7*2
898
899
900        VTRN        qXf0, qXf1
901        VTRN        qXf2, qXf3
902        VTRN        qXf4, qXf5
903        VTRN        qXf6, qXf7
904        VTRN        qA0, qA2
905        VTRN        qA1, qA3
906        VTRN        qA4, qA6
907        VTRN        qA5, qA7
908        VSWP        dB0, dB4
909        VSWP        dB1, dB5
910        VSWP        dB2, dB6
911        VSWP        dB3, dB7
912
913
914qYj0            QN qXf0
915qYj1            QN qXf1
916qYj2            QN qXf2
917qYj3            QN qXf3
918qYj4            QN qXf4
919qYj5            QN qXf5
920qYj6            QN qXf6
921qYj7            QN qXf7
922qYjt            QN qXft
923
924dYj0lo          DN (XTR0*2).S16
925dYj0hi          DN (XTR0*2+1).S16
926dYj1lo          DN (XTR1*2).S16
927dYj1hi          DN (XTR1*2+1).S16
928dYj2lo          DN (XTR2*2).S16
929dYj2hi          DN (XTR2*2+1).S16
930dYj3lo          DN (XTR3*2).S16
931dYj3hi          DN (XTR3*2+1).S16
932dYj4lo          DN (XTR4*2).S16
933dYj4hi          DN (XTR4*2+1).S16
934dYj5lo          DN (XTR5*2).S16
935dYj5hi          DN (XTR5*2+1).S16
936dYj6lo          DN (XTR6*2).S16
937dYj6hi          DN (XTR6*2+1).S16
938dYj7lo          DN (XTR7*2).S16
939dYj7hi          DN (XTR7*2+1).S16
940dYjtlo          DN (XTRt*2).S16
941dYjthi          DN (XTRt*2+1).S16
942
943qYi0            QN qYj0
944qYi1            QN qYj4
945qYi2            QN qYj2
946qYi3            QN qYj7
947qYi4            QN qYj5
948qYi5            QN qYjt
949qYi6            QN qYj1
950qYi7            QN qYj6
951qYit            QN qYj3
952
953dYi0lo          DN dYj0lo
954dYi0hi          DN dYj0hi
955dYi1lo          DN dYj4lo
956dYi1hi          DN dYj4hi
957dYi2lo          DN dYj2lo
958dYi2hi          DN dYj2hi
959dYi3lo          DN dYj7lo
960dYi3hi          DN dYj7hi
961dYi4lo          DN dYj5lo
962dYi4hi          DN dYj5hi
963dYi5lo          DN dYjtlo
964dYi5hi          DN dYjthi
965dYi6lo          DN dYj1lo
966dYi6hi          DN dYj1hi
967dYi7lo          DN dYj6lo
968dYi7hi          DN dYj6hi
969dYitlo          DN dYj3lo
970dYithi          DN dYj3hi
971
972qYh0            QN qYit
973qYh1            QN qYi0
974qYh2            QN qYi2
975qYh3            QN qYi3
976qYh4            QN qYi7
977qYh5            QN qYi5
978qYh6            QN qYi4
979qYh7            QN qYi1
980qYht            QN qYi6
981
982dYh0lo          DN dYitlo
983dYh0hi          DN dYithi
984dYh1lo          DN dYi0lo
985dYh1hi          DN dYi0hi
986dYh2lo          DN dYi2lo
987dYh2hi          DN dYi2hi
988dYh3lo          DN dYi3lo
989dYh3hi          DN dYi3hi
990dYh4lo          DN dYi7lo
991dYh4hi          DN dYi7hi
992dYh5lo          DN dYi5lo
993dYh5hi          DN dYi5hi
994dYh6lo          DN dYi4lo
995dYh6hi          DN dYi4hi
996dYh7lo          DN dYi1lo
997dYh7hi          DN dYi1hi
998dYhtlo          DN dYi6lo
999dYhthi          DN dYi6hi
1000
1001qYg0            QN qYh2
1002qYg1            QN qYht
1003qYg2            QN qYh1
1004qYg3            QN qYh0
1005qYg4            QN qYh4
1006qYg5            QN qYh5
1007qYg6            QN qYh6
1008qYg7            QN qYh7
1009qYgt            QN qYh3
1010
1011qYf0            QN qYg6
1012qYf1            QN qYg5
1013qYf2            QN qYg4
1014qYf3            QN qYgt
1015qYf4            QN qYg3
1016qYf5            QN qYg2
1017qYf6            QN qYg1
1018qYf7            QN qYg0
1019qYft            QN qYg7
1020
1021        VRSHR       qYj7, qYj7, #2
1022        VRSHR       qYj6, qYj6, #1
1023
1024        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
1025        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
1026        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
1027        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
1028        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
1029        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
1030
1031        VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
1032        ;// IStage 4,3 rows 0to1 x 1/2
1033
1034        MOV         pTemp, #0x4             ;// ensure correct round
1035        VDUP        qScale1, pTemp           ;// of DC result
1036        VADD        qYi0, qYi0, qScale1
1037
1038        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
1039        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
1040
1041        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
1042        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
1043        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
1044        VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
1045
1046        VMULL       qXt0, dYi4lo, C         ;// c*i4
1047        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
1048        VMULL       qXt1, dYi4hi, C
1049        VMLAL       qXt1, dYi6hi, S
1050        VSHRN       dYh4lo, qXt0, #16       ;// h4
1051        VSHRN       dYh4hi, qXt1, #16
1052
1053        VMULL       qXt0, dYi6lo, C         ;// c*i6
        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*i6
1055        VMULL       qXt1, dYi6hi, C
1056        VMLSL       qXt1, dYi4hi, S
1057        VSHRN       dYh6lo, qXt0, #16       ;// h6
1058        VSHRN       dYh6hi, qXt1, #16
1059
1060        VSUB        qYg6, qYh6, qYh7
1061        VSUB        qYg5, qYh5, qYg6
1062        VSUB        qYg4, qYh4, qYg5
1063
1064        ;// IStage 2 rows 0to3 x 1/2
1065        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
1066        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
1067        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
1068        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
1069
1070
1071        ;// IStage 1 all rows
1072        VHADD        qYf3, qYg3, qYg4
1073        VHSUB        qYf4, qYg3, qYg4
1074        VHADD        qYf2, qYg2, qYg5
1075        VHSUB        qYf5, qYg2, qYg5
1076        VHADD        qYf1, qYg1, qYg6
1077        VHSUB        qYf6, qYg1, qYg6
1078        VHADD        qYf0, qYg0, qYg7
1079        VHSUB        qYf7, qYg0, qYg7
1080
1081YTR0            EQU Src0
1082YTR1            EQU Src4
1083YTR2            EQU Src1
1084YTR3            EQU Src2
1085YTR4            EQU Src7
1086YTR5            EQU Src5
1087YTR6            EQU Tmp
1088YTR7            EQU Src6
1089YTRt            EQU Src3
1090
1091qC0             QN  YTR0.S32                ;// for YTRpose
1092qC1             QN  YTR1.S32
1093qC2             QN  YTR2.S32
1094qC3             QN  YTR3.S32
1095qC4             QN  YTR4.S32
1096qC5             QN  YTR5.S32
1097qC6             QN  YTR6.S32
1098qC7             QN  YTR7.S32
1099
1100dD0             DN  YTR0*2+1                ;// for using VSWP
1101dD1             DN  YTR1*2+1
1102dD2             DN  YTR2*2+1
1103dD3             DN  YTR3*2+1
1104dD4             DN  YTR4*2
1105dD5             DN  YTR5*2
1106dD6             DN  YTR6*2
1107dD7             DN  YTR7*2
1108
1109        VTRN        qYf0, qYf1
1110        VTRN        qYf2, qYf3
1111        VTRN        qYf4, qYf5
1112        VTRN        qYf6, qYf7
1113        VTRN        qC0, qC2
1114        VTRN        qC1, qC3
1115        VTRN        qC4, qC6
1116        VTRN        qC5, qC7
1117        VSWP        dD0, dD4
1118        VSWP        dD1, dD5
1119        VSWP        dD2, dD6
1120        VSWP        dD3, dD7
1121
1122
1123dYf0U8          DN YTR0*2.U8
1124dYf1U8          DN YTR1*2.U8
1125dYf2U8          DN YTR2*2.U8
1126dYf3U8          DN YTR3*2.U8
1127dYf4U8          DN YTR4*2.U8
1128dYf5U8          DN YTR5*2.U8
1129dYf6U8          DN YTR6*2.U8
1130dYf7U8          DN YTR7*2.U8
1131
1132        ;//
1133        ;// Do saturation if outsize is other than S16
1134        ;//
1135
1136        IF ("$outsize"="u8")
1137            ;// Output range [0-255]
1138            VQMOVN            dYf0U8, qYf0
1139            VQMOVN            dYf1U8, qYf1
1140            VQMOVN            dYf2U8, qYf2
1141            VQMOVN            dYf3U8, qYf3
1142            VQMOVN            dYf4U8, qYf4
1143            VQMOVN            dYf5U8, qYf5
1144            VQMOVN            dYf6U8, qYf6
1145            VQMOVN            dYf7U8, qYf7
1146        ENDIF
1147
1148        IF ("$outsize"="s9")
1149            ;// Output range [-256 to +255]
1150            VQSHL            qYf0, qYf0, #16-9
1151            VQSHL            qYf1, qYf1, #16-9
1152            VQSHL            qYf2, qYf2, #16-9
1153            VQSHL            qYf3, qYf3, #16-9
1154            VQSHL            qYf4, qYf4, #16-9
1155            VQSHL            qYf5, qYf5, #16-9
1156            VQSHL            qYf6, qYf6, #16-9
1157            VQSHL            qYf7, qYf7, #16-9
1158
1159            VSHR             qYf0, qYf0, #16-9
1160            VSHR             qYf1, qYf1, #16-9
1161            VSHR             qYf2, qYf2, #16-9
1162            VSHR             qYf3, qYf3, #16-9
1163            VSHR             qYf4, qYf4, #16-9
1164            VSHR             qYf5, qYf5, #16-9
1165            VSHR             qYf6, qYf6, #16-9
1166            VSHR             qYf7, qYf7, #16-9
1167        ENDIF
1168
1169        ;// Store output depending on the Stride size
1170        IF "$stride"="s"
1171            VST1        qYf0, [pDest @64], Stride
1172            VST1        qYf1, [pDest @64], Stride
1173            VST1        qYf2, [pDest @64], Stride
1174            VST1        qYf3, [pDest @64], Stride
1175            VST1        qYf4, [pDest @64], Stride
1176            VST1        qYf5, [pDest @64], Stride
1177            VST1        qYf6, [pDest @64], Stride
1178            VST1        qYf7, [pDest @64]
1179        ELSE
1180            IF ("$outsize"="u8")
1181                VST1        dYf0U8, [pDest @64], #8
1182                VST1        dYf1U8, [pDest @64], #8
1183                VST1        dYf2U8, [pDest @64], #8
1184                VST1        dYf3U8, [pDest @64], #8
1185                VST1        dYf4U8, [pDest @64], #8
1186                VST1        dYf5U8, [pDest @64], #8
1187                VST1        dYf6U8, [pDest @64], #8
1188                VST1        dYf7U8, [pDest @64]
1189            ELSE
1190                ;// ("$outsize"="s9") or ("$outsize"="s16")
1191                VST1        qYf0, [pDest @64], #16
1192                VST1        qYf1, [pDest @64], #16
1193                VST1        qYf2, [pDest @64], #16
1194                VST1        qYf3, [pDest @64], #16
1195                VST1        qYf4, [pDest @64], #16
1196                VST1        qYf5, [pDest @64], #16
1197                VST1        qYf6, [pDest @64], #16
1198                VST1        qYf7, [pDest @64]
1199            ENDIF
1200
1201        ENDIF
1202
1203
1204
1205        ENDIF ;// CortexA8
1206
1207
1208
1209        MEND
1210
1211        ;// Scale TWO input rows with TWO rows of 16 bit scale values
1212        ;//
1213        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale one row
1214        ;// input (Eight input values) with one row of scale values. Also
1215        ;// Loads next scale values from pScale, if $LastRow flag is not set.
1216        ;//
1217        ;// Input Registers:
1218        ;//
1219        ;// $dAlo           - Input D register with first four S16 values of row n
1220        ;// $dAhi           - Input D register with next four S16 values of row n
1221        ;// $dBlo           - Input D register with first four S16 values of row n+1
1222        ;// $dBhi           - Input D register with next four S16 values of row n+1
1223        ;// pScale          - Pointer to next row of scale values
1224        ;// qT0lo           - Temporary scratch register
1225        ;// qT0hi           - Temporary scratch register
1226        ;// qT1lo           - Temporary scratch register
1227        ;// qT1hi           - Temporary scratch register
1228        ;// dScale1lo       - Scale value of row n
1229        ;// dScale1hi       - Scale value of row n
1230        ;// dScale2lo       - Scale value of row n+1
1231        ;// dScale2hi       - Scale value of row n+1
1232        ;//
1233        ;// Input Flag
1234        ;//
1235        ;// $LastRow        - Flag to indicate whether current row is last row
1236        ;//
1237        ;// Output Registers:
1238        ;//
1239        ;// $dAlo           - Scaled output values (first four S16 of row n)
1240        ;// $dAhi           - Scaled output values (next four S16 of row n)
1241        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
1242        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
1243        ;// qScale1         - Scale values for next row
1244        ;// qScale2         - Scale values for next row+1
1245        ;// pScale          - Pointer to next row of scale values
1246        ;//
        MACRO
        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
        ;// Widening multiplies: 32-bit products of the S16 inputs and the
        ;// S16 scale values for rows n and n+1
        VMULL       qT0lo, $dAlo, dScale1lo
        VMULL       qT0hi, $dAhi, dScale1hi
        VMULL       qT1lo, $dBlo, dScale2lo
        VMULL       qT1hi, $dBhi, dScale2hi
        IF "$LastRow"="0"
            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+1
            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+2
        ENDIF
        ;// Saturating rounding narrow by 12 bits removes the fixed-point
        ;// scale factor (scale values are treated as Q12 - the products
        ;// are divided by 2^12), returning scaled S16 values in place
        VQRSHRN       $dAlo, qT0lo, #12
        VQRSHRN       $dAhi, qT0hi, #12
        VQRSHRN       $dBlo, qT1lo, #12
        VQRSHRN       $dBhi, qT1hi, #12
        MEND
1262
1263        ;// Scale 8x8 block input values with 16 bit scale values
1264        ;//
1265        ;// This macro is used to pre-scale block of 8x8 input.
        ;// This also does the 1st stage transformations of the IDCT.
1267        ;//
1268        ;// Input Registers:
1269        ;//
1270        ;// dXjnlo          - n th input D register with first four S16 values
1271        ;// dXjnhi          - n th input D register with next four S16 values
1272        ;// qXjn            - n th input Q register with eight S16 values
1273        ;// pScale          - Pointer to scale values
1274        ;//
1275        ;// Output Registers:
1276        ;//
1277        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
1278        ;//
        MACRO
        M_IDCT_PRESCALE16
        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Last rows: skip scale reload
        ;// IStage 4 butterflies (see flow graph in the file header);
        ;// the scalar coefficient loads are interleaved with the NEON ops
        VHADD       qXi5, qXj1, qXj7            ;// i5 = (j1+j7)/2
        VSUB        qXi6, qXj1, qXj7            ;// i6 = j1-j7
        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
        VHADD       qXi3, qXj2, qXj6            ;// i3 = (j2+j6)/2
        VSUB        qXi2, qXj2, qXj6            ;// i2 = j2-j6
        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
        VHADD       qXi7, qXj5, qXj3            ;// i7 = (j5+j3)/2
        VSUB        qXi4, qXj5, qXj3            ;// i4 = j5-j3
        MEND
1296
1297
1298        ;// Scale 8x8 block input values with 32 bit scale values
1299        ;//
1300        ;// This macro is used to pre-scale block of 8x8 input.
        ;// This also does the 1st stage transformations of the IDCT.
1302        ;//
1303        ;// Input Registers:
1304        ;//
1305        ;// dXjnlo          - n th input D register with first four S16 values
1306        ;// dXjnhi          - n th input D register with next four S16 values
1307        ;// qXjn            - n th input Q register with eight S16 values
1308        ;// pScale          - Pointer to 32bit scale values in Q23 format
1309        ;//
1310        ;// Output Registers:
1311        ;//
1312        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
1313        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
1314        ;//
        MACRO
        M_IDCT_PRESCALE32
        ;// Register budget note: only q0-q7 (plus Src4) are used for the
        ;// scale and source values, so the logical names below alias each
        ;// other - rows are processed in pairs and the earlier contents
        ;// are dead by the time an alias is reused.
qScale0lo       QN 0.S32
qScale0hi       QN 1.S32
qScale1lo       QN 2.S32
qScale1hi       QN 3.S32
qScale2lo       QN qScale1lo
qScale2hi       QN qScale1hi
qScale3lo       QN qScale1lo
qScale3hi       QN qScale1hi
qScale4lo       QN qScale1lo
qScale4hi       QN qScale1hi
qScale5lo       QN qScale0lo
qScale5hi       QN qScale0hi
qScale6lo       QN qScale0lo
qScale6hi       QN qScale0hi
qScale7lo       QN qScale0lo
qScale7hi       QN qScale0hi

qSrc0lo         QN 4.S32
qSrc0hi         QN 5.S32
qSrc1lo         QN 6.S32
qSrc1hi         QN Src4.S32
qSrc2lo         QN qSrc0lo
qSrc2hi         QN qSrc0hi
qSrc3lo         QN qSrc0lo
qSrc3hi         QN qSrc0hi
qSrc4lo         QN qSrc0lo
qSrc4hi         QN qSrc0hi
qSrc5lo         QN qSrc1lo
qSrc5hi         QN qSrc1hi
qSrc6lo         QN qSrc1lo
qSrc6hi         QN qSrc1hi
qSrc7lo         QN qSrc0lo
qSrc7hi         QN qSrc0hi

;// 32-bit intermediate results of the row-pair butterflies
qRes17lo        QN qScale0lo
qRes17hi        QN qScale0hi
qRes26lo        QN qScale0lo
qRes26hi        QN qScale0hi
qRes53lo        QN qScale0lo
qRes53hi        QN qScale0hi

            ;// pScale walks rows 0..4 forwards; pTemp walks rows 7..5
            ;// backwards so that each paired row's scale is at hand
            ADD         pTemp, pScale, #4*8*7           ;// Address of  pScale[7]

            ;// Row 0
            VLD1        {qScale0lo, qScale0hi}, [pScale]!
            ;// Widen and pre-shift by 11 (= #(12-1)); together with the
            ;// doubling in VQRDMULH this positions the Q23 product so the
            ;// low 16 bits taken by VMOVN hold the scaled S16 result
            VSHLL       qSrc0lo, dXj0lo, #(12-1)
            VSHLL       qSrc0hi, dXj0hi, #(12-1)
            VLD1        {qScale1lo, qScale1hi}, [pScale]!
            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo     ;// rounding-doubling high half
            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
            VLD1        {qScale7lo, qScale7hi}, [pTemp]!
            VSHLL       qSrc1lo, dXj1lo, #(12-1)
            VSHLL       qSrc1hi, dXj1hi, #(12-1)
            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
            VMOVN       dXi0hi, qSrc0hi
            VSHLL       qSrc7lo, dXj7lo, #(12-1)
            VSHLL       qSrc7hi, dXj7hi, #(12-1)
            ;// Rewind pTemp past the 32-byte load just done plus one
            ;// 32-byte scale row, leaving it at pScale[6]
            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
            VLD1        {qScale2lo, qScale2hi}, [pScale]!

            ;// Row 1 & 7: IStage 4 butterfly in 32 bits, then narrow
            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
            VMOVN       dXi5lo, qRes17lo                ;// Output i5
            VMOVN       dXi5hi, qRes17hi
            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
            VMOVN       dXi6lo, qRes17lo                ;// Output i6
            VMOVN       dXi6hi, qRes17hi
            VSHLL       qSrc2lo, dXj2lo, #(12-1)
            VSHLL       qSrc2hi, dXj2hi, #(12-1)
            VLD1        {qScale6lo, qScale6hi}, [pTemp]!
            VSHLL       qSrc6lo, dXj6lo, #(12-1)
            VSHLL       qSrc6hi, dXj6hi, #(12-1)
            ;// Rewind pTemp to pScale[5] (same stepping as above)
            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
            VLD1        {qScale3lo, qScale3hi}, [pScale]!

            ;// Row 2 & 6
            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
            VMOVN       dXi3lo, qRes26lo                ;// Output i3
            VMOVN       dXi3hi, qRes26hi
            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
            VMOVN       dXi2lo, qRes26lo                ;// Output i2
            VMOVN       dXi2hi, qRes26hi
            VSHLL       qSrc3lo, dXj3lo, #(12-1)
            VSHLL       qSrc3hi, dXj3hi, #(12-1)
            VLD1        {qScale5lo, qScale5hi}, [pTemp]!
            VSHLL       qSrc5lo, dXj5lo, #(12-1)
            VSHLL       qSrc5hi, dXj5hi, #(12-1)
            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi

            ;// Row 3 & 5
            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
            ;// NOTE(review): rewinds pSrc by 64 bytes to re-load source
            ;// row 4 below; assumes the caller left pSrc just past the
            ;// loaded input rows - confirm against the invoking macro
            SUB         pSrc, pSrc, #16*2*2
            VMOVN       dXi7lo, qRes53lo                ;// Output i7
            VMOVN       dXi7hi, qRes53hi
            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
            VLD1        qXj4, [pSrc @64]                ;// Re-load source row 4
            VMOVN       dXi4lo, qRes53lo                ;// Output i4
            VMOVN       dXi4hi, qRes53hi
            VSHLL       qSrc4lo, dXj4lo, #(12-1)
            VSHLL       qSrc4hi, dXj4hi, #(12-1)
            VLD1        {qScale4lo, qScale4hi}, [pScale]
            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
            ;// Row 4 (scaled row 4 becomes i1: IStage 4 gives i1 = j4)
            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
            VMOVN       dXi1hi, qSrc4hi

        MEND
1444
1445        END
1446