1;
2; jfdctflt.asm - floating-point FDCT (SSE)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on the x86 SIMD extension for IJG JPEG library
7; Copyright (C) 1999-2006, MIYASAKA Masaru.
8; For conditions of distribution and use, see copyright notice in jsimdext.inc
9;
10; This file should be assembled with NASM (Netwide Assembler); it
11; can *not* be assembled with Microsoft's MASM or any compatible
12; assembler (including Borland's Turbo Assembler).
13; NASM is available from http://nasm.sourceforge.net/ or
14; http://sourceforge.net/project/showfiles.php?group_id=6208
15;
16; This file contains a floating-point implementation of the forward DCT
17; (Discrete Cosine Transform). The following code is based directly on
18; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
19;
20; [TAB8]
21
22%include "jsimdext.inc"
23%include "jdct.inc"
24
25; --------------------------------------------------------------------------
26
27%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
28        shufps  %1,%2,0x44
29%endmacro
30
31%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
32        shufps  %1,%2,0xEE
33%endmacro
34
35; --------------------------------------------------------------------------
36        SECTION SEG_CONST
37
38        alignz  16
39        global  EXTN(jconst_fdct_float_sse)
40
41EXTN(jconst_fdct_float_sse):
42
43PD_0_382        times 4 dd  0.382683432365089771728460
44PD_0_707        times 4 dd  0.707106781186547524400844
45PD_0_541        times 4 dd  0.541196100146196984399723
46PD_1_306        times 4 dd  1.306562964876376527856643
47
48        alignz  16
49
50; --------------------------------------------------------------------------
51        SECTION SEG_TEXT
52        BITS    32
53;
54; Perform the forward DCT on one block of samples.
55;
56; GLOBAL(void)
57; jsimd_fdct_float_sse (FAST_FLOAT *data)
58;
59
60%define data(b)         (b)+8           ; FAST_FLOAT *data
61
62%define original_ebp    ebp+0
63%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
64%define WK_NUM          2
65
66        align   16
67        global  EXTN(jsimd_fdct_float_sse)
68
69EXTN(jsimd_fdct_float_sse):
70        push    ebp
71        mov     eax,esp                         ; eax = original ebp
72        sub     esp, byte 4
73        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
74        mov     [esp],eax
75        mov     ebp,esp                         ; ebp = aligned ebp
76        lea     esp, [wk(0)]
77        pushpic ebx
78;       push    ecx             ; need not be preserved
79;       push    edx             ; need not be preserved
80;       push    esi             ; unused
81;       push    edi             ; unused
82
83        get_GOT ebx             ; get GOT address
84
85        ; ---- Pass 1: process rows.
86
87        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
88        mov     ecx, DCTSIZE/4
89        alignx  16,7
90.rowloop:
91
92        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
93        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
94        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
95        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
96
97        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
98        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
99
100        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
101        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
102        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
103        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
104        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
105        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
106
107        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
108        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
109        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
110        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
111
112        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
113        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
114
115        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
116        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
117
118        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
119        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
120        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
121        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
122        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
123        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
124
125        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
126        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
127        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
128        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
129        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
130        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
131
132        movaps  xmm0,xmm7
133        movaps  xmm5,xmm6
134        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
135        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
136        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
137        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
138
139        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
140        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
141        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
142        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
143
144        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
145        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
146        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
147        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
148        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
149        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
150
151        movaps  xmm2,xmm7
152        movaps  xmm3,xmm4
153        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
154        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
155        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
156        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
157
158        ; -- Even part
159
160        movaps  xmm1,xmm5
161        movaps  xmm6,xmm0
162        subps   xmm5,xmm7               ; xmm5=tmp13
163        subps   xmm0,xmm4               ; xmm0=tmp12
164        addps   xmm1,xmm7               ; xmm1=tmp10
165        addps   xmm6,xmm4               ; xmm6=tmp11
166
167        addps   xmm0,xmm5
168        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
169
170        movaps  xmm7,xmm1
171        movaps  xmm4,xmm5
172        subps   xmm1,xmm6               ; xmm1=data4
173        subps   xmm5,xmm0               ; xmm5=data6
174        addps   xmm7,xmm6               ; xmm7=data0
175        addps   xmm4,xmm0               ; xmm4=data2
176
177        movaps  XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
178        movaps  XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
179        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
180        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
181
182        ; -- Odd part
183
184        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
185        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
186
187        addps   xmm2,xmm3               ; xmm2=tmp10
188        addps   xmm3,xmm6               ; xmm3=tmp11
189        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
190
191        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
192
193        movaps  xmm1,xmm2               ; xmm1=tmp10
194        subps   xmm2,xmm6
195        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
196        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
197        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
198        addps   xmm1,xmm2               ; xmm1=z2
199        addps   xmm6,xmm2               ; xmm6=z4
200
201        movaps  xmm5,xmm0
202        subps   xmm0,xmm3               ; xmm0=z13
203        addps   xmm5,xmm3               ; xmm5=z11
204
205        movaps  xmm7,xmm0
206        movaps  xmm4,xmm5
207        subps   xmm0,xmm1               ; xmm0=data3
208        subps   xmm5,xmm6               ; xmm5=data7
209        addps   xmm7,xmm1               ; xmm7=data5
210        addps   xmm4,xmm6               ; xmm4=data1
211
212        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
213        movaps  XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
214        movaps  XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
215        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
216
217        add     edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
218        dec     ecx
219        jnz     near .rowloop
220
221        ; ---- Pass 2: process columns.
222
223        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
224        mov     ecx, DCTSIZE/4
225        alignx  16,7
226.columnloop:
227
228        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
229        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
230        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
231        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
232
233        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
234        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
235
236        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
237        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
238        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
239        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
240        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
241        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
242
243        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
244        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
245        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
246        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
247
248        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
249        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
250
251        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
252        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
253
254        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
255        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
256        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
257        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
258        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
259        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
260
261        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
262        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
263        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
264        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
265        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
266        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
267
268        movaps  xmm0,xmm7
269        movaps  xmm5,xmm6
270        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
271        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
272        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
273        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
274
275        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
276        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
277        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
278        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
279
280        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
281        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
282        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
283        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
284        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
285        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
286
287        movaps  xmm2,xmm7
288        movaps  xmm3,xmm4
289        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
290        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
291        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
292        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
293
294        ; -- Even part
295
296        movaps  xmm1,xmm5
297        movaps  xmm6,xmm0
298        subps   xmm5,xmm7               ; xmm5=tmp13
299        subps   xmm0,xmm4               ; xmm0=tmp12
300        addps   xmm1,xmm7               ; xmm1=tmp10
301        addps   xmm6,xmm4               ; xmm6=tmp11
302
303        addps   xmm0,xmm5
304        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
305
306        movaps  xmm7,xmm1
307        movaps  xmm4,xmm5
308        subps   xmm1,xmm6               ; xmm1=data4
309        subps   xmm5,xmm0               ; xmm5=data6
310        addps   xmm7,xmm6               ; xmm7=data0
311        addps   xmm4,xmm0               ; xmm4=data2
312
313        movaps  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
314        movaps  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
315        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
316        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
317
318        ; -- Odd part
319
320        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
321        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
322
323        addps   xmm2,xmm3               ; xmm2=tmp10
324        addps   xmm3,xmm6               ; xmm3=tmp11
325        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
326
327        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
328
329        movaps  xmm1,xmm2               ; xmm1=tmp10
330        subps   xmm2,xmm6
331        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
332        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
333        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
334        addps   xmm1,xmm2               ; xmm1=z2
335        addps   xmm6,xmm2               ; xmm6=z4
336
337        movaps  xmm5,xmm0
338        subps   xmm0,xmm3               ; xmm0=z13
339        addps   xmm5,xmm3               ; xmm5=z11
340
341        movaps  xmm7,xmm0
342        movaps  xmm4,xmm5
343        subps   xmm0,xmm1               ; xmm0=data3
344        subps   xmm5,xmm6               ; xmm5=data7
345        addps   xmm7,xmm1               ; xmm7=data5
346        addps   xmm4,xmm6               ; xmm4=data1
347
348        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
349        movaps  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
350        movaps  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
351        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
352
353        add     edx, byte 4*SIZEOF_FAST_FLOAT
354        dec     ecx
355        jnz     near .columnloop
356
357;       pop     edi             ; unused
358;       pop     esi             ; unused
359;       pop     edx             ; need not be preserved
360;       pop     ecx             ; need not be preserved
361        poppic  ebx
362        mov     esp,ebp         ; esp <- aligned ebp
363        pop     esp             ; esp <- original ebp
364        pop     ebp
365        ret
366
367; For some reason, the OS X linker does not honor the request to align the
368; segment unless we do this.
369        align   16
370