1;
2; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a fast, not so accurate integer implementation of
18; the forward DCT (Discrete Cosine Transform). The following code is
19; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
20; for more details.
21;
22; [TAB8]
23
24%include "jsimdext.inc"
25%include "jdct.inc"
26
27; --------------------------------------------------------------------------
28
29%define CONST_BITS      8       ; fractional bits of the F_* constants; 14 is also OK.
30
; F_* are the DCT multipliers in fixed point: FIX(x) is x * 2^CONST_BITS
; rounded to the nearest integer.  For CONST_BITS == 8 the values are
; written out literally; otherwise they are derived from the same
; constants pre-scaled by 2^30.
31%if CONST_BITS == 8
32F_0_382 equ      98             ; FIX(0.382683433)
33F_0_541 equ     139             ; FIX(0.541196100)
34F_0_707 equ     181             ; FIX(0.707106781)
35F_1_306 equ     334             ; FIX(1.306562965)
36%else
37; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x,n) is a rounding right shift: add half of 2^n, then shift by n.
; The integer literals below are the multipliers scaled by 2^30, so
; shifting them right by (30-CONST_BITS) leaves CONST_BITS fractional bits.
38%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
39F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
40F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
41F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
42F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
43%endif
44
45; --------------------------------------------------------------------------
46        SECTION SEG_CONST
47
48; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
49; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; Why the sum must be 16: the FDCT code shifts each data word left by
; PRE_MULTIPLY_SCALE_BITS (psllw) and this table stores each F_* constant
; shifted left by CONST_SHIFT.  pmulhw keeps the upper 16 bits of every
; signed 16x16-bit product, so the result is (x * F_*) >> CONST_BITS.
; With CONST_BITS == 8 this makes CONST_SHIFT == 6.
50
51%define PRE_MULTIPLY_SCALE_BITS   2
52%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
53
; alignz is not defined in this file (presumably a zero-padding align macro
; from jsimdext.inc - confirm).  The table is used as a memory operand of
; pmulhw, so it is aligned to 16 bytes here.
54        alignz  16
55        global  EXTN(jconst_fdct_ifast_sse2)
56
57EXTN(jconst_fdct_ifast_sse2):
58
; Each entry is one constant replicated into 8 words (16 bytes), i.e. one
; full xmmword with the same multiplier in every lane.
59PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
60PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
61PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
62PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
63
64        alignz  16
65
66; --------------------------------------------------------------------------
67        SECTION SEG_TEXT
68        BITS    64
69;
70; Perform the forward DCT on one block of samples.
71;
72; GLOBAL(void)
73; jsimd_fdct_ifast_sse2 (DCTELEM *data)
74;
; The 8x8 block at `data` is transformed in place: all eight rows are loaded,
; run through two 1-D DCT passes (each working on an in-register 8x8 word
; transpose, eight 16-bit lanes per xmm register), and stored back to the
; same XMMBLOCK addresses they were loaded from.
;
; Every load/store of the block uses movdqa, so the row addresses formed
; from `data` must be 16-byte aligned.
;
; Registers: rax, rdx and xmm0-xmm7 are overwritten; rbp is saved and
; restored.  Two xmmwords of stack scratch (wk[]) are used as spill slots.
; NOTE(review): xmm6/xmm7 are callee-saved in the Microsoft x64 convention;
; presumably collect_args/uncollect_args (not defined in this file) take
; care of that on Win64 - confirm.
75
76; r10 = DCTELEM *data
77
; wk(i) addresses the i-th xmmword of a WK_NUM-entry scratch array that sits
; immediately below the aligned frame base held in rbp.
78%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
79%define WK_NUM          2
80
81        align   16
82        global  EXTN(jsimd_fdct_ifast_sse2)
83
84EXTN(jsimd_fdct_ifast_sse2):
        ; Prologue: build a 16-byte aligned frame so wk[] can be accessed
        ; with movdqa, remembering the unaligned rsp at the frame base.
85        push    rbp
86        mov     rax,rsp                         ; rax = rsp after the push (address of saved rbp)
        ; NOTE(review): only 4 bytes are skipped, yet an 8-byte value is
        ; stored at [rsp] below.  The store stays clear of the saved rbp
        ; only if rsp is at least 8-byte aligned here - confirm.
87        sub     rsp, byte 4
88        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
89        mov     [rsp],rax                       ; frame base holds the pre-alignment rsp
90        mov     rbp,rsp                         ; rbp = aligned rbp
91        lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below rbp
        ; collect_args is a macro defined outside this file; the code below
        ; expects the `data` argument in r10 after it.
92        collect_args
93
94        ; ---- Pass 1: process rows.
95
96        mov     rdx, r10        ; (DCTELEM *)
97
98        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
99        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
100        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
101        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
102
103        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
104        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
105
106        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
107        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
108        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
109        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
110        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
111        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
112
113        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
114        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
115        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
116        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
117
118        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
119        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
120
        ; All eight registers xmm0-xmm7 now hold live data, so two of them
        ; are spilled to wk[] to free temporaries for the transpose.
121        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
122        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
123
124        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
125        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
126        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
127        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
128        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
129        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
130
131        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
132        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
133        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
134        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
135        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
136        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
137
        ; Swap: reload the two spilled vectors and spill two others.
138        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
139        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
140        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
141        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
142
143        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
144        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
145        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
146        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
147        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
148        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
149
150        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
151        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
152        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
153        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
154        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
155        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
156
157        movdqa  xmm6,xmm1
158        movdqa  xmm3,xmm0
159        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
160        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
161        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
162        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
163
        ; tmp6/tmp7 are only needed in the odd part, so park them in wk[]
        ; while the remaining columns are brought back in.
164        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
165        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
166        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
167        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
168
169        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
170        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
171        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
172        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
173        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
174        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
175
176        movdqa  xmm2,xmm1
177        movdqa  xmm5,xmm7
178        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
179        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
180        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
181        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
182
183        ; -- Even part
184
185        movdqa  xmm4,xmm3
186        movdqa  xmm0,xmm6
187        psubw   xmm3,xmm1               ; xmm3=tmp13
188        psubw   xmm6,xmm7               ; xmm6=tmp12
189        paddw   xmm4,xmm1               ; xmm4=tmp10
190        paddw   xmm0,xmm7               ; xmm0=tmp11
191
192        paddw   xmm6,xmm3               ; xmm6=tmp12+tmp13
193        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS    ; pre-scale for pmulhw
194        pmulhw  xmm6,[rel PW_F0707] ; xmm6=z1
195
196        movdqa  xmm1,xmm4
197        movdqa  xmm7,xmm3
198        psubw   xmm4,xmm0               ; xmm4=data4
199        psubw   xmm3,xmm6               ; xmm3=data6
200        paddw   xmm1,xmm0               ; xmm1=data0
201        paddw   xmm7,xmm6               ; xmm7=data2
202
        ; Fetch tmp6/tmp7 for the odd part and park data4/data6 in their place.
203        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
204        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
205        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
206        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
207
208        ; -- Odd part
209
210        paddw   xmm2,xmm5               ; xmm2=tmp10
211        paddw   xmm5,xmm0               ; xmm5=tmp11
212        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
213
        ; Pre-scale every pmulhw input by PRE_MULTIPLY_SCALE_BITS.
214        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
215        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
216
217        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
218        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z3
219
220        movdqa  xmm4,xmm2               ; xmm4=tmp10
221        psubw   xmm2,xmm0               ; xmm2=tmp10-tmp12 (both pre-scaled)
222        pmulhw  xmm2,[rel PW_F0382] ; xmm2=z5
223        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
224        pmulhw  xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
225        paddw   xmm4,xmm2               ; xmm4=z2
226        paddw   xmm0,xmm2               ; xmm0=z4
227
228        movdqa  xmm3,xmm6
229        psubw   xmm6,xmm5               ; xmm6=z13
230        paddw   xmm3,xmm5               ; xmm3=z11
231
232        movdqa  xmm2,xmm6
233        movdqa  xmm5,xmm3
234        psubw   xmm6,xmm4               ; xmm6=data3
235        psubw   xmm3,xmm0               ; xmm3=data7
236        paddw   xmm2,xmm4               ; xmm2=data5
237        paddw   xmm5,xmm0               ; xmm5=data1
238
239        ; ---- Pass 2: process columns.
240
        ; The pass-1 results stay in registers/wk[]; they are transposed
        ; back and run through the same butterfly network again.
241        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
242        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
243
244        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
245        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
246        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
247        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
248        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
249        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
250
251        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
252        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
253
254        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
255        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
256
257        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
258        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
259
260        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
261        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
262        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
263        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
264        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
265        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
266
267        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
268        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
269        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
270        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
271        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
272        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
273
274        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
275        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
276        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
277        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
278
279        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
280        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
281        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
282        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
283        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
284        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
285
286        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
287        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
288        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
289        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
290        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
291        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
292
293        movdqa  xmm5,xmm6
294        movdqa  xmm3,xmm1
295        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
296        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
297        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
298        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
299
300        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
301        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
302        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
303        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
304
305        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
306        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
307        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
308        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
309        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
310        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
311
312        movdqa  xmm7,xmm6
313        movdqa  xmm0,xmm2
314        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
315        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
316        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
317        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
318
319        ; -- Even part
320
321        movdqa  xmm4,xmm3
322        movdqa  xmm1,xmm5
323        psubw   xmm3,xmm6               ; xmm3=tmp13
324        psubw   xmm5,xmm2               ; xmm5=tmp12
325        paddw   xmm4,xmm6               ; xmm4=tmp10
326        paddw   xmm1,xmm2               ; xmm1=tmp11
327
328        paddw   xmm5,xmm3               ; xmm5=tmp12+tmp13
329        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS    ; pre-scale for pmulhw
330        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z1
331
332        movdqa  xmm6,xmm4
333        movdqa  xmm2,xmm3
334        psubw   xmm4,xmm1               ; xmm4=data4
335        psubw   xmm3,xmm5               ; xmm3=data6
336        paddw   xmm6,xmm1               ; xmm6=data0
337        paddw   xmm2,xmm5               ; xmm2=data2
338
        ; Store the even-numbered output rows back over the input block.
339        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
340        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
341        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
342        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
343
344        ; -- Odd part
345
346        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
347        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
348
349        paddw   xmm7,xmm0               ; xmm7=tmp10
350        paddw   xmm0,xmm1               ; xmm0=tmp11
351        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
352
        ; Pre-scale every pmulhw input by PRE_MULTIPLY_SCALE_BITS.
353        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
354        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
355
356        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
357        pmulhw  xmm0,[rel PW_F0707] ; xmm0=z3
358
359        movdqa  xmm4,xmm7               ; xmm4=tmp10
360        psubw   xmm7,xmm1               ; xmm7=tmp10-tmp12 (both pre-scaled)
361        pmulhw  xmm7,[rel PW_F0382] ; xmm7=z5
362        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
363        pmulhw  xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
364        paddw   xmm4,xmm7               ; xmm4=z2
365        paddw   xmm1,xmm7               ; xmm1=z4
366
367        movdqa  xmm3,xmm5
368        psubw   xmm5,xmm0               ; xmm5=z13
369        paddw   xmm3,xmm0               ; xmm3=z11
370
371        movdqa  xmm6,xmm5
372        movdqa  xmm2,xmm3
373        psubw   xmm5,xmm4               ; xmm5=data3
374        psubw   xmm3,xmm1               ; xmm3=data7
375        paddw   xmm6,xmm4               ; xmm6=data5
376        paddw   xmm2,xmm1               ; xmm2=data1
377
        ; Store the odd-numbered output rows back over the input block.
378        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
379        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
380        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
381        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
382
        ; Epilogue: undo the prologue in reverse order.
383        uncollect_args
384        mov     rsp,rbp         ; rsp <- aligned rbp
385        pop     rsp             ; rsp <- pre-alignment rsp saved at [rbp]
386        pop     rbp             ; restore the caller's rbp
387        ret
388
389; For some reason, the OS X linker does not honor the request to align the
390; segment unless we do this.
391        align   16
392