;
; jiss2fst.asm - fast integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point configuration.
;
; CONST_BITS is the number of fraction bits carried by the F_x_xxx
; multipliers below; PASS1_BITS must agree with the scaling that the
; dequantization table was built with (IFAST_SCALE_BITS).

%define CONST_BITS      8       ; 14 is also OK.
%define PASS1_BITS      2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
; Pre-rounded values for the default precision.
F_1_082 equ     277             ; FIX(1.082392200)
F_1_414 equ     362             ; FIX(1.414213562)
F_1_847 equ     473             ; FIX(1.847759065)
F_2_613 equ     669             ; FIX(2.613125930)
%else
; NASM cannot do compile-time arithmetic on floating-point constants,
; so the multipliers are written as 30-bit fixed-point integers and
; rounded down to CONST_BITS fraction bits here.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
%endif

; FIX(2.613125930) - FIX(1); FIX(1) is (1 << CONST_BITS) at either precision.
F_1_613 equ     (F_2_613 - (1 << CONST_BITS))

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; The multiplicand is shifted left by PRE_MULTIPLY_SCALE_BITS and the
; constant by CONST_SHIFT before pmulhw, which keeps the upper 16 bits of
; each 32-bit product.  Constraints:
;
;   PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
;   CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT             (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

        alignz  16
        global  EXTN(jconst_idct_ifast_sse2) PRIVATE

EXTN(jconst_idct_ifast_sse2):

; Each multiplier is replicated across the eight word lanes of one xmmword.
PW_F1414        times 8  dw  F_1_414 << CONST_SHIFT
PW_F1847        times 8  dw  F_1_847 << CONST_SHIFT
PW_MF1613       times 8  dw  (-F_1_613) << CONST_SHIFT
PW_F1082        times 8  dw  F_1_082 << CONST_SHIFT
; Byte-wise bias added to the packed results of the second pass.
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack relative to b = esp as it stands right
; after "push ebp" (called "original ebp" below):
;   dct_table(b)   multiplier table; rows 0..7 are each applied with pmullw
;                  to the matching row of coef_block (eight 16-bit lanes)
;   coef_block(b)  input coefficients, rows 0..7 loaded as xmmwords
;   output_buf(b)  array of eight row pointers
;   output_col(b)  sample offset added to every row pointer
;
; Both dct_table and coef_block are accessed with movdqa / SSE2 memory
; operands, which require 16-byte aligned addresses.
;
; Result: eight samples are stored (movq) at output_buf[r] + output_col for
; r = 0..7.  The samples are produced by packsswb (signed saturation to
; bytes) followed by a byte-wise add of PB_CENTERJSAMP.
;
; Stack frame: ebp is aligned to SIZEOF_XMMWORD, [ebp] holds the original
; ebp value, and wk(0)..wk(WK_NUM-1) are aligned xmmword scratch slots
; immediately below ebp.
;
; Registers: ebx (through pushpic/poppic), esi and edi are saved and
; restored; eax, edx, xmm0-xmm7 and the flags are overwritten.
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_idct_ifast_sse2) PRIVATE

EXTN(jsimd_idct_ifast_sse2):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax                       ; keep original ebp at [aligned ebp]
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area
        pushpic ebx
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input.

        ; eax still holds the original ebp here, so no reload is needed.
;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
        ; Cheap early-out on two dwords before OR-ing rows 1..7 together.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     xmm1,xmm0
        ; Saturating packs keep any non-zero word non-zero, so the eight
        ; words collapse into one dword that is zero only if all were zero.
        packsswb xmm1,xmm1
        packsswb xmm1,xmm1
        movd    eax,xmm1
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero

        ; Only row 0 contributes: dequantize it and broadcast each element
        ; across a whole register, giving the already-transposed columns.
        ; The table is the IFAST multiplier table, so it is indexed with the
        ; IFAST element size, exactly as in .columnDCT below.
        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)

        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
        jmp     near .column_end
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        psubw   xmm0,xmm2               ; xmm0=tmp11
        psubw   xmm1,xmm3
        paddw   xmm4,xmm2               ; xmm4=tmp10
        paddw   xmm5,xmm3               ; xmm5=tmp13

        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]
        psubw   xmm1,xmm5               ; xmm1=tmp12

        movdqa  xmm6,xmm4
        movdqa  xmm7,xmm0
        psubw   xmm4,xmm5               ; xmm4=tmp3
        psubw   xmm0,xmm1               ; xmm0=tmp2
        paddw   xmm6,xmm5               ; xmm6=tmp0
        paddw   xmm7,xmm1               ; xmm7=tmp1

        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2

        ; -- Odd part

        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm2
        movdqa  xmm0,xmm5
        psubw   xmm2,xmm1               ; xmm2=z12
        psubw   xmm5,xmm3               ; xmm5=z10
        paddw   xmm4,xmm1               ; xmm4=z11
        paddw   xmm0,xmm3               ; xmm0=z13

        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm3,xmm4
        psubw   xmm4,xmm0
        paddw   xmm3,xmm0               ; xmm3=tmp7

        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]     ; xmm4=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movdqa  xmm0,xmm5
        paddw   xmm5,xmm2
        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]     ; xmm5=z5
        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]
        psubw   xmm0,xmm1
        psubw   xmm2,xmm5               ; xmm2=tmp10
        paddw   xmm0,xmm5               ; xmm0=tmp12

        ; -- Final output stage

        psubw   xmm0,xmm3               ; xmm0=tmp6
        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm7
        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
        psubw   xmm4,xmm0               ; xmm4=tmp5

        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)

        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

        paddw   xmm2,xmm4               ; xmm2=tmp4
        movdqa  xmm5,xmm7
        movdqa  xmm0,xmm1
        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)

        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)

        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)

        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)

        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)

        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3

        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

        ; Both paths arrive here with:
        ;   xmm6=col0, wk(0)=col1, xmm5=col2, wk(1)=col3,
        ;   xmm1=col4, xmm4=col5,  xmm3=col6, xmm7=col7

        ; -- Prefetch the next coefficient block

        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.

        mov     eax, [original_ebp]
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]

        ; -- Even part

        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

        movdqa  xmm2,xmm6
        movdqa  xmm0,xmm5
        psubw   xmm6,xmm1               ; xmm6=tmp11
        psubw   xmm5,xmm3
        paddw   xmm2,xmm1               ; xmm2=tmp10
        paddw   xmm0,xmm3               ; xmm0=tmp13

        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]
        psubw   xmm5,xmm0               ; xmm5=tmp12

        movdqa  xmm1,xmm2
        movdqa  xmm3,xmm6
        psubw   xmm2,xmm0               ; xmm2=tmp3
        psubw   xmm6,xmm5               ; xmm6=tmp2
        paddw   xmm1,xmm0               ; xmm1=tmp0
        paddw   xmm3,xmm5               ; xmm3=tmp1

        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

        ; -- Odd part

        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

        movdqa  xmm2,xmm0
        movdqa  xmm6,xmm4
        psubw   xmm0,xmm7               ; xmm0=z12
        psubw   xmm4,xmm5               ; xmm4=z10
        paddw   xmm2,xmm7               ; xmm2=z11
        paddw   xmm6,xmm5               ; xmm6=z13

        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm5,xmm2
        psubw   xmm2,xmm6
        paddw   xmm5,xmm6               ; xmm5=tmp7

        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]     ; xmm2=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movdqa  xmm6,xmm4
        paddw   xmm4,xmm0
        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]     ; xmm4=z5
        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]
        psubw   xmm6,xmm7
        psubw   xmm0,xmm4               ; xmm0=tmp10
        paddw   xmm6,xmm4               ; xmm6=tmp12

        ; -- Final output stage

        psubw   xmm6,xmm5               ; xmm6=tmp6
        movdqa  xmm7,xmm1
        movdqa  xmm4,xmm3
        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
        psraw   xmm1,(PASS1_BITS+3)     ; descale
        psraw   xmm3,(PASS1_BITS+3)     ; descale
        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
        psraw   xmm7,(PASS1_BITS+3)     ; descale
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psubw   xmm2,xmm6               ; xmm2=tmp5

        ; Signed-saturating pack narrows the words to bytes.
        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

        paddw   xmm0,xmm2               ; xmm0=tmp4
        movdqa  xmm4,xmm5
        movdqa  xmm7,xmm6
        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
        psraw   xmm5,(PASS1_BITS+3)     ; descale
        psraw   xmm6,(PASS1_BITS+3)     ; descale
        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psraw   xmm7,(PASS1_BITS+3)     ; descale

        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]

        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

        ; Re-centre the signed bytes by adding CENTERJSAMPLE to every lane.
        paddb     xmm1,xmm2
        paddb     xmm3,xmm2
        paddb     xmm5,xmm2
        paddb     xmm7,xmm2

        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

        ; Swap the 64-bit halves so the odd rows sit in the low quadword,
        ; where movq can store them.
        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

        ; Even rows 0, 2, 4, 6 (edi = output_buf, eax = output_col).
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7

        ; Odd rows 1, 3, 5, 7.
        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
        poppic  ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16