;
; jidctflt.asm - floating-point IDCT (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Two-operand shuffle helpers used by the 4x4 float transposes.
;
; Both expand to a single SHUFPS.  The immediate selects, from low to high
; result lane: two lanes of the destination (%1), then two lanes of the
; source (%2).  The only register written is %1.

%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
        shufps  %1,%2,0x44      ; 0x44 = 01 00 01 00b: low halves of %1 and %2
%endmacro

%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
        shufps  %1,%2,0xEE      ; 0xEE = 11 10 11 10b: high halves of %1 and %2
%endmacro

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Constant table, exported as jconst_idct_float_sse.
;
; Each PD_* entry is one single-precision value replicated into all four
; lanes, so it can be used directly as a 16-byte memory operand of MULPS /
; MOVAPS.  With cN = cos(N*pi/16):
;   PD_1_414  =  2*c4        (sqrt(2))
;   PD_1_847  =  2*c2
;   PD_1_082  =  2*(c2-c6)
;   PD_M2_613 = -2*(c2+c6)
;   PD_0_125  =  1/8, the descale factor applied in the row pass
; PB_CENTERJSAMP is 8 bytes of CENTERJSAMPLE (defined by the includes).
; The table starts and ends on a 16-byte boundary.

        alignz  16
        global  EXTN(jconst_idct_float_sse)

EXTN(jconst_idct_float_sse):

PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
PD_1_082        times 4 dd  1.082392200292393968799446
PD_M2_613       times 4 dd -2.613125929752753055713286
PD_0_125        times 4 dd  0.125       ; 1/8
PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Stack-argument accessors.  (b) is a register holding the value of esp
; taken right after the function's "push ebp", so (b)+0 is the saved ebp,
; (b)+4 the return address and the arguments start at (b)+8.

%define dct_table(b)    (b)+8           ; void *dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

; Slot at the base of the realigned frame; the prologue stores the
; unaligned frame pointer there so the arguments can be reached again.
%define original_ebp    ebp+0

; The next definition, wk(i), is split across a physical line break in this
; file.  The trailing backslash is NASM's line continuation, so the
; directive below is completed by the text that begins the following line.
%define \
wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] 70e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define WK_NUM 2 71e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT 72e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; FAST_FLOAT workspace[DCTSIZE2] 73a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 74e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 75e5eaf37440b8e337ab150c017df7c03faf846c51DRC global EXTN(jsimd_idct_float_sse) 76a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 77018fc42974f125bb8791eb81390137c562d15693Pierre OssmanEXTN(jsimd_idct_float_sse): 78e5eaf37440b8e337ab150c017df7c03faf846c51DRC push ebp 79e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax,esp ; eax = original ebp 80e5eaf37440b8e337ab150c017df7c03faf846c51DRC sub esp, byte 4 81e5eaf37440b8e337ab150c017df7c03faf846c51DRC and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 82e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov [esp],eax 83e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ebp,esp ; ebp = aligned ebp 84e5eaf37440b8e337ab150c017df7c03faf846c51DRC lea esp, [workspace] 85e5eaf37440b8e337ab150c017df7c03faf846c51DRC push ebx 86e5eaf37440b8e337ab150c017df7c03faf846c51DRC; push ecx ; need not be preserved 87e5eaf37440b8e337ab150c017df7c03faf846c51DRC; push edx ; need not be preserved 88e5eaf37440b8e337ab150c017df7c03faf846c51DRC push esi 89e5eaf37440b8e337ab150c017df7c03faf846c51DRC push edi 90e5eaf37440b8e337ab150c017df7c03faf846c51DRC 91e5eaf37440b8e337ab150c017df7c03faf846c51DRC get_GOT ebx ; get GOT address 92e5eaf37440b8e337ab150c017df7c03faf846c51DRC 93e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 1: process columns from input, store into work array. 
94e5eaf37440b8e337ab150c017df7c03faf846c51DRC 95e5eaf37440b8e337ab150c017df7c03faf846c51DRC; mov eax, [original_ebp] 96e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edx, POINTER [dct_table(eax)] ; quantptr 97e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov esi, JCOEFPTR [coef_block(eax)] ; inptr 986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis lea edi, [workspace] ; FAST_FLOAT *wsptr 99e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ecx, DCTSIZE/4 ; ctr 100e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignx 16,7 101a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.columnloop: 102a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE 103e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] 104e5eaf37440b8e337ab150c017df7c03faf846c51DRC or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] 105e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .columnDCT 106e5eaf37440b8e337ab150c017df7c03faf846c51DRC 107e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 108e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 109e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 110e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 111e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 112e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 113e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 114e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm1,mm0 115e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm1,mm1 116e5eaf37440b8e337ab150c017df7c03faf846c51DRC movd eax,mm1 117e5eaf37440b8e337ab150c017df7c03faf846c51DRC test eax,eax 118e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz short .columnDCT 
119e5eaf37440b8e337ab150c017df7c03faf846c51DRC 120e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- AC terms all zero 121e5eaf37440b8e337ab150c017df7c03faf846c51DRC 122e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 123e5eaf37440b8e337ab150c017df7c03faf846c51DRC 124e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) 125e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm0 ; mm0=(00 00 01 01) 126e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) 127e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) 128e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) 129e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) 130e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) 131e5eaf37440b8e337ab150c017df7c03faf846c51DRC 132e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 133e5eaf37440b8e337ab150c017df7c03faf846c51DRC 134e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm0 135e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2,xmm0 136e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 137e5eaf37440b8e337ab150c017df7c03faf846c51DRC 138e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) 139e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) 140e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) 141e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) 142e5eaf37440b8e337ab150c017df7c03faf846c51DRC 143e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 144e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 
145e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 146e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 147e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 148e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 149e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 150e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 151e5eaf37440b8e337ab150c017df7c03faf846c51DRC jmp near .nextcolumn 152e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignx 16,7 153a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru%endif 154a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.columnDCT: 155a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 156e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 157e5eaf37440b8e337ab150c017df7c03faf846c51DRC 158e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 159e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 160e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 161e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 162e5eaf37440b8e337ab150c017df7c03faf846c51DRC 163e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) 164e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm0 ; mm0=(00 00 01 01) 165e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) 166e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm1,mm1 ; mm1=(20 20 21 21) 167e5eaf37440b8e337ab150c017df7c03faf846c51DRC 168e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) 
169e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) 170e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) 171e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) 172e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) 173e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) 174e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) 175e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) 176e5eaf37440b8e337ab150c017df7c03faf846c51DRC 177e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) 178e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm2,mm2 ; mm2=(40 40 41 41) 179e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) 180e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm3,mm3 ; mm3=(60 60 61 61) 181e5eaf37440b8e337ab150c017df7c03faf846c51DRC 182e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) 183e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) 184e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) 185e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) 186e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) 187e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) 188e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) 189e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) 190e5eaf37440b8e337ab150c017df7c03faf846c51DRC 191e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) 192e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps 
xmm1,xmm5 ; xmm1=in2=(20 21 22 23) 193e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 194e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 195e5eaf37440b8e337ab150c017df7c03faf846c51DRC 196e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43) 197e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) 198e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 199e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 200e5eaf37440b8e337ab150c017df7c03faf846c51DRC 201e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm0 202e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm1 203e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm2 ; xmm0=tmp11 204e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm1,xmm3 205e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp10 206e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=tmp13 207e5eaf37440b8e337ab150c017df7c03faf846c51DRC 208e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1,[GOTOFF(ebx,PD_1_414)] 209e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm1,xmm5 ; xmm1=tmp12 210e5eaf37440b8e337ab150c017df7c03faf846c51DRC 211e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,xmm4 212e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7,xmm0 213e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm5 ; xmm4=tmp3 214e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm1 ; xmm0=tmp2 215e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm5 ; xmm6=tmp0 216e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm1 ; xmm7=tmp1 217e5eaf37440b8e337ab150c017df7c03faf846c51DRC 218e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(1)], xmm4 ; tmp3 
219e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(0)], xmm0 ; tmp2 220e5eaf37440b8e337ab150c017df7c03faf846c51DRC 221e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 222e5eaf37440b8e337ab150c017df7c03faf846c51DRC 223e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 224e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 225e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 226e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 227e5eaf37440b8e337ab150c017df7c03faf846c51DRC 228e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) 229e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm4,mm4 ; mm4=(10 10 11 11) 230e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) 231e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm0 ; mm0=(30 30 31 31) 232e5eaf37440b8e337ab150c017df7c03faf846c51DRC 233e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) 234e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) 235e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) 236e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) 237e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) 238e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) 239e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) 240e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) 241e5eaf37440b8e337ab150c017df7c03faf846c51DRC 242e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) 243e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm5,mm5 ; mm5=(50 50 
51 51) 244e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) 245e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm1,mm1 ; mm1=(70 70 71 71) 246e5eaf37440b8e337ab150c017df7c03faf846c51DRC 247e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) 248e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) 249e5eaf37440b8e337ab150c017df7c03faf846c51DRC 250e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) 251e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) 252e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) 253e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) 254e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) 255e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) 256e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **) 257e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) 258e5eaf37440b8e337ab150c017df7c03faf846c51DRC 259e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 260e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 261e5eaf37440b8e337ab150c017df7c03faf846c51DRC 262e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) 263e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) 264e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 265e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 266e5eaf37440b8e337ab150c017df7c03faf846c51DRC 267e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps 
xmm4,xmm2 268e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm5 269e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm2,xmm1 ; xmm2=z11 270e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=z13 271e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm1 ; xmm4=z12 272e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=z10 273e5eaf37440b8e337ab150c017df7c03faf846c51DRC 274e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm2 275e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm5 276e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm1,xmm5 ; xmm1=tmp7 277e5eaf37440b8e337ab150c017df7c03faf846c51DRC 278e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 279e5eaf37440b8e337ab150c017df7c03faf846c51DRC 280e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 281e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm0,xmm4 282e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 283e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 284e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 285e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm3,xmm0 ; xmm3=tmp12 286e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm0 ; xmm4=tmp10 287e5eaf37440b8e337ab150c017df7c03faf846c51DRC 288e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 289e5eaf37440b8e337ab150c017df7c03faf846c51DRC 290e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm3,xmm1 ; xmm3=tmp6 291e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm6 292e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm7 293e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) 294e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) 295e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps 
xmm5,xmm1 ; xmm5=data7=(70 71 72 73) 296e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) 297e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm3 ; xmm2=tmp5 298e5eaf37440b8e337ab150c017df7c03faf846c51DRC 299e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm6 ; transpose coefficients(phase 1) 300e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) 301e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) 302e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 ; transpose coefficients(phase 1) 303e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) 304e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) 305e5eaf37440b8e337ab150c017df7c03faf846c51DRC 306e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 307e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 308e5eaf37440b8e337ab150c017df7c03faf846c51DRC 309e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) 310e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) 311e5eaf37440b8e337ab150c017df7c03faf846c51DRC 312e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp4 313e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm7 314e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm5 315e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) 316e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) 317e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) 318e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) 319e5eaf37440b8e337ab150c017df7c03faf846c51DRC 320e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2,xmm7 ; transpose 
coefficients(phase 1) 321e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) 322e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) 323e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm5 ; transpose coefficients(phase 1) 324e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) 325e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) 326e5eaf37440b8e337ab150c017df7c03faf846c51DRC 327e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm6 ; transpose coefficients(phase 2) 328e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) 329e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) 330e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm1 ; transpose coefficients(phase 2) 331e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) 332e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) 333e5eaf37440b8e337ab150c017df7c03faf846c51DRC 334e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) 335e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) 336e5eaf37440b8e337ab150c017df7c03faf846c51DRC 337e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 338e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 339e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 340e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 341e5eaf37440b8e337ab150c017df7c03faf846c51DRC 342e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,xmm5 ; transpose coefficients(phase 2) 343e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm5,xmm7 ; xmm5=(40 
50 60 70) 344e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) 345e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm4 ; transpose coefficients(phase 2) 346e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) 347e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) 348e5eaf37440b8e337ab150c017df7c03faf846c51DRC 349e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 350e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 351e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 352e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 353a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 354a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.nextcolumn: 355e5eaf37440b8e337ab150c017df7c03faf846c51DRC add esi, byte 4*SIZEOF_JCOEF ; coef_block 356e5eaf37440b8e337ab150c017df7c03faf846c51DRC add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr 357e5eaf37440b8e337ab150c017df7c03faf846c51DRC add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr 358e5eaf37440b8e337ab150c017df7c03faf846c51DRC dec ecx ; ctr 359e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .columnloop 360e5eaf37440b8e337ab150c017df7c03faf846c51DRC 361e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Prefetch the next coefficient block 362e5eaf37440b8e337ab150c017df7c03faf846c51DRC 363e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] 364e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] 365e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] 366e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] 
367e5eaf37440b8e337ab150c017df7c03faf846c51DRC 368e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 2: process rows from work array, store into output array. 369e5eaf37440b8e337ab150c017df7c03faf846c51DRC 370e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, [original_ebp] 3716eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis lea esi, [workspace] ; FAST_FLOAT *wsptr 372e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) 373e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, JDIMENSION [output_col(eax)] 374e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ecx, DCTSIZE/4 ; ctr 375e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignx 16,7 376a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.rowloop: 377a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 378e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 379e5eaf37440b8e337ab150c017df7c03faf846c51DRC 380e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] 381e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] 382e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] 383e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] 384e5eaf37440b8e337ab150c017df7c03faf846c51DRC 385e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm0 386e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm1 387e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm2 ; xmm0=tmp11 388e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm1,xmm3 389e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp10 390e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=tmp13 391e5eaf37440b8e337ab150c017df7c03faf846c51DRC 392e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1,[GOTOFF(ebx,PD_1_414)] 393e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
subps xmm1,xmm5 ; xmm1=tmp12 394e5eaf37440b8e337ab150c017df7c03faf846c51DRC 395e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,xmm4 396e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7,xmm0 397e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm5 ; xmm4=tmp3 398e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm1 ; xmm0=tmp2 399e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm5 ; xmm6=tmp0 400e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm1 ; xmm7=tmp1 401e5eaf37440b8e337ab150c017df7c03faf846c51DRC 402e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(1)], xmm4 ; tmp3 403e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(0)], xmm0 ; tmp2 404e5eaf37440b8e337ab150c017df7c03faf846c51DRC 405e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 406e5eaf37440b8e337ab150c017df7c03faf846c51DRC 407e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] 408e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] 409e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] 410e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] 411e5eaf37440b8e337ab150c017df7c03faf846c51DRC 412e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm2 413e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm5 414e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm2,xmm1 ; xmm2=z11 415e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=z13 416e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm1 ; xmm4=z12 417e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=z10 418e5eaf37440b8e337ab150c017df7c03faf846c51DRC 419e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm2 420e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm5 421e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps 
xmm1,xmm5 ; xmm1=tmp7 422e5eaf37440b8e337ab150c017df7c03faf846c51DRC 423e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 424e5eaf37440b8e337ab150c017df7c03faf846c51DRC 425e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 426e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm0,xmm4 427e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 428e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 429e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 430e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm3,xmm0 ; xmm3=tmp12 431e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm0 ; xmm4=tmp10 432e5eaf37440b8e337ab150c017df7c03faf846c51DRC 433e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 434e5eaf37440b8e337ab150c017df7c03faf846c51DRC 435e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm3,xmm1 ; xmm3=tmp6 436e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm6 437e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm7 438e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) 439e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) 440e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) 441e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) 442e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm3 ; xmm2=tmp5 443e5eaf37440b8e337ab150c017df7c03faf846c51DRC 444e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] 445e5eaf37440b8e337ab150c017df7c03faf846c51DRC 446e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm6,xmm1 ; descale(1/8) 447e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm7,xmm1 ; descale(1/8) 448e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
mulps xmm5,xmm1 ; descale(1/8) 449e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,xmm1 ; descale(1/8) 450e5eaf37440b8e337ab150c017df7c03faf846c51DRC 451e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm3,xmm6 452e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm1,xmm7 453e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) 454e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) 455e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) 456e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) 457e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) 458e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) 459e5eaf37440b8e337ab150c017df7c03faf846c51DRC 460e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm6,xmm5 461e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm7,xmm0 462e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) 463e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) 464e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) 465e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) 466e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) 467e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) 468e5eaf37440b8e337ab150c017df7c03faf846c51DRC 469e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) 470e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) 471e5eaf37440b8e337ab150c017df7c03faf846c51DRC 472e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3, XMMWORD 
[wk(0)] ; xmm3=tmp2 473e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 474e5eaf37440b8e337ab150c017df7c03faf846c51DRC 475e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] 476e5eaf37440b8e337ab150c017df7c03faf846c51DRC 477e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp4 478e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm3 479e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm1 480e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) 481e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) 482e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) 483e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) 484e5eaf37440b8e337ab150c017df7c03faf846c51DRC 485e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3,xmm6 ; descale(1/8) 486e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1,xmm6 ; descale(1/8) 487e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm5,xmm6 ; descale(1/8) 488e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,xmm6 ; descale(1/8) 489e5eaf37440b8e337ab150c017df7c03faf846c51DRC 490e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm7,xmm3 491e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm2,xmm1 492e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) 493e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) 494e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) 495e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) 496e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) 497e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm3,mm7 ; mm3=data4=(04 14 24 34) 
498e5eaf37440b8e337ab150c017df7c03faf846c51DRC 499e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm4,xmm5 500e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm6,xmm0 501e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) 502e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) 503e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) 504e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) 505e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) 506e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) 507e5eaf37440b8e337ab150c017df7c03faf846c51DRC 508e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] 509e5eaf37440b8e337ab150c017df7c03faf846c51DRC 510e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) 511e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) 512e5eaf37440b8e337ab150c017df7c03faf846c51DRC 513e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm0,mm6 514e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm1,mm6 515e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm2,mm6 516e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm4,mm6 517e5eaf37440b8e337ab150c017df7c03faf846c51DRC 518e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm7,mm0 ; transpose coefficients(phase 1) 519e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) 520e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) 521e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm3,mm2 ; transpose coefficients(phase 1) 522e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) 
523e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) 524e5eaf37440b8e337ab150c017df7c03faf846c51DRC 525e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm5,mm0 ; transpose coefficients(phase 2) 526e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) 527e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) 528e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm6,mm3 ; transpose coefficients(phase 2) 529e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) 530e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) 531e5eaf37440b8e337ab150c017df7c03faf846c51DRC 532e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1,mm0 ; transpose coefficients(phase 3) 533e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) 534e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) 535e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm4,mm5 ; transpose coefficients(phase 3) 536e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) 537e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) 538e5eaf37440b8e337ab150c017df7c03faf846c51DRC 539e5eaf37440b8e337ab150c017df7c03faf846c51DRC pushpic ebx ; save GOT address 540e5eaf37440b8e337ab150c017df7c03faf846c51DRC 541e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] 542e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] 543e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 544e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 545e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] 
546e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] 547e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 548e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 549e5eaf37440b8e337ab150c017df7c03faf846c51DRC 550e5eaf37440b8e337ab150c017df7c03faf846c51DRC poppic ebx ; restore GOT address 551e5eaf37440b8e337ab150c017df7c03faf846c51DRC 552e5eaf37440b8e337ab150c017df7c03faf846c51DRC add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr 553e5eaf37440b8e337ab150c017df7c03faf846c51DRC add edi, byte 4*SIZEOF_JSAMPROW 554e5eaf37440b8e337ab150c017df7c03faf846c51DRC dec ecx ; ctr 555e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .rowloop 556e5eaf37440b8e337ab150c017df7c03faf846c51DRC 557e5eaf37440b8e337ab150c017df7c03faf846c51DRC emms ; empty MMX state 558e5eaf37440b8e337ab150c017df7c03faf846c51DRC 559e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop edi 560e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop esi 561e5eaf37440b8e337ab150c017df7c03faf846c51DRC; pop edx ; need not be preserved 562e5eaf37440b8e337ab150c017df7c03faf846c51DRC; pop ecx ; need not be preserved 563e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop ebx 564e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov esp,ebp ; esp <- aligned ebp 565e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop esp ; esp <- original ebp 566e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop ebp 567e5eaf37440b8e337ab150c017df7c03faf846c51DRC ret 568a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 569132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; For some reason, the OS X linker does not honor the request to align the 570132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; segment unless we do this. 571e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 572