;
; jidctflt.asm - floating-point IDCT (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; unpcklps2 dst, src
;   Merge the low halves of two 4-float vectors into dst.
;   With lanes listed from lowest to highest:
;       dst=(0 1 2 3), src=(4 5 6 7)  =>  dst=(0 1 4 5)
;   The shufps selector 0x44 (01 00 01 00b) takes result lanes 0-1 from
;   dst lanes 0,1 and result lanes 2-3 from src lanes 0,1.
;   Only the first operand is written.
%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
        shufps  %1,%2,0x44
%endmacro

; unpckhps2 dst, src
;   Merge the high halves of two 4-float vectors into dst.
;   With lanes listed from lowest to highest:
;       dst=(0 1 2 3), src=(4 5 6 7)  =>  dst=(2 3 6 7)
;   The shufps selector 0xEE (11 10 11 10b) takes result lanes 0-1 from
;   dst lanes 2,3 and result lanes 2-3 from src lanes 2,3.
;   Only the first operand is written.
%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
        shufps  %1,%2,0xEE
%endmacro

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_float_sse)

; Start of the constant table exported under the jconst_idct_float_sse
; symbol.  Each "times 4 dd" entry repeats one single-precision value four
; times (16 bytes), i.e. one value per lane of an XMM register.
EXTN(jconst_idct_float_sse):

PD_1_414        times 4 dd  1.414213562373095048801689
PD_1_847        times 4 dd  1.847759065022573512256366
46e5eaf37440b8e337ab150c017df7c03faf846c51DRCPD_1_082 times 4 dd 1.082392200292393968799446 47e5eaf37440b8e337ab150c017df7c03faf846c51DRCPD_M2_613 times 4 dd -2.613125929752753055713286 48e5eaf37440b8e337ab150c017df7c03faf846c51DRCPD_0_125 times 4 dd 0.125 ; 1/8 49e5eaf37440b8e337ab150c017df7c03faf846c51DRCPB_CENTERJSAMP times 8 db CENTERJSAMPLE 50a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 51e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignz 16 52a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 53a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru; -------------------------------------------------------------------------- 54e5eaf37440b8e337ab150c017df7c03faf846c51DRC SECTION SEG_TEXT 55e5eaf37440b8e337ab150c017df7c03faf846c51DRC BITS 32 56a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru; 57a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru; Perform dequantization and inverse DCT on one block of coefficients. 58a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru; 59a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru; GLOBAL(void) 60018fc42974f125bb8791eb81390137c562d15693Pierre Ossman; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block, 61018fc42974f125bb8791eb81390137c562d15693Pierre Ossman; JSAMPARRAY output_buf, JDIMENSION output_col) 62a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru; 63a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 64e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define dct_table(b) (b)+8 ; void * dct_table 65e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define coef_block(b) (b)+12 ; JCOEFPTR coef_block 66e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf 67e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define output_col(b) (b)+20 ; JDIMENSION output_col 68a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 69e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define original_ebp ebp+0 70e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define wk(i) 
ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] 71e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define WK_NUM 2 72e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT 73e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; FAST_FLOAT workspace[DCTSIZE2] 74a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 75e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 76e5eaf37440b8e337ab150c017df7c03faf846c51DRC global EXTN(jsimd_idct_float_sse) 77a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 78018fc42974f125bb8791eb81390137c562d15693Pierre OssmanEXTN(jsimd_idct_float_sse): 79e5eaf37440b8e337ab150c017df7c03faf846c51DRC push ebp 80e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax,esp ; eax = original ebp 81e5eaf37440b8e337ab150c017df7c03faf846c51DRC sub esp, byte 4 82e5eaf37440b8e337ab150c017df7c03faf846c51DRC and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 83e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov [esp],eax 84e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ebp,esp ; ebp = aligned ebp 85e5eaf37440b8e337ab150c017df7c03faf846c51DRC lea esp, [workspace] 86e5eaf37440b8e337ab150c017df7c03faf846c51DRC push ebx 87e5eaf37440b8e337ab150c017df7c03faf846c51DRC; push ecx ; need not be preserved 88e5eaf37440b8e337ab150c017df7c03faf846c51DRC; push edx ; need not be preserved 89e5eaf37440b8e337ab150c017df7c03faf846c51DRC push esi 90e5eaf37440b8e337ab150c017df7c03faf846c51DRC push edi 91e5eaf37440b8e337ab150c017df7c03faf846c51DRC 92e5eaf37440b8e337ab150c017df7c03faf846c51DRC get_GOT ebx ; get GOT address 93e5eaf37440b8e337ab150c017df7c03faf846c51DRC 94e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 1: process columns from input, store into work array. 
95e5eaf37440b8e337ab150c017df7c03faf846c51DRC 96e5eaf37440b8e337ab150c017df7c03faf846c51DRC; mov eax, [original_ebp] 97e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edx, POINTER [dct_table(eax)] ; quantptr 98e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov esi, JCOEFPTR [coef_block(eax)] ; inptr 99e5eaf37440b8e337ab150c017df7c03faf846c51DRC lea edi, [workspace] ; FAST_FLOAT * wsptr 100e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ecx, DCTSIZE/4 ; ctr 101e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignx 16,7 102a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.columnloop: 103a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE 104e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] 105e5eaf37440b8e337ab150c017df7c03faf846c51DRC or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] 106e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .columnDCT 107e5eaf37440b8e337ab150c017df7c03faf846c51DRC 108e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 109e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 110e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 111e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 112e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 113e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 114e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 115e5eaf37440b8e337ab150c017df7c03faf846c51DRC por mm1,mm0 116e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm1,mm1 117e5eaf37440b8e337ab150c017df7c03faf846c51DRC movd eax,mm1 118e5eaf37440b8e337ab150c017df7c03faf846c51DRC test eax,eax 119e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz short .columnDCT 
120e5eaf37440b8e337ab150c017df7c03faf846c51DRC 121e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- AC terms all zero 122e5eaf37440b8e337ab150c017df7c03faf846c51DRC 123e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 124e5eaf37440b8e337ab150c017df7c03faf846c51DRC 125e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) 126e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm0 ; mm0=(00 00 01 01) 127e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) 128e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) 129e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) 130e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) 131e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) 132e5eaf37440b8e337ab150c017df7c03faf846c51DRC 133e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 134e5eaf37440b8e337ab150c017df7c03faf846c51DRC 135e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm0 136e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2,xmm0 137e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 138e5eaf37440b8e337ab150c017df7c03faf846c51DRC 139e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) 140e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) 141e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) 142e5eaf37440b8e337ab150c017df7c03faf846c51DRC shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) 143e5eaf37440b8e337ab150c017df7c03faf846c51DRC 144e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 145e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 
146e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 147e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 148e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 149e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 150e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 151e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 152e5eaf37440b8e337ab150c017df7c03faf846c51DRC jmp near .nextcolumn 153e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignx 16,7 154a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru%endif 155a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.columnDCT: 156a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 157e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 158e5eaf37440b8e337ab150c017df7c03faf846c51DRC 159e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] 160e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] 161e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] 162e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] 163e5eaf37440b8e337ab150c017df7c03faf846c51DRC 164e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) 165e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm0 ; mm0=(00 00 01 01) 166e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) 167e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm1,mm1 ; mm1=(20 20 21 21) 168e5eaf37440b8e337ab150c017df7c03faf846c51DRC 169e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) 
170e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) 171e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) 172e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) 173e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) 174e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) 175e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) 176e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) 177e5eaf37440b8e337ab150c017df7c03faf846c51DRC 178e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) 179e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm2,mm2 ; mm2=(40 40 41 41) 180e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) 181e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm3,mm3 ; mm3=(60 60 61 61) 182e5eaf37440b8e337ab150c017df7c03faf846c51DRC 183e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) 184e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) 185e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) 186e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) 187e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) 188e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) 189e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) 190e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) 191e5eaf37440b8e337ab150c017df7c03faf846c51DRC 192e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) 193e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps 
xmm1,xmm5 ; xmm1=in2=(20 21 22 23) 194e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 195e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 196e5eaf37440b8e337ab150c017df7c03faf846c51DRC 197e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43) 198e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) 199e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 200e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 201e5eaf37440b8e337ab150c017df7c03faf846c51DRC 202e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm0 203e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm1 204e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm2 ; xmm0=tmp11 205e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm1,xmm3 206e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp10 207e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=tmp13 208e5eaf37440b8e337ab150c017df7c03faf846c51DRC 209e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1,[GOTOFF(ebx,PD_1_414)] 210e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm1,xmm5 ; xmm1=tmp12 211e5eaf37440b8e337ab150c017df7c03faf846c51DRC 212e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,xmm4 213e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7,xmm0 214e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm5 ; xmm4=tmp3 215e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm1 ; xmm0=tmp2 216e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm5 ; xmm6=tmp0 217e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm1 ; xmm7=tmp1 218e5eaf37440b8e337ab150c017df7c03faf846c51DRC 219e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(1)], xmm4 ; tmp3 
220e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(0)], xmm0 ; tmp2 221e5eaf37440b8e337ab150c017df7c03faf846c51DRC 222e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 223e5eaf37440b8e337ab150c017df7c03faf846c51DRC 224e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] 225e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] 226e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] 227e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] 228e5eaf37440b8e337ab150c017df7c03faf846c51DRC 229e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) 230e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm4,mm4 ; mm4=(10 10 11 11) 231e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) 232e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm0 ; mm0=(30 30 31 31) 233e5eaf37440b8e337ab150c017df7c03faf846c51DRC 234e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) 235e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) 236e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) 237e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) 238e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) 239e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) 240e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) 241e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) 242e5eaf37440b8e337ab150c017df7c03faf846c51DRC 243e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) 244e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm5,mm5 ; mm5=(50 50 
51 51) 245e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) 246e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm1,mm1 ; mm1=(70 70 71 71) 247e5eaf37440b8e337ab150c017df7c03faf846c51DRC 248e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) 249e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) 250e5eaf37440b8e337ab150c017df7c03faf846c51DRC 251e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) 252e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) 253e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) 254e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) 255e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) 256e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) 257e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **) 258e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) 259e5eaf37440b8e337ab150c017df7c03faf846c51DRC 260e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 261e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 262e5eaf37440b8e337ab150c017df7c03faf846c51DRC 263e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) 264e5eaf37440b8e337ab150c017df7c03faf846c51DRC movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) 265e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 266e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] 267e5eaf37440b8e337ab150c017df7c03faf846c51DRC 268e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps 
xmm4,xmm2 269e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm5 270e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm2,xmm1 ; xmm2=z11 271e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=z13 272e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm1 ; xmm4=z12 273e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=z10 274e5eaf37440b8e337ab150c017df7c03faf846c51DRC 275e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm2 276e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm5 277e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm1,xmm5 ; xmm1=tmp7 278e5eaf37440b8e337ab150c017df7c03faf846c51DRC 279e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 280e5eaf37440b8e337ab150c017df7c03faf846c51DRC 281e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 282e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm0,xmm4 283e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 284e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 285e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 286e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm3,xmm0 ; xmm3=tmp12 287e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm0 ; xmm4=tmp10 288e5eaf37440b8e337ab150c017df7c03faf846c51DRC 289e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 290e5eaf37440b8e337ab150c017df7c03faf846c51DRC 291e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm3,xmm1 ; xmm3=tmp6 292e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm6 293e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm7 294e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) 295e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) 296e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps 
xmm5,xmm1 ; xmm5=data7=(70 71 72 73) 297e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) 298e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm3 ; xmm2=tmp5 299e5eaf37440b8e337ab150c017df7c03faf846c51DRC 300e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm6 ; transpose coefficients(phase 1) 301e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) 302e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) 303e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 ; transpose coefficients(phase 1) 304e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) 305e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) 306e5eaf37440b8e337ab150c017df7c03faf846c51DRC 307e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 308e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 309e5eaf37440b8e337ab150c017df7c03faf846c51DRC 310e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) 311e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) 312e5eaf37440b8e337ab150c017df7c03faf846c51DRC 313e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp4 314e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm7 315e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm5 316e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) 317e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) 318e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) 319e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) 320e5eaf37440b8e337ab150c017df7c03faf846c51DRC 321e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2,xmm7 ; transpose 
coefficients(phase 1) 322e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) 323e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) 324e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm5 ; transpose coefficients(phase 1) 325e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) 326e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) 327e5eaf37440b8e337ab150c017df7c03faf846c51DRC 328e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm6 ; transpose coefficients(phase 2) 329e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) 330e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) 331e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm1 ; transpose coefficients(phase 2) 332e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) 333e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) 334e5eaf37440b8e337ab150c017df7c03faf846c51DRC 335e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) 336e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) 337e5eaf37440b8e337ab150c017df7c03faf846c51DRC 338e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 339e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 340e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 341e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 342e5eaf37440b8e337ab150c017df7c03faf846c51DRC 343e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,xmm5 ; transpose coefficients(phase 2) 344e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm5,xmm7 ; xmm5=(40 
50 60 70) 345e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) 346e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm4 ; transpose coefficients(phase 2) 347e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) 348e5eaf37440b8e337ab150c017df7c03faf846c51DRC unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) 349e5eaf37440b8e337ab150c017df7c03faf846c51DRC 350e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 351e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 352e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 353e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 354a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 355a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.nextcolumn: 356e5eaf37440b8e337ab150c017df7c03faf846c51DRC add esi, byte 4*SIZEOF_JCOEF ; coef_block 357e5eaf37440b8e337ab150c017df7c03faf846c51DRC add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr 358e5eaf37440b8e337ab150c017df7c03faf846c51DRC add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr 359e5eaf37440b8e337ab150c017df7c03faf846c51DRC dec ecx ; ctr 360e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .columnloop 361e5eaf37440b8e337ab150c017df7c03faf846c51DRC 362e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Prefetch the next coefficient block 363e5eaf37440b8e337ab150c017df7c03faf846c51DRC 364e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] 365e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] 366e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] 367e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] 
368e5eaf37440b8e337ab150c017df7c03faf846c51DRC 369e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 2: process rows from work array, store into output array. 370e5eaf37440b8e337ab150c017df7c03faf846c51DRC 371e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, [original_ebp] 372e5eaf37440b8e337ab150c017df7c03faf846c51DRC lea esi, [workspace] ; FAST_FLOAT * wsptr 373e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) 374e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, JDIMENSION [output_col(eax)] 375e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ecx, DCTSIZE/4 ; ctr 376e5eaf37440b8e337ab150c017df7c03faf846c51DRC alignx 16,7 377a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru.rowloop: 378a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 379e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 380e5eaf37440b8e337ab150c017df7c03faf846c51DRC 381e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] 382e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] 383e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] 384e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] 385e5eaf37440b8e337ab150c017df7c03faf846c51DRC 386e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm0 387e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm1 388e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm2 ; xmm0=tmp11 389e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm1,xmm3 390e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp10 391e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=tmp13 392e5eaf37440b8e337ab150c017df7c03faf846c51DRC 393e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1,[GOTOFF(ebx,PD_1_414)] 394e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps 
xmm1,xmm5 ; xmm1=tmp12 395e5eaf37440b8e337ab150c017df7c03faf846c51DRC 396e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,xmm4 397e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm7,xmm0 398e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm5 ; xmm4=tmp3 399e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm1 ; xmm0=tmp2 400e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm5 ; xmm6=tmp0 401e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm1 ; xmm7=tmp1 402e5eaf37440b8e337ab150c017df7c03faf846c51DRC 403e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(1)], xmm4 ; tmp3 404e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps XMMWORD [wk(0)], xmm0 ; tmp2 405e5eaf37440b8e337ab150c017df7c03faf846c51DRC 406e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 407e5eaf37440b8e337ab150c017df7c03faf846c51DRC 408e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] 409e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] 410e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] 411e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] 412e5eaf37440b8e337ab150c017df7c03faf846c51DRC 413e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm4,xmm2 414e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm5 415e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm2,xmm1 ; xmm2=z11 416e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm5,xmm3 ; xmm5=z13 417e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm1 ; xmm4=z12 418e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=z10 419e5eaf37440b8e337ab150c017df7c03faf846c51DRC 420e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,xmm2 421e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm5 422e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm1,xmm5 ; 
xmm1=tmp7 423e5eaf37440b8e337ab150c017df7c03faf846c51DRC 424e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 425e5eaf37440b8e337ab150c017df7c03faf846c51DRC 426e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3,xmm0 427e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm0,xmm4 428e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 429e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) 430e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) 431e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm3,xmm0 ; xmm3=tmp12 432e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm4,xmm0 ; xmm4=tmp10 433e5eaf37440b8e337ab150c017df7c03faf846c51DRC 434e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 435e5eaf37440b8e337ab150c017df7c03faf846c51DRC 436e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm3,xmm1 ; xmm3=tmp6 437e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm6 438e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm7 439e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) 440e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) 441e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) 442e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) 443e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm2,xmm3 ; xmm2=tmp5 444e5eaf37440b8e337ab150c017df7c03faf846c51DRC 445e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] 446e5eaf37440b8e337ab150c017df7c03faf846c51DRC 447e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm6,xmm1 ; descale(1/8) 448e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm7,xmm1 ; descale(1/8) 449e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps 
xmm5,xmm1 ; descale(1/8) 450e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,xmm1 ; descale(1/8) 451e5eaf37440b8e337ab150c017df7c03faf846c51DRC 452e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm3,xmm6 453e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm1,xmm7 454e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) 455e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) 456e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) 457e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) 458e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) 459e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) 460e5eaf37440b8e337ab150c017df7c03faf846c51DRC 461e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm6,xmm5 462e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm7,xmm0 463e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) 464e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) 465e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) 466e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) 467e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) 468e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) 469e5eaf37440b8e337ab150c017df7c03faf846c51DRC 470e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) 471e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) 472e5eaf37440b8e337ab150c017df7c03faf846c51DRC 473e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm3, XMMWORD [wk(0)] ; 
xmm3=tmp2 474e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 475e5eaf37440b8e337ab150c017df7c03faf846c51DRC 476e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] 477e5eaf37440b8e337ab150c017df7c03faf846c51DRC 478e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm4,xmm2 ; xmm4=tmp4 479e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm5,xmm3 480e5eaf37440b8e337ab150c017df7c03faf846c51DRC movaps xmm0,xmm1 481e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) 482e5eaf37440b8e337ab150c017df7c03faf846c51DRC addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) 483e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) 484e5eaf37440b8e337ab150c017df7c03faf846c51DRC subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) 485e5eaf37440b8e337ab150c017df7c03faf846c51DRC 486e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm3,xmm6 ; descale(1/8) 487e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm1,xmm6 ; descale(1/8) 488e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm5,xmm6 ; descale(1/8) 489e5eaf37440b8e337ab150c017df7c03faf846c51DRC mulps xmm0,xmm6 ; descale(1/8) 490e5eaf37440b8e337ab150c017df7c03faf846c51DRC 491e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm7,xmm3 492e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm2,xmm1 493e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) 494e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) 495e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) 496e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) 497e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) 498e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm3,mm7 ; mm3=data4=(04 14 24 34) 
499e5eaf37440b8e337ab150c017df7c03faf846c51DRC 500e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm4,xmm5 501e5eaf37440b8e337ab150c017df7c03faf846c51DRC movhlps xmm6,xmm0 502e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) 503e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) 504e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) 505e5eaf37440b8e337ab150c017df7c03faf846c51DRC cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) 506e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) 507e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) 508e5eaf37440b8e337ab150c017df7c03faf846c51DRC 509e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] 510e5eaf37440b8e337ab150c017df7c03faf846c51DRC 511e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) 512e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) 513e5eaf37440b8e337ab150c017df7c03faf846c51DRC 514e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm0,mm6 515e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm1,mm6 516e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm2,mm6 517e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb mm4,mm6 518e5eaf37440b8e337ab150c017df7c03faf846c51DRC 519e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm7,mm0 ; transpose coefficients(phase 1) 520e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) 521e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) 522e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm3,mm2 ; transpose coefficients(phase 1) 523e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) 
524e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) 525e5eaf37440b8e337ab150c017df7c03faf846c51DRC 526e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm5,mm0 ; transpose coefficients(phase 2) 527e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) 528e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) 529e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm6,mm3 ; transpose coefficients(phase 2) 530e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) 531e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) 532e5eaf37440b8e337ab150c017df7c03faf846c51DRC 533e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm1,mm0 ; transpose coefficients(phase 3) 534e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) 535e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) 536e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq mm4,mm5 ; transpose coefficients(phase 3) 537e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) 538e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) 539e5eaf37440b8e337ab150c017df7c03faf846c51DRC 540e5eaf37440b8e337ab150c017df7c03faf846c51DRC pushpic ebx ; save GOT address 541e5eaf37440b8e337ab150c017df7c03faf846c51DRC 542e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] 543e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] 544e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 545e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 546e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] 
547e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] 548e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 549e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 550e5eaf37440b8e337ab150c017df7c03faf846c51DRC 551e5eaf37440b8e337ab150c017df7c03faf846c51DRC poppic ebx ; restore GOT address 552e5eaf37440b8e337ab150c017df7c03faf846c51DRC 553e5eaf37440b8e337ab150c017df7c03faf846c51DRC add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr 554e5eaf37440b8e337ab150c017df7c03faf846c51DRC add edi, byte 4*SIZEOF_JSAMPROW 555e5eaf37440b8e337ab150c017df7c03faf846c51DRC dec ecx ; ctr 556e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .rowloop 557e5eaf37440b8e337ab150c017df7c03faf846c51DRC 558e5eaf37440b8e337ab150c017df7c03faf846c51DRC emms ; empty MMX state 559e5eaf37440b8e337ab150c017df7c03faf846c51DRC 560e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop edi 561e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop esi 562e5eaf37440b8e337ab150c017df7c03faf846c51DRC; pop edx ; need not be preserved 563e5eaf37440b8e337ab150c017df7c03faf846c51DRC; pop ecx ; need not be preserved 564e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop ebx 565e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov esp,ebp ; esp <- aligned ebp 566e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop esp ; esp <- original ebp 567e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop ebp 568e5eaf37440b8e337ab150c017df7c03faf846c51DRC ret 569a2e6a9dd47eb10c701a42a16f305ded1a02cd886MIYASAKA Masaru 570132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; For some reason, the OS X linker does not honor the request to align the 571132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; segment unless we do this. 572e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 573