;
; jidctint.asm - accurate integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slow-but-accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform).  The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Fixed-point configuration.
;
;   CONST_BITS  number of fractional bits carried by the F_* multipliers
;               (each F_x below is x * 2^CONST_BITS, rounded to nearest).
;   PASS1_BITS  left shift applied to DC-only columns in pass 1
;               (psllw ...,PASS1_BITS); also enters both descale amounts.
;   DESCALE_P1  right shift used on the pass-1 results (psrad ...,DESCALE_P1).
;   DESCALE_P2  defined here for use outside this excerpt
;               (presumably the pass-2 descale -- confirm against pass 2).

%define CONST_BITS      13
%define PASS1_BITS      2

%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)

%if CONST_BITS == 13
; Precomputed FIX(x) values for the default CONST_BITS of 13.
F_0_298 equ      2446           ; FIX(0.298631336)
F_0_390 equ      3196           ; FIX(0.390180644)
F_0_541 equ      4433           ; FIX(0.541196100)
F_0_765 equ      6270           ; FIX(0.765366865)
F_0_899 equ      7373           ; FIX(0.899976223)
F_1_175 equ      9633           ; FIX(1.175875602)
F_1_501 equ     12299           ; FIX(1.501321110)
F_1_847 equ     15137           ; FIX(1.847759065)
F_1_961 equ     16069           ; FIX(1.961570560)
F_2_053 equ     16819           ; FIX(2.053119869)
F_2_562 equ     20995           ; FIX(2.562915447)
F_3_072 equ     25172           ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same factors pre-scaled by 2^30; DESCALE()
; rounds them down to CONST_BITS fractional bits using integer math only.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_islow_sse2)

; Constant table for the IDCT routine that follows.
;
; Each PW_* entry is one pair of 16-bit multipliers repeated four times
; (eight words in all).  The routine interleaves two input vectors with
; punpcklwd/punpckhwd and applies pmaddwd against one of these entries, so
; every 32-bit result lane is  a*first + b*second  for one (a,b) input pair.
; The names spell out the two factors: F = positive, MF = negative.
;
; PD_DESCALE_P1 is the rounding term added (paddd) before the
; psrad ...,DESCALE_P1 in pass 1.  PD_DESCALE_P2 and PB_CENTERJSAMP are not
; referenced in the part of the routine visible alongside this table
; (presumably pass-2 rounding and sample re-centering -- confirm there).

EXTN(jconst_idct_islow_sse2):

PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Perform dequantization and inverse DCT on one block of coefficients.
93cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; 94cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; GLOBAL(void) 95cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block, 96cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; JSAMPARRAY output_buf, JDIMENSION output_col) 97cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; 98cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 99cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r10 = jpeg_component_info * compptr 100cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r11 = JCOEFPTR coef_block 101cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r12 = JSAMPARRAY output_buf 102cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r13 = JDIMENSION output_col 103cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 104e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define original_rbp rbp+0 105e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] 106e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define WK_NUM 12 107cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 108e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 109e5eaf37440b8e337ab150c017df7c03faf846c51DRC global EXTN(jsimd_idct_islow_sse2) 110cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 111cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRCEXTN(jsimd_idct_islow_sse2): 112e5eaf37440b8e337ab150c017df7c03faf846c51DRC push rbp 113e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rax,rsp ; rax = original rbp 114e5eaf37440b8e337ab150c017df7c03faf846c51DRC sub rsp, byte 4 115e5eaf37440b8e337ab150c017df7c03faf846c51DRC and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 116e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov [rsp],rax 117e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rbp,rsp ; rbp = aligned rbp 118e5eaf37440b8e337ab150c017df7c03faf846c51DRC lea rsp, [wk(0)] 119e5eaf37440b8e337ab150c017df7c03faf846c51DRC collect_args 120cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 121e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 1: process 
columns from input. 122cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 123e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, r10 ; quantptr 124e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, r11 ; inptr 125cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 126cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 127e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] 128e5eaf37440b8e337ab150c017df7c03faf846c51DRC or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] 129e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .columnDCT 130e5eaf37440b8e337ab150c017df7c03faf846c51DRC 131e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] 132e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] 133e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] 134e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] 135e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] 136e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] 137e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] 138e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm1,xmm0 139e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm1,xmm1 140e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm1,xmm1 141e5eaf37440b8e337ab150c017df7c03faf846c51DRC movd eax,xmm1 142e5eaf37440b8e337ab150c017df7c03faf846c51DRC test rax,rax 143e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz short .columnDCT 144e5eaf37440b8e337ab150c017df7c03faf846c51DRC 145e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- AC terms all zero 146e5eaf37440b8e337ab150c017df7c03faf846c51DRC 147e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] 
148e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 149e5eaf37440b8e337ab150c017df7c03faf846c51DRC 150e5eaf37440b8e337ab150c017df7c03faf846c51DRC psllw xmm5,PASS1_BITS 151e5eaf37440b8e337ab150c017df7c03faf846c51DRC 152e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) 153e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) 154e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) 155e5eaf37440b8e337ab150c017df7c03faf846c51DRC 156e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) 157e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) 158e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) 159e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) 160e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) 161e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) 162e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) 163e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) 164e5eaf37440b8e337ab150c017df7c03faf846c51DRC 165e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 166e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 167e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 168e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 169e5eaf37440b8e337ab150c017df7c03faf846c51DRC jmp near .column_end 
170cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC%endif 171cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC.columnDCT: 172cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 173e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 174e5eaf37440b8e337ab150c017df7c03faf846c51DRC 175e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] 176e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] 177e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 178e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 179e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] 180e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] 181e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 182e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 183e5eaf37440b8e337ab150c017df7c03faf846c51DRC 184e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 185e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = (z2 + z3) * 0.541196100; 186e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z1 + z3 * -1.847759065; 187e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z1 + z2 * 0.765366865; 188e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 189e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 190e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); 191e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; 192e5eaf37440b8e337ab150c017df7c03faf846c51DRC 193e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm1 ; xmm1=in2=z2 194e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm1 
195e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm4,xmm3 ; xmm3=in6=z3 196e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm5,xmm3 197e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm4 198e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm5 199e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L 200e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H 201e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L 202e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H 203e5eaf37440b8e337ab150c017df7c03faf846c51DRC 204e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 205e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm0,xmm2 ; xmm0=in0+in4 206e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubw xmm6,xmm2 ; xmm6=in0-in4 207e5eaf37440b8e337ab150c017df7c03faf846c51DRC 208e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm7,xmm7 209e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm2,xmm2 210e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm7,xmm0 ; xmm7=tmp0L 211e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm2,xmm0 ; xmm2=tmp0H 212e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS 213e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS 214e5eaf37440b8e337ab150c017df7c03faf846c51DRC 215e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 216e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm4 ; xmm7=tmp10L 217e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm4 ; xmm0=tmp13L 218e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm2 219e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm5 ; xmm2=tmp10H 220e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm4,xmm5 ; xmm4=tmp13H 
221e5eaf37440b8e337ab150c017df7c03faf846c51DRC 222e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L 223e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H 224e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L 225e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H 226e5eaf37440b8e337ab150c017df7c03faf846c51DRC 227e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm5,xmm5 228e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm7,xmm7 229e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm6 ; xmm5=tmp1L 230e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm6 ; xmm7=tmp1H 231e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS 232e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS 233e5eaf37440b8e337ab150c017df7c03faf846c51DRC 234e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm5 235e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 ; xmm5=tmp11L 236e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm2,xmm1 ; xmm2=tmp12L 237e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 238e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm3 ; xmm7=tmp11H 239e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm3 ; xmm0=tmp12H 240e5eaf37440b8e337ab150c017df7c03faf846c51DRC 241e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L 242e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H 243e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L 244e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H 245e5eaf37440b8e337ab150c017df7c03faf846c51DRC 246e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 
247e5eaf37440b8e337ab150c017df7c03faf846c51DRC 248e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] 249e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] 250e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 251e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 252e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] 253e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] 254e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 255e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 256e5eaf37440b8e337ab150c017df7c03faf846c51DRC 257e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm6 258e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 259e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm5,xmm3 ; xmm5=z3 260e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm7,xmm1 ; xmm7=z4 261e5eaf37440b8e337ab150c017df7c03faf846c51DRC 262e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 263e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z5 = (z3 + z4) * 1.175875602; 264e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; 265e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 += z5; z4 += z5; 266e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 267e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 268e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; 269e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); 270e5eaf37440b8e337ab150c017df7c03faf846c51DRC 271e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
movdqa xmm2,xmm5 272e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 273e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm7 274e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm7 275e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm2 276e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm0 277e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L 278e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H 279e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L 280e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H 281e5eaf37440b8e337ab150c017df7c03faf846c51DRC 282e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L 283e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H 284e5eaf37440b8e337ab150c017df7c03faf846c51DRC 285e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 286e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; 287e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; 288e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; 289e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; 290e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z1 + z3; tmp1 += z2 + z4; 291e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z2 + z3; tmp3 += z1 + z4; 292e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 293e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 294e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; 295e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; 296e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = 
tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); 297e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); 298e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z3; tmp1 += z4; 299e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z3; tmp3 += z4; 300e5eaf37440b8e337ab150c017df7c03faf846c51DRC 301e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm3 302e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm3 303e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm4 304e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm4 305e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm2 306e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm0 307e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L 308e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H 309e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L 310e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H 311e5eaf37440b8e337ab150c017df7c03faf846c51DRC 312e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L 313e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H 314e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm5 ; xmm3=tmp3L 315e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm7 ; xmm4=tmp3H 316e5eaf37440b8e337ab150c017df7c03faf846c51DRC 317e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L 318e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H 319e5eaf37440b8e337ab150c017df7c03faf846c51DRC 320e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm1 321e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 322e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm6 
323e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm6 324e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm2 325e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 326e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L 327e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H 328e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L 329e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H 330e5eaf37440b8e337ab150c017df7c03faf846c51DRC 331e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm5 ; xmm2=tmp1L 332e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm7 ; xmm0=tmp1H 333e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L 334e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H 335e5eaf37440b8e337ab150c017df7c03faf846c51DRC 336e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L 337e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H 338e5eaf37440b8e337ab150c017df7c03faf846c51DRC 339e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 340e5eaf37440b8e337ab150c017df7c03faf846c51DRC 341e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L 342e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H 343e5eaf37440b8e337ab150c017df7c03faf846c51DRC 344e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm5 345e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 346e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm3 ; xmm5=data0L 347e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm4 ; xmm7=data0H 348e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm2,xmm3 ; xmm2=data7L 349e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm4 ; 
xmm0=data7H 350e5eaf37440b8e337ab150c017df7c03faf846c51DRC 351e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] 352e5eaf37440b8e337ab150c017df7c03faf846c51DRC 353e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm3 354e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm3 355e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,DESCALE_P1 356e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P1 357e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm3 358e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm3 359e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,DESCALE_P1 360e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 361e5eaf37440b8e337ab150c017df7c03faf846c51DRC 362e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) 363e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) 364e5eaf37440b8e337ab150c017df7c03faf846c51DRC 365e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L 366e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H 367e5eaf37440b8e337ab150c017df7c03faf846c51DRC 368e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 369e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm3 370e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm1 ; xmm4=data1L 371e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm6 ; xmm3=data1H 372e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm1 ; xmm7=data6L 373e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm6 ; xmm0=data6H 374e5eaf37440b8e337ab150c017df7c03faf846c51DRC 375e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] 376e5eaf37440b8e337ab150c017df7c03faf846c51DRC 377e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm1 
378e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm1 379e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P1 380e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P1 381e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm1 382e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm1 383e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P1 384e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 385e5eaf37440b8e337ab150c017df7c03faf846c51DRC 386e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) 387e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) 388e5eaf37440b8e337ab150c017df7c03faf846c51DRC 389e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm5 ; transpose coefficients(phase 1) 390e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) 391e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) 392e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm7 ; transpose coefficients(phase 1) 393e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) 394e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) 395e5eaf37440b8e337ab150c017df7c03faf846c51DRC 396e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L 397e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H 398e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L 399e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H 400e5eaf37440b8e337ab150c017df7c03faf846c51DRC 401e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) 402e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], 
xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) 403e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) 404e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) 405e5eaf37440b8e337ab150c017df7c03faf846c51DRC 406e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm3 407e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 408e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm4 ; xmm3=data2L 409e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm2 ; xmm0=data2H 410e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm5,xmm4 ; xmm5=data5L 411e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm6,xmm2 ; xmm6=data5H 412e5eaf37440b8e337ab150c017df7c03faf846c51DRC 413e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] 414e5eaf37440b8e337ab150c017df7c03faf846c51DRC 415e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm7 416e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm7 417e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P1 418e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 419e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm7 420e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm7 421e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,DESCALE_P1 422e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm6,DESCALE_P1 423e5eaf37440b8e337ab150c017df7c03faf846c51DRC 424e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) 425e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) 426e5eaf37440b8e337ab150c017df7c03faf846c51DRC 427e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L 428e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H 
429e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L 430e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H 431e5eaf37440b8e337ab150c017df7c03faf846c51DRC 432e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 433e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm4 434e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm2 ; xmm1=data3L 435e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm7 ; xmm4=data3H 436e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm2 ; xmm0=data4L 437e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm6,xmm7 ; xmm6=data4H 438e5eaf37440b8e337ab150c017df7c03faf846c51DRC 439e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] 440e5eaf37440b8e337ab150c017df7c03faf846c51DRC 441e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm2 442e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm2 443e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm1,DESCALE_P1 444e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P1 445e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm2 446e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm2 447e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 448e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm6,DESCALE_P1 449e5eaf37440b8e337ab150c017df7c03faf846c51DRC 450e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) 451e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) 452e5eaf37440b8e337ab150c017df7c03faf846c51DRC 453e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) 454e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) 455e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
456e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm3 ; transpose coefficients(phase 1) 457e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) 458e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) 459e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 ; transpose coefficients(phase 1) 460e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) 461e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) 462e5eaf37440b8e337ab150c017df7c03faf846c51DRC 463e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm7 ; transpose coefficients(phase 2) 464e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) 465e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) 466e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm2 ; transpose coefficients(phase 2) 467e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) 468e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) 469e5eaf37440b8e337ab150c017df7c03faf846c51DRC 470e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) 471e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) 472e5eaf37440b8e337ab150c017df7c03faf846c51DRC 473e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) 474e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) 475e5eaf37440b8e337ab150c017df7c03faf846c51DRC 476e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm0 ; transpose coefficients(phase 2) 477e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm0,xmm3 ; xmm0=(40 50 60 
70 41 51 61 71) 478e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) 479e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm6 ; transpose coefficients(phase 2) 480e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) 481e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) 482e5eaf37440b8e337ab150c017df7c03faf846c51DRC 483e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm7 ; transpose coefficients(phase 3) 484e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) 485e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) 486e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm1 ; transpose coefficients(phase 3) 487e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) 488e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) 489e5eaf37440b8e337ab150c017df7c03faf846c51DRC 490e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) 491e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) 492e5eaf37440b8e337ab150c017df7c03faf846c51DRC 493e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 494e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 495e5eaf37440b8e337ab150c017df7c03faf846c51DRC 496e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm0 ; transpose coefficients(phase 3) 497e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) 498e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) 499e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm2 ; 
transpose coefficients(phase 3) 500e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) 501e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) 502e5eaf37440b8e337ab150c017df7c03faf846c51DRC 503e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 504e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 505cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC.column_end: 506cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 507e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Prefetch the next coefficient block 508e5eaf37440b8e337ab150c017df7c03faf846c51DRC 509e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] 510e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] 511e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] 512e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] 513e5eaf37440b8e337ab150c017df7c03faf846c51DRC 514e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 2: process rows from work array, store into output array. 
515e5eaf37440b8e337ab150c017df7c03faf846c51DRC 516e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rax, [original_rbp] 517e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdi, r12 ; (JSAMPROW *) 518498d9bc92fcf39124b6f08e57326944dedd2ddd6Chandler Carruth mov eax, r13d 519e5eaf37440b8e337ab150c017df7c03faf846c51DRC 520e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 521e5eaf37440b8e337ab150c017df7c03faf846c51DRC 522e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 523e5eaf37440b8e337ab150c017df7c03faf846c51DRC 524e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 525e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = (z2 + z3) * 0.541196100; 526e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z1 + z3 * -1.847759065; 527e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z1 + z2 * 0.765366865; 528e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 529e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 530e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); 531e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; 532e5eaf37440b8e337ab150c017df7c03faf846c51DRC 533e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm1 ; xmm1=in2=z2 534e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm1 535e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm6,xmm2 ; xmm2=in6=z3 536e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm5,xmm2 537e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm6 538e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm5 539e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L 540e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H 541e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L 542e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd 
xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H 543e5eaf37440b8e337ab150c017df7c03faf846c51DRC 544e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm7 545e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm7,xmm0 ; xmm7=in0+in4 546e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubw xmm3,xmm0 ; xmm3=in0-in4 547e5eaf37440b8e337ab150c017df7c03faf846c51DRC 548e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm4,xmm4 549e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm0,xmm0 550e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm4,xmm7 ; xmm4=tmp0L 551e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm7 ; xmm0=tmp0H 552e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS 553e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS 554e5eaf37440b8e337ab150c017df7c03faf846c51DRC 555e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 556e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm6 ; xmm4=tmp10L 557e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm6 ; xmm7=tmp13L 558e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 559e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm5 ; xmm0=tmp10H 560e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm6,xmm5 ; xmm6=tmp13H 561e5eaf37440b8e337ab150c017df7c03faf846c51DRC 562e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L 563e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H 564e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L 565e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H 566e5eaf37440b8e337ab150c017df7c03faf846c51DRC 567e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm5,xmm5 568e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm4,xmm4 569e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm3 ; 
xmm5=tmp1L 570e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm3 ; xmm4=tmp1H 571e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS 572e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS 573e5eaf37440b8e337ab150c017df7c03faf846c51DRC 574e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 575e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 ; xmm5=tmp11L 576e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm1 ; xmm0=tmp12L 577e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 578e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm2 ; xmm4=tmp11H 579e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm2 ; xmm7=tmp12H 580e5eaf37440b8e337ab150c017df7c03faf846c51DRC 581e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L 582e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H 583e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L 584e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H 585e5eaf37440b8e337ab150c017df7c03faf846c51DRC 586e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 587e5eaf37440b8e337ab150c017df7c03faf846c51DRC 588e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 589e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 590e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 591e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 592e5eaf37440b8e337ab150c017df7c03faf846c51DRC 593e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm6 594e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm3 595e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm5,xmm1 ; xmm5=z3 
596e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm4,xmm2 ; xmm4=z4 597e5eaf37440b8e337ab150c017df7c03faf846c51DRC 598e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 599e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z5 = (z3 + z4) * 1.175875602; 600e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; 601e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 += z5; z4 += z5; 602e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 603e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 604e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; 605e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); 606e5eaf37440b8e337ab150c017df7c03faf846c51DRC 607e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 608e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm5 609e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm4 610e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm4 611e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm0 612e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm7 613e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L 614e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H 615e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L 616e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H 617e5eaf37440b8e337ab150c017df7c03faf846c51DRC 618e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L 619e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H 620e5eaf37440b8e337ab150c017df7c03faf846c51DRC 621e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 622e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; 
623e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; 624e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; 625e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; 626e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z1 + z3; tmp1 += z2 + z4; 627e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z2 + z3; tmp3 += z1 + z4; 628e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 629e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 630e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; 631e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; 632e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); 633e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); 634e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z3; tmp1 += z4; 635e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z3; tmp3 += z4; 636e5eaf37440b8e337ab150c017df7c03faf846c51DRC 637e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 638e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm1 639e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm3 640e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm3 641e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm0 642e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm7 643e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L 644e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H 645e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L 646e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm3,[rel PW_MF089_F060] ; 
xmm3=tmp3H 647e5eaf37440b8e337ab150c017df7c03faf846c51DRC 648e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L 649e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H 650e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm5 ; xmm1=tmp3L 651e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm4 ; xmm3=tmp3H 652e5eaf37440b8e337ab150c017df7c03faf846c51DRC 653e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L 654e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H 655e5eaf37440b8e337ab150c017df7c03faf846c51DRC 656e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm2 657e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm2 658e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm6 659e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm6 660e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm0 661e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm7 662e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L 663e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H 664e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L 665e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H 666e5eaf37440b8e337ab150c017df7c03faf846c51DRC 667e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm5 ; xmm0=tmp1L 668e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm4 ; xmm7=tmp1H 669e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L 670e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H 671e5eaf37440b8e337ab150c017df7c03faf846c51DRC 672e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L 673e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa 
XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H 674e5eaf37440b8e337ab150c017df7c03faf846c51DRC 675e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 676e5eaf37440b8e337ab150c017df7c03faf846c51DRC 677e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L 678e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H 679e5eaf37440b8e337ab150c017df7c03faf846c51DRC 680e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 681e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 682e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 ; xmm5=data0L 683e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm3 ; xmm4=data0H 684e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm1 ; xmm0=data7L 685e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm3 ; xmm7=data7H 686e5eaf37440b8e337ab150c017df7c03faf846c51DRC 687e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] 688e5eaf37440b8e337ab150c017df7c03faf846c51DRC 689e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 690e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm1 691e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,DESCALE_P2 692e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P2 693e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm1 694e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm1 695e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P2 696e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P2 697e5eaf37440b8e337ab150c017df7c03faf846c51DRC 698e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) 699e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) 700e5eaf37440b8e337ab150c017df7c03faf846c51DRC 701e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L 
702e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H 703e5eaf37440b8e337ab150c017df7c03faf846c51DRC 704e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm3 705e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm1 706e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm2 ; xmm3=data1L 707e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm6 ; xmm1=data1H 708e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm4,xmm2 ; xmm4=data6L 709e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm6 ; xmm7=data6H 710e5eaf37440b8e337ab150c017df7c03faf846c51DRC 711e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] 712e5eaf37440b8e337ab150c017df7c03faf846c51DRC 713e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm2 714e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm2 715e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P2 716e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm1,DESCALE_P2 717e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm2 718e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm2 719e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P2 720e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P2 721e5eaf37440b8e337ab150c017df7c03faf846c51DRC 722e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) 723e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) 724e5eaf37440b8e337ab150c017df7c03faf846c51DRC 725e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 726e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 727e5eaf37440b8e337ab150c017df7c03faf846c51DRC 728e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L 
729e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H 730e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L 731e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H 732e5eaf37440b8e337ab150c017df7c03faf846c51DRC 733e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 734e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 735e5eaf37440b8e337ab150c017df7c03faf846c51DRC 736e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm6 737e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm2 738e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm1 ; xmm6=data2L 739e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm7 ; xmm2=data2H 740e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm4,xmm1 ; xmm4=data5L 741e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm7 ; xmm0=data5H 742e5eaf37440b8e337ab150c017df7c03faf846c51DRC 743e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] 744e5eaf37440b8e337ab150c017df7c03faf846c51DRC 745e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm5 746e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm5 747e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm6,DESCALE_P2 748e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,DESCALE_P2 749e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm5 750e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm5 751e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P2 752e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P2 753e5eaf37440b8e337ab150c017df7c03faf846c51DRC 754e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) 755e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) 756e5eaf37440b8e337ab150c017df7c03faf846c51DRC 757e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L 758e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H 759e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L 760e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H 761e5eaf37440b8e337ab150c017df7c03faf846c51DRC 762e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm3 763e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 764e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm7 ; xmm3=data3L 765e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm5 ; xmm1=data3H 766e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm2,xmm7 ; xmm2=data4L 767e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm5 ; xmm0=data4H 768e5eaf37440b8e337ab150c017df7c03faf846c51DRC 769e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] 770e5eaf37440b8e337ab150c017df7c03faf846c51DRC 771e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm7 772e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm7 773e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P2 774e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm1,DESCALE_P2 775e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm7 776e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm7 777e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,DESCALE_P2 778e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P2 779e5eaf37440b8e337ab150c017df7c03faf846c51DRC 780e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] 781e5eaf37440b8e337ab150c017df7c03faf846c51DRC 782e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) 
783e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) 784e5eaf37440b8e337ab150c017df7c03faf846c51DRC 785e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 786e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 787e5eaf37440b8e337ab150c017df7c03faf846c51DRC 788e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) 789e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) 790e5eaf37440b8e337ab150c017df7c03faf846c51DRC 791e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm7,xmm5 792e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm1,xmm5 793e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm6,xmm5 794e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm3,xmm5 795e5eaf37440b8e337ab150c017df7c03faf846c51DRC 796e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 ; transpose coefficients(phase 1) 797e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) 798e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) 799e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm6 ; transpose coefficients(phase 1) 800e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) 801e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) 802e5eaf37440b8e337ab150c017df7c03faf846c51DRC 803e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm7 ; transpose coefficients(phase 2) 804e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 
22 23 30 31 32 33) 805e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) 806e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm2 ; transpose coefficients(phase 2) 807e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) 808e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) 809e5eaf37440b8e337ab150c017df7c03faf846c51DRC 810e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm7 ; transpose coefficients(phase 3) 811e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) 812e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) 813e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm4 ; transpose coefficients(phase 3) 814e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) 815e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) 816e5eaf37440b8e337ab150c017df7c03faf846c51DRC 817e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) 818e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) 819e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) 820e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) 821e5eaf37440b8e337ab150c017df7c03faf846c51DRC 822e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] 823e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW 
[rdi+2*SIZEOF_JSAMPROW] 824e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 825e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 826e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] 827e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] 828e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 829e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 830e5eaf37440b8e337ab150c017df7c03faf846c51DRC 831e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] 832e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] 833e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 834e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 835e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] 836e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] 837e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 838e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 839e5eaf37440b8e337ab150c017df7c03faf846c51DRC 840e5eaf37440b8e337ab150c017df7c03faf846c51DRC uncollect_args 841e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsp,rbp ; rsp <- aligned rbp 842e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop rsp ; rsp <- original rbp 843e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop rbp 844e5eaf37440b8e337ab150c017df7c03faf846c51DRC ret 845132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC 846132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; For some reason, the OS X linker does not honor the request to align the 847132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; segment unless we do this. 
848e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 849