;
; jidctint.asm - accurate integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slow-but-accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point configuration.
;
;   CONST_BITS - number of fractional bits carried by the F_x_xxx multipliers.
;   PASS1_BITS - scale-up applied to pass-1 (column) results; the all-zero-AC
;                shortcut shifts left by this amount, and the regular column
;                path right-shifts by DESCALE_P1 = CONST_BITS - PASS1_BITS.
;   DESCALE_P2 - shift count paired with PD_DESCALE_P2 below.

%define CONST_BITS      13
%define PASS1_BITS      2

%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)

; F_a_bcd = FIX(a.bcd...), i.e. the constant scaled by 2^CONST_BITS and rounded.

%if CONST_BITS == 13
F_0_298         equ     2446                    ; FIX(0.298631336)
F_0_390         equ     3196                    ; FIX(0.390180644)
F_0_541         equ     4433                    ; FIX(0.541196100)
F_0_765         equ     6270                    ; FIX(0.765366865)
F_0_899         equ     7373                    ; FIX(0.899976223)
F_1_175         equ     9633                    ; FIX(1.175875602)
F_1_501         equ     12299                   ; FIX(1.501321110)
F_1_847         equ     15137                   ; FIX(1.847759065)
F_1_961         equ     16069                   ; FIX(1.961570560)
F_2_053         equ     16819                   ; FIX(2.053119869)
F_2_562         equ     20995                   ; FIX(2.562915447)
F_3_072         equ     25172                   ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same constants pre-scaled by 2^30; DESCALE()
; rounds them down to CONST_BITS fractional bits.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_0_298         equ     DESCALE(320652955, 30-CONST_BITS)       ; FIX(0.298631336)
F_0_390         equ     DESCALE(418953276, 30-CONST_BITS)       ; FIX(0.390180644)
F_0_541         equ     DESCALE(581104887, 30-CONST_BITS)       ; FIX(0.541196100)
F_0_765         equ     DESCALE(821806413, 30-CONST_BITS)       ; FIX(0.765366865)
F_0_899         equ     DESCALE(966342111, 30-CONST_BITS)       ; FIX(0.899976223)
F_1_175         equ     DESCALE(1262586813, 30-CONST_BITS)      ; FIX(1.175875602)
F_1_501         equ     DESCALE(1612031267, 30-CONST_BITS)      ; FIX(1.501321110)
F_1_847         equ     DESCALE(1984016188, 30-CONST_BITS)      ; FIX(1.847759065)
F_1_961         equ     DESCALE(2106220350, 30-CONST_BITS)      ; FIX(1.961570560)
F_2_053         equ     DESCALE(2204520673, 30-CONST_BITS)      ; FIX(2.053119869)
F_2_562         equ     DESCALE(2751909506, 30-CONST_BITS)      ; FIX(2.562915447)
F_3_072         equ     DESCALE(3299298341, 30-CONST_BITS)      ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
; Read-only constant table.
;
; Each PW_* entry is one (word, word) multiplier pair replicated four times
; to fill an xmmword, consumed by pmaddwd on interleaved 16-bit inputs.
; Each PD_DESCALE_Pn entry is 1 << (DESCALE_Pn - 1) in all four dwords.

        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_islow_sse2)

EXTN(jconst_idct_islow_sse2):

PW_F130_F054:   times 4 dw  (F_0_541+F_0_765),  F_0_541                 ; even part: tmp3
PW_F054_MF130:  times 4 dw  F_0_541,            (F_0_541-F_1_847)       ; even part: tmp2
PW_MF078_F117:  times 4 dw  (F_1_175-F_1_961),  F_1_175                 ; odd part:  z3
PW_F117_F078:   times 4 dw  F_1_175,            (F_1_175-F_0_390)       ; odd part:  z4
PW_MF060_MF089: times 4 dw  (F_0_298-F_0_899),  -F_0_899                ; odd part:  tmp0
PW_MF089_F060:  times 4 dw  -F_0_899,           (F_1_501-F_0_899)       ; odd part:  tmp3
PW_MF050_MF256: times 4 dw  (F_2_053-F_2_562),  -F_2_562                ; odd part:  tmp1
PW_MF256_F050:  times 4 dw  -F_2_562,           (F_3_072-F_2_562)       ; odd part:  tmp2
PD_DESCALE_P1:  times 4 dd  1 << (DESCALE_P1-1)
PD_DESCALE_P2:  times 4 dd  1 << (DESCALE_P2-1)
PB_CENTERJSAMP: times 16 db CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Perform dequantization and inverse DCT on one block of coefficients.
92cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; 93cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; GLOBAL(void) 946eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block, 95cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; JSAMPARRAY output_buf, JDIMENSION output_col) 96cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; 97cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 986eb7d3798b5a79347c62825fc4c16f7ce673bdd0Alex Naidis; r10 = void *dct_table 99cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r11 = JCOEFPTR coef_block 100cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r12 = JSAMPARRAY output_buf 101cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC; r13 = JDIMENSION output_col 102cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 103e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define original_rbp rbp+0 104e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] 105e5eaf37440b8e337ab150c017df7c03faf846c51DRC%define WK_NUM 12 106cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 107e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 108e5eaf37440b8e337ab150c017df7c03faf846c51DRC global EXTN(jsimd_idct_islow_sse2) 109cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 110cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRCEXTN(jsimd_idct_islow_sse2): 111e5eaf37440b8e337ab150c017df7c03faf846c51DRC push rbp 112e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rax,rsp ; rax = original rbp 113e5eaf37440b8e337ab150c017df7c03faf846c51DRC sub rsp, byte 4 114e5eaf37440b8e337ab150c017df7c03faf846c51DRC and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 115e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov [rsp],rax 116e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rbp,rsp ; rbp = aligned rbp 117e5eaf37440b8e337ab150c017df7c03faf846c51DRC lea rsp, [wk(0)] 118e5eaf37440b8e337ab150c017df7c03faf846c51DRC collect_args 119cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 120e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 1:
process columns from input. 121cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 122e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, r10 ; quantptr 123e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, r11 ; inptr 124cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 125cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 126e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] 127e5eaf37440b8e337ab150c017df7c03faf846c51DRC or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] 128e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz near .columnDCT 129e5eaf37440b8e337ab150c017df7c03faf846c51DRC 130e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] 131e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] 132e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] 133e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] 134e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] 135e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] 136e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] 137e5eaf37440b8e337ab150c017df7c03faf846c51DRC por xmm1,xmm0 138e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm1,xmm1 139e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm1,xmm1 140e5eaf37440b8e337ab150c017df7c03faf846c51DRC movd eax,xmm1 141e5eaf37440b8e337ab150c017df7c03faf846c51DRC test rax,rax 142e5eaf37440b8e337ab150c017df7c03faf846c51DRC jnz short .columnDCT 143e5eaf37440b8e337ab150c017df7c03faf846c51DRC 144e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- AC terms all zero 145e5eaf37440b8e337ab150c017df7c03faf846c51DRC 146e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] 
147e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 148e5eaf37440b8e337ab150c017df7c03faf846c51DRC 149e5eaf37440b8e337ab150c017df7c03faf846c51DRC psllw xmm5,PASS1_BITS 150e5eaf37440b8e337ab150c017df7c03faf846c51DRC 151e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) 152e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) 153e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) 154e5eaf37440b8e337ab150c017df7c03faf846c51DRC 155e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) 156e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) 157e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) 158e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) 159e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) 160e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) 161e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) 162e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) 163e5eaf37440b8e337ab150c017df7c03faf846c51DRC 164e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 165e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 166e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 167e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 168e5eaf37440b8e337ab150c017df7c03faf846c51DRC jmp near .column_end 
169cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC%endif 170cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC.columnDCT: 171cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 172e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 173e5eaf37440b8e337ab150c017df7c03faf846c51DRC 174e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] 175e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] 176e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 177e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 178e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] 179e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] 180e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 181e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 182e5eaf37440b8e337ab150c017df7c03faf846c51DRC 183e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 184e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = (z2 + z3) * 0.541196100; 185e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z1 + z3 * -1.847759065; 186e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z1 + z2 * 0.765366865; 187e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 188e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 189e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); 190e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; 191e5eaf37440b8e337ab150c017df7c03faf846c51DRC 192e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm1 ; xmm1=in2=z2 193e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm1 
194e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm4,xmm3 ; xmm3=in6=z3 195e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm5,xmm3 196e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm4 197e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm5 198e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L 199e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H 200e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L 201e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H 202e5eaf37440b8e337ab150c017df7c03faf846c51DRC 203e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 204e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm0,xmm2 ; xmm0=in0+in4 205e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubw xmm6,xmm2 ; xmm6=in0-in4 206e5eaf37440b8e337ab150c017df7c03faf846c51DRC 207e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm7,xmm7 208e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm2,xmm2 209e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm7,xmm0 ; xmm7=tmp0L 210e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm2,xmm0 ; xmm2=tmp0H 211e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS 212e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS 213e5eaf37440b8e337ab150c017df7c03faf846c51DRC 214e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 215e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm4 ; xmm7=tmp10L 216e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm4 ; xmm0=tmp13L 217e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm2 218e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm5 ; xmm2=tmp10H 219e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm4,xmm5 ; xmm4=tmp13H 
220e5eaf37440b8e337ab150c017df7c03faf846c51DRC 221e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L 222e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H 223e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L 224e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H 225e5eaf37440b8e337ab150c017df7c03faf846c51DRC 226e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm5,xmm5 227e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm7,xmm7 228e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm6 ; xmm5=tmp1L 229e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm6 ; xmm7=tmp1H 230e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS 231e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS 232e5eaf37440b8e337ab150c017df7c03faf846c51DRC 233e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm5 234e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 ; xmm5=tmp11L 235e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm2,xmm1 ; xmm2=tmp12L 236e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 237e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm3 ; xmm7=tmp11H 238e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm3 ; xmm0=tmp12H 239e5eaf37440b8e337ab150c017df7c03faf846c51DRC 240e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L 241e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H 242e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L 243e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H 244e5eaf37440b8e337ab150c017df7c03faf846c51DRC 245e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 
246e5eaf37440b8e337ab150c017df7c03faf846c51DRC 247e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] 248e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] 249e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 250e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 251e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] 252e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] 253e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 254e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 255e5eaf37440b8e337ab150c017df7c03faf846c51DRC 256e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm6 257e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 258e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm5,xmm3 ; xmm5=z3 259e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm7,xmm1 ; xmm7=z4 260e5eaf37440b8e337ab150c017df7c03faf846c51DRC 261e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 262e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z5 = (z3 + z4) * 1.175875602; 263e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; 264e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 += z5; z4 += z5; 265e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 266e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 267e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; 268e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); 269e5eaf37440b8e337ab150c017df7c03faf846c51DRC 270e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
movdqa xmm2,xmm5 271e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 272e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm7 273e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm7 274e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm2 275e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm0 276e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L 277e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H 278e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L 279e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H 280e5eaf37440b8e337ab150c017df7c03faf846c51DRC 281e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L 282e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H 283e5eaf37440b8e337ab150c017df7c03faf846c51DRC 284e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 285e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; 286e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; 287e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; 288e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; 289e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z1 + z3; tmp1 += z2 + z4; 290e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z2 + z3; tmp3 += z1 + z4; 291e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 292e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 293e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; 294e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; 295e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = 
tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); 296e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); 297e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z3; tmp1 += z4; 298e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z3; tmp3 += z4; 299e5eaf37440b8e337ab150c017df7c03faf846c51DRC 300e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm3 301e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm3 302e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm4 303e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm4 304e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm2 305e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm0 306e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L 307e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H 308e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L 309e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H 310e5eaf37440b8e337ab150c017df7c03faf846c51DRC 311e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L 312e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H 313e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm5 ; xmm3=tmp3L 314e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm7 ; xmm4=tmp3H 315e5eaf37440b8e337ab150c017df7c03faf846c51DRC 316e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L 317e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H 318e5eaf37440b8e337ab150c017df7c03faf846c51DRC 319e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm1 320e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 321e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm6 
322e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm6 323e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm2 324e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 325e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L 326e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H 327e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L 328e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H 329e5eaf37440b8e337ab150c017df7c03faf846c51DRC 330e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm5 ; xmm2=tmp1L 331e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm7 ; xmm0=tmp1H 332e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L 333e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H 334e5eaf37440b8e337ab150c017df7c03faf846c51DRC 335e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L 336e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H 337e5eaf37440b8e337ab150c017df7c03faf846c51DRC 338e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 339e5eaf37440b8e337ab150c017df7c03faf846c51DRC 340e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L 341e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H 342e5eaf37440b8e337ab150c017df7c03faf846c51DRC 343e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm5 344e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 345e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm3 ; xmm5=data0L 346e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm4 ; xmm7=data0H 347e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm2,xmm3 ; xmm2=data7L 348e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm4 ; 
xmm0=data7H 349e5eaf37440b8e337ab150c017df7c03faf846c51DRC 350e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] 351e5eaf37440b8e337ab150c017df7c03faf846c51DRC 352e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm3 353e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm3 354e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,DESCALE_P1 355e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P1 356e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm3 357e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm3 358e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,DESCALE_P1 359e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 360e5eaf37440b8e337ab150c017df7c03faf846c51DRC 361e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) 362e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) 363e5eaf37440b8e337ab150c017df7c03faf846c51DRC 364e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L 365e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H 366e5eaf37440b8e337ab150c017df7c03faf846c51DRC 367e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 368e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm3 369e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm1 ; xmm4=data1L 370e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm6 ; xmm3=data1H 371e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm1 ; xmm7=data6L 372e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm6 ; xmm0=data6H 373e5eaf37440b8e337ab150c017df7c03faf846c51DRC 374e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] 375e5eaf37440b8e337ab150c017df7c03faf846c51DRC 376e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm1 
377e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm1 378e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P1 379e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P1 380e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm1 381e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm1 382e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P1 383e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 384e5eaf37440b8e337ab150c017df7c03faf846c51DRC 385e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) 386e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) 387e5eaf37440b8e337ab150c017df7c03faf846c51DRC 388e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm5 ; transpose coefficients(phase 1) 389e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) 390e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) 391e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm7 ; transpose coefficients(phase 1) 392e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) 393e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) 394e5eaf37440b8e337ab150c017df7c03faf846c51DRC 395e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L 396e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H 397e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L 398e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H 399e5eaf37440b8e337ab150c017df7c03faf846c51DRC 400e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) 401e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], 
xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) 402e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) 403e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) 404e5eaf37440b8e337ab150c017df7c03faf846c51DRC 405e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm3 406e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 407e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm4 ; xmm3=data2L 408e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm2 ; xmm0=data2H 409e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm5,xmm4 ; xmm5=data5L 410e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm6,xmm2 ; xmm6=data5H 411e5eaf37440b8e337ab150c017df7c03faf846c51DRC 412e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] 413e5eaf37440b8e337ab150c017df7c03faf846c51DRC 414e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm7 415e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm7 416e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P1 417e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 418e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm7 419e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm7 420e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,DESCALE_P1 421e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm6,DESCALE_P1 422e5eaf37440b8e337ab150c017df7c03faf846c51DRC 423e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) 424e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) 425e5eaf37440b8e337ab150c017df7c03faf846c51DRC 426e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L 427e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H 
428e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L 429e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H 430e5eaf37440b8e337ab150c017df7c03faf846c51DRC 431e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 432e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm4 433e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm2 ; xmm1=data3L 434e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm7 ; xmm4=data3H 435e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm2 ; xmm0=data4L 436e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm6,xmm7 ; xmm6=data4H 437e5eaf37440b8e337ab150c017df7c03faf846c51DRC 438e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] 439e5eaf37440b8e337ab150c017df7c03faf846c51DRC 440e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm2 441e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm2 442e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm1,DESCALE_P1 443e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P1 444e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm2 445e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm2 446e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P1 447e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm6,DESCALE_P1 448e5eaf37440b8e337ab150c017df7c03faf846c51DRC 449e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) 450e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) 451e5eaf37440b8e337ab150c017df7c03faf846c51DRC 452e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) 453e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) 454e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
455e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm3 ; transpose coefficients(phase 1) 456e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) 457e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) 458e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 ; transpose coefficients(phase 1) 459e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) 460e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) 461e5eaf37440b8e337ab150c017df7c03faf846c51DRC 462e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm7 ; transpose coefficients(phase 2) 463e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) 464e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) 465e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm2 ; transpose coefficients(phase 2) 466e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) 467e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) 468e5eaf37440b8e337ab150c017df7c03faf846c51DRC 469e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) 470e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) 471e5eaf37440b8e337ab150c017df7c03faf846c51DRC 472e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) 473e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) 474e5eaf37440b8e337ab150c017df7c03faf846c51DRC 475e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm0 ; transpose coefficients(phase 2) 476e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm0,xmm3 ; xmm0=(40 50 60 
70 41 51 61 71) 477e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) 478e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm6 ; transpose coefficients(phase 2) 479e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) 480e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) 481e5eaf37440b8e337ab150c017df7c03faf846c51DRC 482e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm7 ; transpose coefficients(phase 3) 483e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) 484e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) 485e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm1 ; transpose coefficients(phase 3) 486e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) 487e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) 488e5eaf37440b8e337ab150c017df7c03faf846c51DRC 489e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) 490e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) 491e5eaf37440b8e337ab150c017df7c03faf846c51DRC 492e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 493e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 494e5eaf37440b8e337ab150c017df7c03faf846c51DRC 495e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm0 ; transpose coefficients(phase 3) 496e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) 497e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) 498e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm2 ; 
transpose coefficients(phase 3) 499e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) 500e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) 501e5eaf37440b8e337ab150c017df7c03faf846c51DRC 502e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 503e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 504cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC.column_end: 505cdc8ac3eb11f11a8c781b3dc33fba7df2c826e9eDRC 506e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Prefetch the next coefficient block 507e5eaf37440b8e337ab150c017df7c03faf846c51DRC 508e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] 509e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] 510e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] 511e5eaf37440b8e337ab150c017df7c03faf846c51DRC prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] 512e5eaf37440b8e337ab150c017df7c03faf846c51DRC 513e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; ---- Pass 2: process rows from work array, store into output array. 
514e5eaf37440b8e337ab150c017df7c03faf846c51DRC 515e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rax, [original_rbp] 516e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdi, r12 ; (JSAMPROW *) 517498d9bc92fcf39124b6f08e57326944dedd2ddd6Chandler Carruth mov eax, r13d 518e5eaf37440b8e337ab150c017df7c03faf846c51DRC 519e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Even part 520e5eaf37440b8e337ab150c017df7c03faf846c51DRC 521e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 522e5eaf37440b8e337ab150c017df7c03faf846c51DRC 523e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 524e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = (z2 + z3) * 0.541196100; 525e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z1 + z3 * -1.847759065; 526e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z1 + z2 * 0.765366865; 527e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 528e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 529e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); 530e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; 531e5eaf37440b8e337ab150c017df7c03faf846c51DRC 532e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm1 ; xmm1=in2=z2 533e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm1 534e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm6,xmm2 ; xmm2=in6=z3 535e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm5,xmm2 536e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm6 537e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm5 538e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L 539e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H 540e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L 541e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd 
xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H 542e5eaf37440b8e337ab150c017df7c03faf846c51DRC 543e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm7 544e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm7,xmm0 ; xmm7=in0+in4 545e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubw xmm3,xmm0 ; xmm3=in0-in4 546e5eaf37440b8e337ab150c017df7c03faf846c51DRC 547e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm4,xmm4 548e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm0,xmm0 549e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm4,xmm7 ; xmm4=tmp0L 550e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm0,xmm7 ; xmm0=tmp0H 551e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS 552e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS 553e5eaf37440b8e337ab150c017df7c03faf846c51DRC 554e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 555e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm6 ; xmm4=tmp10L 556e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm6 ; xmm7=tmp13L 557e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm0 558e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm5 ; xmm0=tmp10H 559e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm6,xmm5 ; xmm6=tmp13H 560e5eaf37440b8e337ab150c017df7c03faf846c51DRC 561e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L 562e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H 563e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L 564e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H 565e5eaf37440b8e337ab150c017df7c03faf846c51DRC 566e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm5,xmm5 567e5eaf37440b8e337ab150c017df7c03faf846c51DRC pxor xmm4,xmm4 568e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm5,xmm3 ; 
xmm5=tmp1L 569e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm3 ; xmm4=tmp1H 570e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS 571e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS 572e5eaf37440b8e337ab150c017df7c03faf846c51DRC 573e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 574e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 ; xmm5=tmp11L 575e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm1 ; xmm0=tmp12L 576e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 577e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm2 ; xmm4=tmp11H 578e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm2 ; xmm7=tmp12H 579e5eaf37440b8e337ab150c017df7c03faf846c51DRC 580e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L 581e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H 582e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L 583e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H 584e5eaf37440b8e337ab150c017df7c03faf846c51DRC 585e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Odd part 586e5eaf37440b8e337ab150c017df7c03faf846c51DRC 587e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 588e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 589e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 590e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 591e5eaf37440b8e337ab150c017df7c03faf846c51DRC 592e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm6 593e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm3 594e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm5,xmm1 ; xmm5=z3 
595e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddw xmm4,xmm2 ; xmm4=z4 596e5eaf37440b8e337ab150c017df7c03faf846c51DRC 597e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 598e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z5 = (z3 + z4) * 1.175875602; 599e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; 600e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 += z5; z4 += z5; 601e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 602e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 603e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; 604e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); 605e5eaf37440b8e337ab150c017df7c03faf846c51DRC 606e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 607e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm5 608e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm4 609e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm4 610e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm0 611e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm7 612e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L 613e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H 614e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L 615e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H 616e5eaf37440b8e337ab150c017df7c03faf846c51DRC 617e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L 618e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H 619e5eaf37440b8e337ab150c017df7c03faf846c51DRC 620e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (Original) 621e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; 
622e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; 623e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; 624e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; 625e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z1 + z3; tmp1 += z2 + z4; 626e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z2 + z3; tmp3 += z1 + z4; 627e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; 628e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; (This implementation) 629e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; 630e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; 631e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); 632e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); 633e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp0 += z3; tmp1 += z4; 634e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; tmp2 += z3; tmp3 += z4; 635e5eaf37440b8e337ab150c017df7c03faf846c51DRC 636e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 637e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm1 638e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm3 639e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm3 640e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm0 641e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm7 642e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L 643e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H 644e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L 645e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm3,[rel PW_MF089_F060] ; 
xmm3=tmp3H 646e5eaf37440b8e337ab150c017df7c03faf846c51DRC 647e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L 648e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H 649e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm5 ; xmm1=tmp3L 650e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm4 ; xmm3=tmp3H 651e5eaf37440b8e337ab150c017df7c03faf846c51DRC 652e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L 653e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H 654e5eaf37440b8e337ab150c017df7c03faf846c51DRC 655e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm2 656e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm2 657e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm0,xmm6 658e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm7,xmm6 659e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm0 660e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6,xmm7 661e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L 662e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H 663e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L 664e5eaf37440b8e337ab150c017df7c03faf846c51DRC pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H 665e5eaf37440b8e337ab150c017df7c03faf846c51DRC 666e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm5 ; xmm0=tmp1L 667e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm4 ; xmm7=tmp1H 668e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L 669e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H 670e5eaf37440b8e337ab150c017df7c03faf846c51DRC 671e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L 672e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa 
XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H 673e5eaf37440b8e337ab150c017df7c03faf846c51DRC 674e5eaf37440b8e337ab150c017df7c03faf846c51DRC ; -- Final output stage 675e5eaf37440b8e337ab150c017df7c03faf846c51DRC 676e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L 677e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H 678e5eaf37440b8e337ab150c017df7c03faf846c51DRC 679e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm5 680e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm4 681e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 ; xmm5=data0L 682e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm3 ; xmm4=data0H 683e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm1 ; xmm0=data7L 684e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm3 ; xmm7=data7H 685e5eaf37440b8e337ab150c017df7c03faf846c51DRC 686e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] 687e5eaf37440b8e337ab150c017df7c03faf846c51DRC 688e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm5,xmm1 689e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm1 690e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm5,DESCALE_P2 691e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P2 692e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm1 693e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm1 694e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P2 695e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P2 696e5eaf37440b8e337ab150c017df7c03faf846c51DRC 697e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) 698e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) 699e5eaf37440b8e337ab150c017df7c03faf846c51DRC 700e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L 
701e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H 702e5eaf37440b8e337ab150c017df7c03faf846c51DRC 703e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm3 704e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,xmm1 705e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm2 ; xmm3=data1L 706e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm6 ; xmm1=data1H 707e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm4,xmm2 ; xmm4=data6L 708e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm7,xmm6 ; xmm7=data6H 709e5eaf37440b8e337ab150c017df7c03faf846c51DRC 710e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] 711e5eaf37440b8e337ab150c017df7c03faf846c51DRC 712e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm2 713e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm2 714e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P2 715e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm1,DESCALE_P2 716e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm2 717e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm7,xmm2 718e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P2 719e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm7,DESCALE_P2 720e5eaf37440b8e337ab150c017df7c03faf846c51DRC 721e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) 722e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) 723e5eaf37440b8e337ab150c017df7c03faf846c51DRC 724e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 725e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 726e5eaf37440b8e337ab150c017df7c03faf846c51DRC 727e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L 
728e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H 729e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L 730e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H 731e5eaf37440b8e337ab150c017df7c03faf846c51DRC 732e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 733e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 734e5eaf37440b8e337ab150c017df7c03faf846c51DRC 735e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm6 736e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm2 737e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm1 ; xmm6=data2L 738e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm7 ; xmm2=data2H 739e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm4,xmm1 ; xmm4=data5L 740e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm7 ; xmm0=data5H 741e5eaf37440b8e337ab150c017df7c03faf846c51DRC 742e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] 743e5eaf37440b8e337ab150c017df7c03faf846c51DRC 744e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm6,xmm5 745e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm5 746e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm6,DESCALE_P2 747e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,DESCALE_P2 748e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm4,xmm5 749e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm5 750e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm4,DESCALE_P2 751e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P2 752e5eaf37440b8e337ab150c017df7c03faf846c51DRC 753e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) 754e5eaf37440b8e337ab150c017df7c03faf846c51DRC 
packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) 755e5eaf37440b8e337ab150c017df7c03faf846c51DRC 756e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L 757e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H 758e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L 759e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H 760e5eaf37440b8e337ab150c017df7c03faf846c51DRC 761e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm3 762e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm1 763e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm7 ; xmm3=data3L 764e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm5 ; xmm1=data3H 765e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm2,xmm7 ; xmm2=data4L 766e5eaf37440b8e337ab150c017df7c03faf846c51DRC psubd xmm0,xmm5 ; xmm0=data4H 767e5eaf37440b8e337ab150c017df7c03faf846c51DRC 768e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] 769e5eaf37440b8e337ab150c017df7c03faf846c51DRC 770e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm3,xmm7 771e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm1,xmm7 772e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm3,DESCALE_P2 773e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm1,DESCALE_P2 774e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm2,xmm7 775e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddd xmm0,xmm7 776e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm2,DESCALE_P2 777e5eaf37440b8e337ab150c017df7c03faf846c51DRC psrad xmm0,DESCALE_P2 778e5eaf37440b8e337ab150c017df7c03faf846c51DRC 779e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] 780e5eaf37440b8e337ab150c017df7c03faf846c51DRC 781e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) 
782e5eaf37440b8e337ab150c017df7c03faf846c51DRC packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) 783e5eaf37440b8e337ab150c017df7c03faf846c51DRC 784e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) 785e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) 786e5eaf37440b8e337ab150c017df7c03faf846c51DRC 787e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) 788e5eaf37440b8e337ab150c017df7c03faf846c51DRC packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) 789e5eaf37440b8e337ab150c017df7c03faf846c51DRC 790e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm7,xmm5 791e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm1,xmm5 792e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm6,xmm5 793e5eaf37440b8e337ab150c017df7c03faf846c51DRC paddb xmm3,xmm5 794e5eaf37440b8e337ab150c017df7c03faf846c51DRC 795e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm0,xmm7 ; transpose coefficients(phase 1) 796e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) 797e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) 798e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm2,xmm6 ; transpose coefficients(phase 1) 799e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) 800e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) 801e5eaf37440b8e337ab150c017df7c03faf846c51DRC 802e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm4,xmm7 ; transpose coefficients(phase 2) 803e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 
22 23 30 31 32 33) 804e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) 805e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm5,xmm2 ; transpose coefficients(phase 2) 806e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) 807e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) 808e5eaf37440b8e337ab150c017df7c03faf846c51DRC 809e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm1,xmm7 ; transpose coefficients(phase 3) 810e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) 811e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) 812e5eaf37440b8e337ab150c017df7c03faf846c51DRC movdqa xmm3,xmm4 ; transpose coefficients(phase 3) 813e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) 814e5eaf37440b8e337ab150c017df7c03faf846c51DRC punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) 815e5eaf37440b8e337ab150c017df7c03faf846c51DRC 816e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) 817e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) 818e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) 819e5eaf37440b8e337ab150c017df7c03faf846c51DRC pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) 820e5eaf37440b8e337ab150c017df7c03faf846c51DRC 821e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] 822e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW 
[rdi+2*SIZEOF_JSAMPROW] 823e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7 824e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 825e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] 826e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] 827e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 828e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 829e5eaf37440b8e337ab150c017df7c03faf846c51DRC 830e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] 831e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] 832e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 833e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 834e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] 835e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] 836e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 837e5eaf37440b8e337ab150c017df7c03faf846c51DRC movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 838e5eaf37440b8e337ab150c017df7c03faf846c51DRC 839e5eaf37440b8e337ab150c017df7c03faf846c51DRC uncollect_args 840e5eaf37440b8e337ab150c017df7c03faf846c51DRC mov rsp,rbp ; rsp <- aligned rbp 841e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop rsp ; rsp <- original rbp 842e5eaf37440b8e337ab150c017df7c03faf846c51DRC pop rbp 843e5eaf37440b8e337ab150c017df7c03faf846c51DRC ret 844132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC 845132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; For some reason, the OS X linker does not honor the request to align the 846132b5fdd6d1b70c32d76ff5389bd6e183e363f4dDRC; segment unless we do this. 
847e5eaf37440b8e337ab150c017df7c03faf846c51DRC align 16 848