;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only path for two blocks handled together.
;   * Reads the word at qcoeff[0] and the word 32 bytes further on
;     (qcoeff[16]; the first coefficient of the second block, given 16
;     shorts per block) and multiplies both by dequant[0].
;   * Writes 4 zero bytes at qcoeff+0 and at qcoeff+32.
;   * Adds the 'fours' constant (defined elsewhere in this file,
;     presumably eight words of 4 - confirm) and shifts right
;     arithmetically by 3.
;   * Adds the first result to the left 4 pixels and the second result
;     to the right 4 pixels of four 8-byte rows at dst, saturating to
;     0..255, and stores the rows back in place.
;
; Arguments are read through the arg(n) macro from the include above.
; Clobbers: rax, rcx, rdx, xmm0-xmm5.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

        mov         rdx,            arg(1) ; dequant
        mov         rax,            arg(0) ; qcoeff

    ; word 0 = qcoeff[0], word 4 = qcoeff[16]; same layout for dequant[0]
        movd        xmm4,           [rax]
        movd        xmm5,           [rdx]

        pinsrw      xmm4,           [rax+32],   4
        pinsrw      xmm5,           [rdx],      4

    ; dequantize both DC terms at once
        pmullw      xmm4,           xmm5

    ; Zero out xmm5, for use clearing coeffs and unpacking
        pxor        xmm5,           xmm5

    ; clear coeffs (4 bytes at the start of each block)
        movd        [rax],          xmm5
        movd        [rax+32],       xmm5

    ; NOTE(review): the original carried a bare ";pshufb" marker here,
    ; presumably a reminder that the broadcast below could be done with
    ; a single pshufb - confirm before acting on it.
        mov         rax,            arg(2) ; dst
        movsxd      rdx,            dword ptr arg(3) ; dst_stride

    ; broadcast word 0 over words 0-3 and word 4 over words 4-7
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b

        lea         rcx,            [rdx + rdx*2]       ; dst_stride * 3
        paddw       xmm4,           [GLOBAL(fours)]

        psraw       xmm4,           3

    ; Load up predict blocks (4 rows of 8 bytes)
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rdx]
        movq        xmm2,           [rax+2*rdx]
        movq        xmm3,           [rax+rcx]

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5


    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1

        lea         rax,            [rax + 2*rdx]

        movq        [rax],          xmm2
        movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Full dequantize + two-pass inverse transform for two blocks handled
; together, with the result added to the destination.
;   * Reads 64 bytes (32 shorts) from qcoeff and then zeroes them.
;   * Multiplies each 16-short block by the 16 shorts at dequant.
;   * Runs the first pass, transposes, runs the second pass with the
;     'fours' constant added and an arithmetic right shift by 3,
;     then transposes back.
;   * Adds the result to four 8-byte rows at dst (saturating to 0..255)
;     and stores the rows back in place.
;
; qcoeff is accessed with movdqa and dequant as a pmullw memory operand,
; so both must be 16-byte aligned.
;
; SAVE_XMM 7 / RESTORE_XMM bracket the use of xmm6-xmm7 (the macro
; bodies live in the include above); rdi is saved and restored here.
; Clobbers: rax, rcx, rdx, xmm0-xmm5.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1) ; dequant
        mov         rdi,            arg(2) ; dst


    ; Zero out xmm7, used just below to clear the coeff buffer
    ; (it is reused as scratch in the passes and re-zeroed before
    ; the predict rows are unpacked)
        pxor        xmm7,           xmm7


    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]
        movsxd      rdx,            dword ptr arg(3) ; dst_stride

    ; repack so block 0 row x and block 1 row x are together
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ;0

        paddw       xmm4,           xmm7        ;1
        psubw       xmm0,           xmm7        ;2

        psubw       xmm6,           xmm3        ;3

    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4

        paddw       xmm3,           xmm5        ; d1
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ;0

        paddw       xmm4,           xmm7        ;1
        psubw       xmm0,           xmm7        ;2

        psubw       xmm6,           xmm3        ;3
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; zero it again for unpacking
        pxor        xmm7,           xmm7

    ; Load up predict blocks
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+2*rdx]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only path for two blocks where the DC terms are supplied through
; the separate 'dc' pointer.
;   * Reads two words from dc (dc[0] for the left block, dc[1] for the
;     right block), adds the 'fours' constant and shifts right
;     arithmetically by 3.
;   * Adds the first result to the left 4 pixels and the second result
;     to the right 4 pixels of four 8-byte rows at dst, saturating to
;     0..255, and stores the rows back in place.
;
; qcoeff and dequant are not read by this routine.
; rdi is saved and restored here.
; Clobbers: rcx, rdx, xmm0-xmm5.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
        mov         rdi,            arg(2) ; dst
        mov         rdx,            arg(4) ; dc

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]

        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx,            [rdx + rdx*2]       ; dst_stride * 3
    ; Load up predict blocks
        movq        xmm0,           [rdi]
        movq        xmm1,           [rdi+rdx*1]
        movq        xmm2,           [rdi+rdx*2]
        movq        xmm3,           [rdi+rcx]

    ; Duplicate and expand dc across
    ; (words 0-3 = dc[0], words 4-7 = dc[1])
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4

    ; Rounding to dequant and downshift
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rbp, rsp 441233d2500723e5594f3e7c70896ffeeef32b9c950ywan SHADOW_ARGS_TO_STACK 5 442233d2500723e5594f3e7c70896ffeeef32b9c950ywan SAVE_XMM 7 443233d2500723e5594f3e7c70896ffeeef32b9c950ywan GET_GOT rbx 444233d2500723e5594f3e7c70896ffeeef32b9c950ywan push rdi 445233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; end prolog 446233d2500723e5594f3e7c70896ffeeef32b9c950ywan 447233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; special case when 2 blocks have 0 or 1 coeffs 448233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; dc is set as first coeff, so no need to load qcoeff 449233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rax, arg(0) ; qcoeff 450233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rdx, arg(1) ; dequant 451233d2500723e5594f3e7c70896ffeeef32b9c950ywan 452233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rdi, arg(2) ; dst 453233d2500723e5594f3e7c70896ffeeef32b9c950ywan 454233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; Zero out xmm7, for use unpacking 455233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm7, xmm7 456233d2500723e5594f3e7c70896ffeeef32b9c950ywan 457233d2500723e5594f3e7c70896ffeeef32b9c950ywan 458233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; note the transpose of xmm1 and xmm2, necessary for shuffle 459233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; to spit out sensicle data 460233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm0, [rax] 461233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, [rax+16] 462233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, [rax+32] 463233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm3, [rax+48] 464233d2500723e5594f3e7c70896ffeeef32b9c950ywan 465233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; Clear out coeffs 466233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa [rax], xmm7 467233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa [rax+16], xmm7 468233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa [rax+32], xmm7 
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa [rax+48], xmm7 470233d2500723e5594f3e7c70896ffeeef32b9c950ywan 471233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; dequantize qcoeff buffer 472233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmullw xmm0, [rdx] 473233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmullw xmm2, [rdx+16] 474233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmullw xmm1, [rdx] 475233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmullw xmm3, [rdx+16] 476233d2500723e5594f3e7c70896ffeeef32b9c950ywan 477233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; DC component 478233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rdx, arg(4) 479233d2500723e5594f3e7c70896ffeeef32b9c950ywan 480233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; repack so block 0 row x and block 1 row x are together 481233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm0 482233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm0, xmm1 483233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm4, xmm1 484233d2500723e5594f3e7c70896ffeeef32b9c950ywan 485233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm0, xmm0, 11011000b 486233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm1, xmm4, 11011000b 487233d2500723e5594f3e7c70896ffeeef32b9c950ywan 488233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm2 489233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm2, xmm3 490233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm4, xmm3 491233d2500723e5594f3e7c70896ffeeef32b9c950ywan 492233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm2, xmm2, 11011000b 493233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm3, xmm4, 11011000b 494233d2500723e5594f3e7c70896ffeeef32b9c950ywan 495233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; insert DC component 496233d2500723e5594f3e7c70896ffeeef32b9c950ywan pinsrw xmm0, [rdx], 0 497233d2500723e5594f3e7c70896ffeeef32b9c950ywan pinsrw xmm0, [rdx+2], 4 498233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; first pass 500233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm0, xmm2 ; b1 = 0-2 501233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm2 ; 502233d2500723e5594f3e7c70896ffeeef32b9c950ywan 503233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm1 504233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm0 ; a1 = 0+2 505233d2500723e5594f3e7c70896ffeeef32b9c950ywan 506233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm5, [GLOBAL(x_s1sqr2)] 507233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) 508233d2500723e5594f3e7c70896ffeeef32b9c950ywan 509233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm3 510233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] 511233d2500723e5594f3e7c70896ffeeef32b9c950ywan 512233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) 513233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm7, xmm5 ; c1 514233d2500723e5594f3e7c70896ffeeef32b9c950ywan 515233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm1 516233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm3 517233d2500723e5594f3e7c70896ffeeef32b9c950ywan 518233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] 519233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm5, xmm1 520233d2500723e5594f3e7c70896ffeeef32b9c950ywan 521233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm3, [GLOBAL(x_s1sqr2)] 522233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm3, xmm4 523233d2500723e5594f3e7c70896ffeeef32b9c950ywan 524233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm3, xmm5 ; d1 525233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm2 ; a1 526233d2500723e5594f3e7c70896ffeeef32b9c950ywan 527233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm0 ; b1 528233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm3 ;0 
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan 530233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm4, xmm7 ;1 531233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm0, xmm7 ;2 532233d2500723e5594f3e7c70896ffeeef32b9c950ywan 533233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm6, xmm3 ;3 534233d2500723e5594f3e7c70896ffeeef32b9c950ywan 535233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; transpose for the second pass 536233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 537233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 538233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 539233d2500723e5594f3e7c70896ffeeef32b9c950ywan 540233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 541233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 542233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 543233d2500723e5594f3e7c70896ffeeef32b9c950ywan 544233d2500723e5594f3e7c70896ffeeef32b9c950ywan 545233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 546233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 547233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 548233d2500723e5594f3e7c70896ffeeef32b9c950ywan 549233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 550233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 551233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 552233d2500723e5594f3e7c70896ffeeef32b9c950ywan 553233d2500723e5594f3e7c70896ffeeef32b9c950ywan 554233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa 
xmm5, xmm2 ; 013 009 005 001 012 008 004 000 555233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 556233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 557233d2500723e5594f3e7c70896ffeeef32b9c950ywan 558233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 559233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 560233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 561233d2500723e5594f3e7c70896ffeeef32b9c950ywan 562233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm0, xmm2, 11011000b 563233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm2, xmm1, 11011000b 564233d2500723e5594f3e7c70896ffeeef32b9c950ywan 565233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm1, xmm5, 11011000b 566233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm3, xmm7, 11011000b 567233d2500723e5594f3e7c70896ffeeef32b9c950ywan 568233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; second pass 569233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm0, xmm2 ; b1 = 0-2 570233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm2 571233d2500723e5594f3e7c70896ffeeef32b9c950ywan 572233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm1 573233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm0 ; a1 = 0+2 574233d2500723e5594f3e7c70896ffeeef32b9c950ywan 575233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm5, [GLOBAL(x_s1sqr2)] 576233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) 577233d2500723e5594f3e7c70896ffeeef32b9c950ywan 578233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm3 579233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] 580233d2500723e5594f3e7c70896ffeeef32b9c950ywan 581233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm7, xmm3 ; ip3 * 
cos(pi/8) * sqrt(2) 582233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm7, xmm5 ; c1 583233d2500723e5594f3e7c70896ffeeef32b9c950ywan 584233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm1 585233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm3 586233d2500723e5594f3e7c70896ffeeef32b9c950ywan 587233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] 588233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm5, xmm1 589233d2500723e5594f3e7c70896ffeeef32b9c950ywan 590233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm3, [GLOBAL(x_s1sqr2)] 591233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm3, xmm4 592233d2500723e5594f3e7c70896ffeeef32b9c950ywan 593233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm3, xmm5 ; d1 594233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm0, [GLOBAL(fours)] 595233d2500723e5594f3e7c70896ffeeef32b9c950ywan 596233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, [GLOBAL(fours)] 597233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm2 ; a1 598233d2500723e5594f3e7c70896ffeeef32b9c950ywan 599233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm0 ; b1 600233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm3 ;0 601233d2500723e5594f3e7c70896ffeeef32b9c950ywan 602233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm4, xmm7 ;1 603233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm0, xmm7 ;2 604233d2500723e5594f3e7c70896ffeeef32b9c950ywan 605233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubw xmm6, xmm3 ;3 606233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm2, 3 607233d2500723e5594f3e7c70896ffeeef32b9c950ywan 608233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm0, 3 609233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm4, 3 610233d2500723e5594f3e7c70896ffeeef32b9c950ywan 611233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm6, 3 612233d2500723e5594f3e7c70896ffeeef32b9c950ywan 613233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; transpose to save 
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 615233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 616233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 617233d2500723e5594f3e7c70896ffeeef32b9c950ywan 618233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 619233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 620233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 621233d2500723e5594f3e7c70896ffeeef32b9c950ywan 622233d2500723e5594f3e7c70896ffeeef32b9c950ywan 623233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 624233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 625233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 626233d2500723e5594f3e7c70896ffeeef32b9c950ywan 627233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 628233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 629233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 630233d2500723e5594f3e7c70896ffeeef32b9c950ywan 631233d2500723e5594f3e7c70896ffeeef32b9c950ywan 632233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 633233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 634233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 635233d2500723e5594f3e7c70896ffeeef32b9c950ywan 636233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 638233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 639233d2500723e5594f3e7c70896ffeeef32b9c950ywan 640233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm0, xmm2, 11011000b 641233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm2, xmm1, 11011000b 642233d2500723e5594f3e7c70896ffeeef32b9c950ywan 643233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm1, xmm5, 11011000b 644233d2500723e5594f3e7c70896ffeeef32b9c950ywan pshufd xmm3, xmm7, 11011000b 645233d2500723e5594f3e7c70896ffeeef32b9c950ywan 646233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm7, xmm7 647233d2500723e5594f3e7c70896ffeeef32b9c950ywan 648233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; Load up predict blocks 649233d2500723e5594f3e7c70896ffeeef32b9c950ywan movsxd rdx, dword ptr arg(3) ; dst_stride 650233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq xmm4, [rdi] 651233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq xmm5, [rdi+rdx] 652233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rcx, [rdx + rdx*2] 653233d2500723e5594f3e7c70896ffeeef32b9c950ywan 654233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm4, xmm7 655233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm5, xmm7 656233d2500723e5594f3e7c70896ffeeef32b9c950ywan 657233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm0, xmm4 658233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm1, xmm5 659233d2500723e5594f3e7c70896ffeeef32b9c950ywan 660233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq xmm4, [rdi+rdx*2] 661233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq xmm5, [rdi+rcx] 662233d2500723e5594f3e7c70896ffeeef32b9c950ywan 663233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm4, xmm7 664233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm5, xmm7 665233d2500723e5594f3e7c70896ffeeef32b9c950ywan 666233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm4 
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm3, xmm5 668233d2500723e5594f3e7c70896ffeeef32b9c950ywan 669233d2500723e5594f3e7c70896ffeeef32b9c950ywan.finish: 670233d2500723e5594f3e7c70896ffeeef32b9c950ywan 671233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; pack up before storing 672233d2500723e5594f3e7c70896ffeeef32b9c950ywan packuswb xmm0, xmm7 673233d2500723e5594f3e7c70896ffeeef32b9c950ywan packuswb xmm1, xmm7 674233d2500723e5594f3e7c70896ffeeef32b9c950ywan packuswb xmm2, xmm7 675233d2500723e5594f3e7c70896ffeeef32b9c950ywan packuswb xmm3, xmm7 676233d2500723e5594f3e7c70896ffeeef32b9c950ywan 677233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; Load destination stride before writing out, 678233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; doesn't need to persist 679233d2500723e5594f3e7c70896ffeeef32b9c950ywan movsxd rdx, dword ptr arg(3) ; dst_stride 680233d2500723e5594f3e7c70896ffeeef32b9c950ywan 681233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; store blocks back out 682233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rdi], xmm0 683233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rdi + rdx], xmm1 684233d2500723e5594f3e7c70896ffeeef32b9c950ywan 685233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rdi + 2*rdx] 686233d2500723e5594f3e7c70896ffeeef32b9c950ywan 687233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rdi], xmm2 688233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rdi + rdx], xmm3 689233d2500723e5594f3e7c70896ffeeef32b9c950ywan 690233d2500723e5594f3e7c70896ffeeef32b9c950ywan 691233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; begin epilog 692233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rdi 693233d2500723e5594f3e7c70896ffeeef32b9c950ywan RESTORE_GOT 694233d2500723e5594f3e7c70896ffeeef32b9c950ywan RESTORE_XMM 695233d2500723e5594f3e7c70896ffeeef32b9c950ywan UNSHADOW_ARGS 696233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rbp 697233d2500723e5594f3e7c70896ffeeef32b9c950ywan ret 698233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
SECTION_RODATA

; Constant vectors for the SSE2 IDCT/dequant routines in this file.
; Every table holds eight identical 16-bit words (one full XMM register) and
; is 16-byte aligned because it is used directly as a memory operand of
; paddw / pmulhw.

; Rounding term: added to the lanes ahead of the final `psraw ..., 3`.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004

; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536.
; pmulhw is a signed multiply, so this word acts as 35468 - 65536; the code
; adds the source operand back with paddw after each multiply to compensate.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536.
; The "less 1" is restored by the paddw of the source operand that follows
; each pmulhw with this table.
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B