;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Syntax: Intel operand order (op dst, src), NASM/yasm-style directives.
; sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, SAVE_XMM,
; RESTORE_XMM, GET_GOT and RESTORE_GOT are not defined in this file; they are
; expected to come from the include above.  The constants fours, x_s1sqr2 and
; x_c1sqr2less1 are referenced below but defined outside this part of the file.

;-----------------------------------------------------------------------------
;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only path for two blocks whose coefficients sit 32 bytes apart in qcoeff.
; For each block: dc = qcoeff[0] * dequant[0]; (dc + fours) >> 3 is added to
; four rows of dst.  Each row is 8 bytes: bytes 0-3 receive the value derived
; from [qcoeff], bytes 4-7 the value derived from [qcoeff+32].  Results are
; packed back to bytes with unsigned saturation.
;
; Side effects: writes 4 zero bytes at [qcoeff] and at [qcoeff+32];
;               rewrites 8 bytes at dst, dst+stride, dst+2*stride, dst+3*stride.
; Registers written: rax, rcx, rdx, xmm0-xmm5.
;-----------------------------------------------------------------------------
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rdx, arg(1)                 ; dequant
    mov         rax, arg(0)                 ; qcoeff

    ; words 0-1 <- first dword of block 0 / of the dequant table
    movd        xmm4, [rax]
    movd        xmm5, [rdx]

    ; word 4 <- first coefficient of the block 32 bytes further on, and the
    ; same dequant[0] factor again
    pinsrw      xmm4, [rax+32], 4
    pinsrw      xmm5, [rdx], 4

    ; word 0 and word 4 now hold the two dequantized DC values
    pmullw      xmm4, xmm5

    ; Zero out xmm5, for use unpacking
    pxor        xmm5, xmm5

    ; clear coeffs (only the first 4 bytes of each block are written)
    movd        [rax], xmm5
    movd        [rax+32], xmm5

    ; NOTE(review): the original carried a bare ";pshufb" marker here,
    ; presumably a reminder that an SSSE3 pshufb could replace the
    ; pshuflw/pshufhw pair below - confirm before relying on it.
    mov         rax, arg(2)                 ; dst
    movsxd      rdx, dword ptr arg(3)       ; dst_stride, sign-extended

    ; broadcast word 0 across words 0-3 and word 4 across words 4-7
    pshuflw     xmm4, xmm4, 00000000b
    pshufhw     xmm4, xmm4, 00000000b

    lea         rcx, [rdx + rdx*2]          ; rcx = dst_stride * 3
    paddw       xmm4, [GLOBAL(fours)]       ; rounding term before the shift

    psraw       xmm4, 3

    ; load four 8-byte rows of the destination
    movq        xmm0, [rax]
    movq        xmm1, [rax+rdx]
    movq        xmm2, [rax+2*rdx]
    movq        xmm3, [rax+rcx]

    ; widen bytes to words (xmm5 is zero)
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5


    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to 0..255)
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; store blocks back out
    movq        [rax], xmm0
    movq        [rax + rdx], xmm1

    lea         rax, [rax + 2*rdx]          ; advance to row 2

    movq        [rax], xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Full path for two blocks at once: loads 64 bytes of qcoeff, multiplies them
; by the first 32 bytes of dequant (the same factors are applied to
; [qcoeff+0..31] and [qcoeff+32..63]), runs two butterfly passes with a
; transpose after each, adds fours and shifts right by 3, adds the result to
; four 8-byte rows of dst and stores them with unsigned saturation.
;
; Requirements: qcoeff and dequant are accessed with movdqa / SSE memory
;               operands, so both must be 16-byte aligned.
; Side effects: zeroes 64 bytes at qcoeff;
;               rewrites 8 bytes at dst, dst+stride, dst+2*stride, dst+3*stride.
; Registers written: rax, rcx, rdx, rdi (saved/restored), xmm0-xmm7.
;-----------------------------------------------------------------------------
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi                         ; saved, though rsi is not used below
    push        rdi
    ; end prolog

    ; general case: every coefficient of both blocks is loaded and transformed
    mov         rax, arg(0)                 ; qcoeff
    mov         rdx, arg(1)                 ; dequant
    mov         rdi, arg(2)                 ; dst


    ; Zero out xmm7, for use unpacking
    pxor        xmm7, xmm7


    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ; to spit out sensible data:
    ;   xmm0 = [qcoeff+0], xmm2 = [qcoeff+16], xmm1 = [qcoeff+32], xmm3 = [qcoeff+48]
    movdqa      xmm0, [rax]
    movdqa      xmm2, [rax+16]
    movdqa      xmm1, [rax+32]
    movdqa      xmm3, [rax+48]

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    movsxd      rdx, dword ptr arg(3)       ; dst_stride (rdx no longer points at dequant)

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; first pass
    ; NOTE(review): after each pmulhw the unscaled input is added back with
    ; paddw; this is part of the fixed-point scaling and depends on the values
    ; of x_s1sqr2 / x_c1sqr2less1, which are defined outside this chunk.
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2                  ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    lea         rcx, [rdx + rdx*2]          ; dst_stride * 3
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ;0

    paddw       xmm4, xmm7                  ;1
    psubw       xmm0, xmm7                  ;2

    psubw       xmm6, xmm3                  ;3

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108


    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102


    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; c1

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4

    paddw       xmm3, xmm5                  ; d1

    ; the rounding term is folded into b1 and a1 here, so all four outputs
    ; formed from them below already include it before the >> 3
    paddw       xmm0, [GLOBAL(fours)]

    paddw       xmm2, [GLOBAL(fours)]
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ;0

    paddw       xmm4, xmm7                  ;1
    psubw       xmm0, xmm7                  ;2

    psubw       xmm6, xmm3                  ;3
    psraw       xmm2, 3

    psraw       xmm0, 3
    psraw       xmm4, 3

    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108


    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102


    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 was used as scratch above; re-zero it for the byte unpack/pack
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+2*rdx]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

; no branch in this routine targets .finish; execution falls through into it
.finish:

    ; pack up before storing (unsigned saturation to 0..255)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only path for two blocks where the DC values are supplied separately:
; reads two consecutive 16-bit values from dc, computes (dc + fours) >> 3 for
; each, and adds the first to bytes 0-3 and the second to bytes 4-7 of four
; 8-byte rows of dst, storing with unsigned saturation.
;
; qcoeff is loaded into rax but never dereferenced, and dequant (arg 1) is
; not read at all in this routine; no coefficient memory is modified.
; Side effects: rewrites 8 bytes at dst, dst+stride, dst+2*stride, dst+3*stride.
; Registers written: rax, rcx, rdx, rdi (saved/restored), xmm0-xmm5.
;-----------------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need to load qcoeff
    mov         rax, arg(0)                 ; qcoeff (pointer only; not read below)

    mov         rdi, arg(2)                 ; dst
    mov         rdx, arg(4)                 ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5, xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4, [rdx]

    movsxd      rdx, dword ptr arg(3)       ; dst_stride (rdx no longer points at dc)
    lea         rcx, [rdx + rdx*2]          ; rcx = dst_stride * 3
    ; Load up predict blocks
    movq        xmm0, [rdi]
    movq        xmm1, [rdi+rdx*1]
    movq        xmm2, [rdi+rdx*2]
    movq        xmm3, [rdi+rcx]

    ; Duplicate and expand dc across:
    ; words 0-3 <- first dc value, words 4-7 <- second dc value
    punpcklwd   xmm4, xmm4
    punpckldq   xmm4, xmm4

    ; Rounding to dequant and downshift
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to 0..255)
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is set as first coeff, so no need
to load qcoeff 449f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rax, arg(0) ; qcoeff 4501b362b15af34006e6a11974088a46d42b903418eJohann mov rdx, arg(1) ; dequant 4511b362b15af34006e6a11974088a46d42b903418eJohann 4521b362b15af34006e6a11974088a46d42b903418eJohann mov rdi, arg(2) ; dst 453f71323e297a928af368937089d3ed71239786f86Andreas Huber 454f71323e297a928af368937089d3ed71239786f86Andreas Huber ; Zero out xmm7, for use unpacking 455f71323e297a928af368937089d3ed71239786f86Andreas Huber pxor xmm7, xmm7 456f71323e297a928af368937089d3ed71239786f86Andreas Huber 457f71323e297a928af368937089d3ed71239786f86Andreas Huber 458f71323e297a928af368937089d3ed71239786f86Andreas Huber ; note the transpose of xmm1 and xmm2, necessary for shuffle 459f71323e297a928af368937089d3ed71239786f86Andreas Huber ; to spit out sensicle data 460f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm0, [rax] 461f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, [rax+16] 462f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, [rax+32] 463f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm3, [rax+48] 464f71323e297a928af368937089d3ed71239786f86Andreas Huber 465f71323e297a928af368937089d3ed71239786f86Andreas Huber ; Clear out coeffs 466f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa [rax], xmm7 467f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa [rax+16], xmm7 468f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa [rax+32], xmm7 469f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa [rax+48], xmm7 470f71323e297a928af368937089d3ed71239786f86Andreas Huber 471f71323e297a928af368937089d3ed71239786f86Andreas Huber ; dequantize qcoeff buffer 472f71323e297a928af368937089d3ed71239786f86Andreas Huber pmullw xmm0, [rdx] 473f71323e297a928af368937089d3ed71239786f86Andreas Huber pmullw xmm2, [rdx+16] 474f71323e297a928af368937089d3ed71239786f86Andreas Huber pmullw xmm1, [rdx] 
475f71323e297a928af368937089d3ed71239786f86Andreas Huber pmullw xmm3, [rdx+16] 476f71323e297a928af368937089d3ed71239786f86Andreas Huber 477f71323e297a928af368937089d3ed71239786f86Andreas Huber ; DC component 4781b362b15af34006e6a11974088a46d42b903418eJohann mov rdx, arg(4) 479f71323e297a928af368937089d3ed71239786f86Andreas Huber 480f71323e297a928af368937089d3ed71239786f86Andreas Huber ; repack so block 0 row x and block 1 row x are together 481f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm0 482f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm0, xmm1 483f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm4, xmm1 484f71323e297a928af368937089d3ed71239786f86Andreas Huber 485f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm0, xmm0, 11011000b 486f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm1, xmm4, 11011000b 487f71323e297a928af368937089d3ed71239786f86Andreas Huber 488f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm2 489f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm2, xmm3 490f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm4, xmm3 491f71323e297a928af368937089d3ed71239786f86Andreas Huber 492f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm2, xmm2, 11011000b 493f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm3, xmm4, 11011000b 494f71323e297a928af368937089d3ed71239786f86Andreas Huber 495f71323e297a928af368937089d3ed71239786f86Andreas Huber ; insert DC component 496f71323e297a928af368937089d3ed71239786f86Andreas Huber pinsrw xmm0, [rdx], 0 497f71323e297a928af368937089d3ed71239786f86Andreas Huber pinsrw xmm0, [rdx+2], 4 498f71323e297a928af368937089d3ed71239786f86Andreas Huber 499f71323e297a928af368937089d3ed71239786f86Andreas Huber ; first pass 500f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm0, xmm2 ; b1 = 0-2 501f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm2, 
xmm2 ; 502f71323e297a928af368937089d3ed71239786f86Andreas Huber 503f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm1 504f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm2, xmm0 ; a1 = 0+2 505f71323e297a928af368937089d3ed71239786f86Andreas Huber 506538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm5, [GLOBAL(x_s1sqr2)] 507f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) 508f71323e297a928af368937089d3ed71239786f86Andreas Huber 509f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm3 510538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] 511f71323e297a928af368937089d3ed71239786f86Andreas Huber 512f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) 513f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm7, xmm5 ; c1 514f71323e297a928af368937089d3ed71239786f86Andreas Huber 515f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm1 516f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm3 517f71323e297a928af368937089d3ed71239786f86Andreas Huber 518538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] 519f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm5, xmm1 520f71323e297a928af368937089d3ed71239786f86Andreas Huber 521538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm3, [GLOBAL(x_s1sqr2)] 522f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm3, xmm4 523f71323e297a928af368937089d3ed71239786f86Andreas Huber 524f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm3, xmm5 ; d1 525f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm2 ; a1 526f71323e297a928af368937089d3ed71239786f86Andreas Huber 527f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm0 ; b1 528f71323e297a928af368937089d3ed71239786f86Andreas Huber 
paddw xmm2, xmm3 ;0 529f71323e297a928af368937089d3ed71239786f86Andreas Huber 530f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm4, xmm7 ;1 531f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm0, xmm7 ;2 532f71323e297a928af368937089d3ed71239786f86Andreas Huber 533f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm6, xmm3 ;3 534f71323e297a928af368937089d3ed71239786f86Andreas Huber 535f71323e297a928af368937089d3ed71239786f86Andreas Huber ; transpose for the second pass 536f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 537f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 538f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 539f71323e297a928af368937089d3ed71239786f86Andreas Huber 540f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 541f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 542f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 543f71323e297a928af368937089d3ed71239786f86Andreas Huber 544f71323e297a928af368937089d3ed71239786f86Andreas Huber 545f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 546f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 547f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 548f71323e297a928af368937089d3ed71239786f86Andreas Huber 549f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 550f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 
551f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 552f71323e297a928af368937089d3ed71239786f86Andreas Huber 553f71323e297a928af368937089d3ed71239786f86Andreas Huber 554f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 555f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 556f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 557f71323e297a928af368937089d3ed71239786f86Andreas Huber 558f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 559f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 560f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 561f71323e297a928af368937089d3ed71239786f86Andreas Huber 562f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm0, xmm2, 11011000b 563f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm2, xmm1, 11011000b 564f71323e297a928af368937089d3ed71239786f86Andreas Huber 565f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm1, xmm5, 11011000b 566f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm3, xmm7, 11011000b 567f71323e297a928af368937089d3ed71239786f86Andreas Huber 568f71323e297a928af368937089d3ed71239786f86Andreas Huber ; second pass 569f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm0, xmm2 ; b1 = 0-2 570f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm2, xmm2 571f71323e297a928af368937089d3ed71239786f86Andreas Huber 572f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm1 573f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm2, xmm0 ; a1 = 0+2 574f71323e297a928af368937089d3ed71239786f86Andreas Huber 
575538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm5, [GLOBAL(x_s1sqr2)] 576f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) 577f71323e297a928af368937089d3ed71239786f86Andreas Huber 578f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm3 579538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] 580f71323e297a928af368937089d3ed71239786f86Andreas Huber 581f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) 582f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm7, xmm5 ; c1 583f71323e297a928af368937089d3ed71239786f86Andreas Huber 584f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm1 585f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm3 586f71323e297a928af368937089d3ed71239786f86Andreas Huber 587538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] 588f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm5, xmm1 589f71323e297a928af368937089d3ed71239786f86Andreas Huber 590538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm3, [GLOBAL(x_s1sqr2)] 591f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm3, xmm4 592f71323e297a928af368937089d3ed71239786f86Andreas Huber 593f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm3, xmm5 ; d1 594538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm0, [GLOBAL(fours)] 595f71323e297a928af368937089d3ed71239786f86Andreas Huber 596538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm2, [GLOBAL(fours)] 597f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm2 ; a1 598f71323e297a928af368937089d3ed71239786f86Andreas Huber 599f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm0 ; b1 600f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm2, xmm3 ;0 
601f71323e297a928af368937089d3ed71239786f86Andreas Huber 602f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm4, xmm7 ;1 603f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm0, xmm7 ;2 604f71323e297a928af368937089d3ed71239786f86Andreas Huber 605f71323e297a928af368937089d3ed71239786f86Andreas Huber psubw xmm6, xmm3 ;3 606f71323e297a928af368937089d3ed71239786f86Andreas Huber psraw xmm2, 3 607f71323e297a928af368937089d3ed71239786f86Andreas Huber 608f71323e297a928af368937089d3ed71239786f86Andreas Huber psraw xmm0, 3 609f71323e297a928af368937089d3ed71239786f86Andreas Huber psraw xmm4, 3 610f71323e297a928af368937089d3ed71239786f86Andreas Huber 611f71323e297a928af368937089d3ed71239786f86Andreas Huber psraw xmm6, 3 612f71323e297a928af368937089d3ed71239786f86Andreas Huber 613f71323e297a928af368937089d3ed71239786f86Andreas Huber ; transpose to save 614f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 615f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 616f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 617f71323e297a928af368937089d3ed71239786f86Andreas Huber 618f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 619f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 620f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 621f71323e297a928af368937089d3ed71239786f86Andreas Huber 622f71323e297a928af368937089d3ed71239786f86Andreas Huber 623f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 624f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 625f71323e297a928af368937089d3ed71239786f86Andreas Huber 
punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 626f71323e297a928af368937089d3ed71239786f86Andreas Huber 627f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 628f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 629f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 630f71323e297a928af368937089d3ed71239786f86Andreas Huber 631f71323e297a928af368937089d3ed71239786f86Andreas Huber 632f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 633f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 634f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 635f71323e297a928af368937089d3ed71239786f86Andreas Huber 636f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 637f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 638f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 639f71323e297a928af368937089d3ed71239786f86Andreas Huber 640f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm0, xmm2, 11011000b 641f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm2, xmm1, 11011000b 642f71323e297a928af368937089d3ed71239786f86Andreas Huber 643f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm1, xmm5, 11011000b 644f71323e297a928af368937089d3ed71239786f86Andreas Huber pshufd xmm3, xmm7, 11011000b 645f71323e297a928af368937089d3ed71239786f86Andreas Huber 646f71323e297a928af368937089d3ed71239786f86Andreas Huber pxor xmm7, xmm7 647f71323e297a928af368937089d3ed71239786f86Andreas Huber 648f71323e297a928af368937089d3ed71239786f86Andreas Huber ; Load up 
predict blocks 6491b362b15af34006e6a11974088a46d42b903418eJohann movsxd rdx, dword ptr arg(3) ; dst_stride 6501b362b15af34006e6a11974088a46d42b903418eJohann movq xmm4, [rdi] 6511b362b15af34006e6a11974088a46d42b903418eJohann movq xmm5, [rdi+rdx] 6521b362b15af34006e6a11974088a46d42b903418eJohann lea rcx, [rdx + rdx*2] 653f71323e297a928af368937089d3ed71239786f86Andreas Huber 654f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm4, xmm7 655f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm5, xmm7 656f71323e297a928af368937089d3ed71239786f86Andreas Huber 657f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm0, xmm4 658f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm1, xmm5 659f71323e297a928af368937089d3ed71239786f86Andreas Huber 6601b362b15af34006e6a11974088a46d42b903418eJohann movq xmm4, [rdi+rdx*2] 6611b362b15af34006e6a11974088a46d42b903418eJohann movq xmm5, [rdi+rcx] 662f71323e297a928af368937089d3ed71239786f86Andreas Huber 663f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm4, xmm7 664f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm5, xmm7 665f71323e297a928af368937089d3ed71239786f86Andreas Huber 666f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm2, xmm4 667f71323e297a928af368937089d3ed71239786f86Andreas Huber paddw xmm3, xmm5 668f71323e297a928af368937089d3ed71239786f86Andreas Huber 669f71323e297a928af368937089d3ed71239786f86Andreas Huber.finish: 670f71323e297a928af368937089d3ed71239786f86Andreas Huber 671f71323e297a928af368937089d3ed71239786f86Andreas Huber ; pack up before storing 672f71323e297a928af368937089d3ed71239786f86Andreas Huber packuswb xmm0, xmm7 673f71323e297a928af368937089d3ed71239786f86Andreas Huber packuswb xmm1, xmm7 674f71323e297a928af368937089d3ed71239786f86Andreas Huber packuswb xmm2, xmm7 675f71323e297a928af368937089d3ed71239786f86Andreas Huber packuswb xmm3, xmm7 676f71323e297a928af368937089d3ed71239786f86Andreas Huber 
677f71323e297a928af368937089d3ed71239786f86Andreas Huber ; Load destination stride before writing out, 678f71323e297a928af368937089d3ed71239786f86Andreas Huber ; doesn't need to persist 6791b362b15af34006e6a11974088a46d42b903418eJohann movsxd rdx, dword ptr arg(3) ; dst_stride 680f71323e297a928af368937089d3ed71239786f86Andreas Huber 681f71323e297a928af368937089d3ed71239786f86Andreas Huber ; store blocks back out 682f71323e297a928af368937089d3ed71239786f86Andreas Huber movq [rdi], xmm0 683f71323e297a928af368937089d3ed71239786f86Andreas Huber movq [rdi + rdx], xmm1 684f71323e297a928af368937089d3ed71239786f86Andreas Huber 685f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rdi + 2*rdx] 686f71323e297a928af368937089d3ed71239786f86Andreas Huber 687f71323e297a928af368937089d3ed71239786f86Andreas Huber movq [rdi], xmm2 688f71323e297a928af368937089d3ed71239786f86Andreas Huber movq [rdi + rdx], xmm3 689f71323e297a928af368937089d3ed71239786f86Andreas Huber 690f71323e297a928af368937089d3ed71239786f86Andreas Huber 691f71323e297a928af368937089d3ed71239786f86Andreas Huber ; begin epilog 692f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rdi 693f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_GOT 6941b362b15af34006e6a11974088a46d42b903418eJohann RESTORE_XMM 695f71323e297a928af368937089d3ed71239786f86Andreas Huber UNSHADOW_ARGS 696f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rbp 697f71323e297a928af368937089d3ed71239786f86Andreas Huber ret 698f71323e297a928af368937089d3ed71239786f86Andreas Huber 699f71323e297a928af368937089d3ed71239786f86Andreas HuberSECTION_RODATA 700f71323e297a928af368937089d3ed71239786f86Andreas Huberalign 16 701f71323e297a928af368937089d3ed71239786f86Andreas Huberfours: 702f71323e297a928af368937089d3ed71239786f86Andreas Huber times 8 dw 0x0004 703f71323e297a928af368937089d3ed71239786f86Andreas Huberalign 16 704f71323e297a928af368937089d3ed71239786f86Andreas Huberx_s1sqr2: 
705f71323e297a928af368937089d3ed71239786f86Andreas Huber times 8 dw 0x8A8C 706f71323e297a928af368937089d3ed71239786f86Andreas Huberalign 16 707f71323e297a928af368937089d3ed71239786f86Andreas Huberx_c1sqr2less1: 708f71323e297a928af368937089d3ed71239786f86Andreas Huber times 8 dw 0x4E7B 709