;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; DC-only reconstruction of two blocks at once.
;   * The first coefficient of block 0 ([qcoeff]) and of block 1
;     ([qcoeff + 32 bytes]) is multiplied by dequant[0].
;   * Both coefficient slots are cleared (4 bytes each).
;   * Each product has [fours] added and is shifted right by 3, then is
;     broadcast: block 0 in the low four words, block 1 in the high four.
;   * Four rows of 8 prediction bytes (row pitch blk_stride) are widened,
;     offset by that value, saturated back to bytes and written to dst
;     (row pitch dst_stride).
;
; Registers: rax, rcx, rdx, xmm0-xmm5 are overwritten.  xmm6/xmm7 are
; deliberately left alone because they are callee-saved in the
; Microsoft x64 calling convention.
; NOTE(review): `fours` is defined outside this view; presumably eight
; 16-bit 4s (rounding term for the >>3) - confirm.
;-----------------------------------------------------------------------
global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

    mov         rdx,            arg(1)              ; dequant
    mov         rax,            arg(0)              ; qcoeff

    ; word 0 <- first coeff of block 0, word 4 <- first coeff of block 1
    movd        xmm4,           [rax]
    movd        xmm5,           [rdx]

    pinsrw      xmm4,           [rax+32],   4
    pinsrw      xmm5,           [rdx],      4       ; same dequant[0] for both blocks

    pmullw      xmm4,           xmm5

    ; xmm5 (dequant factors) is dead from here on: reuse it as the
    ; all-zero register for clearing, unpacking and packing.
    pxor        xmm5,           xmm5

    ; clear coeffs
    movd        [rax],          xmm5
    movd        [rax+32],       xmm5

    ; broadcast word 0 across the low half and word 4 across the high half
    pshuflw     xmm4,           xmm4,       00000000b
    pshufhw     xmm4,           xmm4,       00000000b

    mov         rax,            arg(2)              ; pre
    paddw       xmm4,           [GLOBAL(fours)]

    movsxd      rcx,            dword ptr arg(5)    ; blk_stride
    psraw       xmm4,           3

    ; load four rows of prediction, 8 bytes (= 2 blocks x 4 pixels) each
    movq        xmm0,           [rax]
    movq        xmm1,           [rax+rcx]
    movq        xmm2,           [rax+2*rcx]
    lea         rcx,            [3*rcx]
    movq        xmm3,           [rax+rcx]

    ; bytes -> words
    punpcklbw   xmm0,           xmm5
    punpcklbw   xmm1,           xmm5
    punpcklbw   xmm2,           xmm5
    punpcklbw   xmm3,           xmm5

    mov         rax,            arg(3)              ; dst
    movsxd      rdx,            dword ptr arg(4)    ; dst_stride

    ; Add to predict buffer
    paddw       xmm0,           xmm4
    paddw       xmm1,           xmm4
    paddw       xmm2,           xmm4
    paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to 0..255)
    packuswb    xmm0,           xmm5
    packuswb    xmm1,           xmm5
    packuswb    xmm2,           xmm5
    packuswb    xmm3,           xmm5

    ; store blocks back out
    movq        [rax],          xmm0
    movq        [rax + rdx],    xmm1

    lea         rax,            [rax + 2*rdx]

    movq        [rax],          xmm2
    movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; idct_dequant_full_2x_sse2
;   arg(0) qcoeff, arg(1) dequant, arg(2) pre, arg(3) dst,
;   arg(4) dst_stride, arg(5) blk_stride
;   (argument types presumably match idct_dequant_0_2x_sse2 - confirm
;    against the C prototype)
;
; Full dequantize + two-pass inverse transform of two blocks at once.
;   * 64 bytes of coefficients are read from qcoeff and then zeroed.
;   * Bytes 0-31 and bytes 32-63 are each multiplied by the same 32
;     bytes at dequant.
;   * Two butterfly passes run with a transpose after each; [fours] is
;     added and the result shifted right by 3 in the second pass.
;   * The residual is added to four 8-byte prediction rows (pitch
;     blk_stride), saturated to bytes and stored to dst (pitch dst_stride).
;
; Alignment: qcoeff and dequant are accessed with movdqa / SSE memory
; operands, so both must be 16-byte aligned.
;
; Registers: rax, rcx, rdx and xmm0-xmm5 are overwritten.  rsi/rdi are
; pushed and popped.  xmm6/xmm7 are needed as working registers, so
; they are spilled to the stack and restored, because they are
; callee-saved in the Microsoft x64 calling convention.
; NOTE(review): if the ABI support header offers a dedicated XMM
; save/restore macro, it could replace the manual spill below.
; NOTE(review): only arg(0)..arg(5) are read here although the prolog
; shadows 7 arguments - confirm that 7 is intended.
;-----------------------------------------------------------------------
global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi

    ; preserve xmm6/xmm7 (2 x 16 bytes); movdqu because the stack
    ; pointer is not known to be 16-byte aligned at this point
    sub         rsp,            32
    movdqu      [rsp],          xmm6
    movdqu      [rsp+16],       xmm7
    ; end prolog

    mov         rax,            arg(0)              ; qcoeff
    mov         rsi,            arg(2)              ; pre
    mov         rdi,            arg(3)              ; dst
    movsxd      rcx,            dword ptr arg(5)    ; blk_stride

    ; Zero out xmm7, used here to clear the coefficient buffer
    pxor        xmm7,           xmm7

    mov         rdx,            arg(1)              ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0,           [rax]
    movdqa      xmm2,           [rax+16]
    movdqa      xmm1,           [rax+32]
    movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
    movdqa      [rax],          xmm7
    movdqa      [rax+16],       xmm7
    movdqa      [rax+32],       xmm7
    movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (both blocks use the same dequant table)
    pmullw      xmm0,           [rdx]
    pmullw      xmm2,           [rdx+16]
    pmullw      xmm1,           [rdx]
    pmullw      xmm3,           [rdx+16]

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4,           xmm0
    punpckldq   xmm0,           xmm1
    punpckhdq   xmm4,           xmm1

    pshufd      xmm0,           xmm0,       11011000b
    pshufd      xmm1,           xmm4,       11011000b

    movdqa      xmm4,           xmm2
    punpckldq   xmm2,           xmm3
    punpckhdq   xmm4,           xmm3

    pshufd      xmm2,           xmm2,       11011000b
    pshufd      xmm3,           xmm4,       11011000b

    ; first pass
    psubw       xmm0,           xmm2                ; b1 = 0-2
    paddw       xmm2,           xmm2                ;

    movdqa      xmm5,           xmm1
    paddw       xmm2,           xmm0                ; a1 = 0+2

    pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    paddw       xmm5,           xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,           xmm3
    pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,           xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,           xmm5                ; c1

    movdqa      xmm5,           xmm1
    movdqa      xmm4,           xmm3

    pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,           xmm1

    pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    paddw       xmm3,           xmm4

    paddw       xmm3,           xmm5                ; d1
    movdqa      xmm6,           xmm2                ; a1

    movdqa      xmm4,           xmm0                ; b1
    paddw       xmm2,           xmm3                ;0

    paddw       xmm4,           xmm7                ;1
    psubw       xmm0,           xmm7                ;2

    psubw       xmm6,           xmm3                ;3

    ; transpose for the second pass
    movdqa      xmm7,           xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,           xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,           xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,           xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,           xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,           xmm6                ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,           xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,           xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,           xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,           xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,           xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,           xmm5                ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,           xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,           xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,           xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,           xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,           xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,           xmm6                ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,           xmm2,       11011000b
    pshufd      xmm2,           xmm1,       11011000b

    pshufd      xmm1,           xmm5,       11011000b
    pshufd      xmm3,           xmm7,       11011000b

    ; second pass
    psubw       xmm0,           xmm2                ; b1 = 0-2
    paddw       xmm2,           xmm2

    movdqa      xmm5,           xmm1
    paddw       xmm2,           xmm0                ; a1 = 0+2

    pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    paddw       xmm5,           xmm1                ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,           xmm3
    pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,           xmm3                ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,           xmm5                ; c1

    movdqa      xmm5,           xmm1
    movdqa      xmm4,           xmm3

    pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,           xmm1

    pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    paddw       xmm3,           xmm4

    paddw       xmm3,           xmm5                ; d1
    paddw       xmm0,           [GLOBAL(fours)]     ; rounding term folded into b1 ...

    paddw       xmm2,           [GLOBAL(fours)]     ; ... and a1, ahead of the >>3 below
    movdqa      xmm6,           xmm2                ; a1

    movdqa      xmm4,           xmm0                ; b1
    paddw       xmm2,           xmm3                ;0

    paddw       xmm4,           xmm7                ;1
    psubw       xmm0,           xmm7                ;2

    psubw       xmm6,           xmm3                ;3
    psraw       xmm2,           3

    psraw       xmm0,           3
    psraw       xmm4,           3

    psraw       xmm6,           3

    ; transpose to save
    movdqa      xmm7,           xmm2                ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,           xmm0                ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,           xmm0                ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,           xmm4                ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,           xmm6                ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,           xmm6                ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,           xmm2                ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,           xmm4                ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,           xmm4                ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,           xmm7                ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,           xmm5                ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,           xmm5                ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,           xmm2                ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,           xmm7                ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,           xmm7                ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,           xmm1                ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,           xmm6                ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,           xmm6                ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,           xmm2,       11011000b
    pshufd      xmm2,           xmm1,       11011000b

    pshufd      xmm1,           xmm5,       11011000b
    pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 becomes the zero register again for unpack / pack
    pxor        xmm7,           xmm7

    ; Load up predict blocks
    movq        xmm4,           [rsi]
    movq        xmm5,           [rsi+rcx]

    punpcklbw   xmm4,           xmm7
    punpcklbw   xmm5,           xmm7

    paddw       xmm0,           xmm4
    paddw       xmm1,           xmm5

    movq        xmm4,           [rsi+2*rcx]
    lea         rcx,            [3*rcx]
    movq        xmm5,           [rsi+rcx]

    punpcklbw   xmm4,           xmm7
    punpcklbw   xmm5,           xmm7

    paddw       xmm2,           xmm4
    paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation to 0..255)
    packuswb    xmm0,           xmm7
    packuswb    xmm1,           xmm7
    packuswb    xmm2,           xmm7
    packuswb    xmm3,           xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx,            dword ptr arg(4)    ; dst_stride

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,            [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3

    ; begin epilog
    ; restore the callee-saved vector registers spilled in the prolog
    movdqu      xmm6,           [rsp]
    movdqu      xmm7,           [rsp+16]
    add         rsp,            32

    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   short *dc           - 5
; )
;
; DC-only reconstruction of two blocks, with the two DC values taken
; from the separate `dc` array (dc[0] for block 0, dc[1] for block 1).
; qcoeff and dequant are not read by this routine.
;   * Each DC value has [fours] added and is shifted right by 3.
;   * It is then added to four 8-byte prediction rows, which are read
;     at a fixed 16-byte pitch from `pre`.
;   * The result is saturated to bytes and stored to dst (pitch dst_stride).
;
; Registers: rdx and xmm0-xmm5 are overwritten.  rsi/rdi are pushed
; and popped.  xmm6/xmm7 are deliberately left alone because they are
; callee-saved in the Microsoft x64 calling convention.
; NOTE(review): only arg(2)..arg(5) are read here although the prolog
; shadows 7 arguments - confirm that 7 is intended.
;-----------------------------------------------------------------------
global sym(idct_dequant_dc_0_2x_sse2)
sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is supplied separately, so qcoeff is not loaded at all
    mov         rsi,            arg(2)              ; pre
    mov         rdi,            arg(3)              ; dst
    mov         rdx,            arg(5)              ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4,           [rdx]

    ; Load up predict blocks
    movq        xmm0,           [rsi]
    movq        xmm1,           [rsi+16]
    movq        xmm2,           [rsi+32]
    movq        xmm3,           [rsi+48]

    ; Duplicate and expand dc across:
    ;   dc0 in the low four words, dc1 in the high four words
    punpcklwd   xmm4,           xmm4
    punpckldq   xmm4,           xmm4

    ; Rounding to dequant and downshift
    paddw       xmm4,           [GLOBAL(fours)]
    psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0,           xmm5
    punpcklbw   xmm1,           xmm5
    punpcklbw   xmm2,           xmm5
    punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
    paddw       xmm0,           xmm4
    paddw       xmm1,           xmm4
    paddw       xmm2,           xmm4
    paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to 0..255)
    packuswb    xmm0,           xmm5
    packuswb    xmm1,           xmm5
    packuswb    xmm2,           xmm5
    packuswb    xmm3,           xmm5

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx,            dword ptr arg(4)    ; dst_stride

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,            [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(idct_dequant_dc_full_2x_sse2)
;void idct_dequant_dc_full_2x_sse2
; (
;   qcoeff      - 0   16-bit coefficients; block 0 at +0, block 1 at +32 bytes
;   dequant     - 1   16 16-bit dequant factors, shared by both blocks
;   pre         - 2   prediction bytes; 8 bytes are read from each of 4 rows
;                     spaced 16 bytes apart
;   dst         - 3   output bytes; 8 bytes are written to each of 4 rows
;   dst_stride  - 4   32-bit signed row pitch of dst, in bytes
;   dc          - 5   pointer to two 16-bit DC values (block 0, then block 1)
; )
;
; Dequantizes two 4x4 blocks, substitutes the separately supplied DC value for
; coefficient 0 of each block, runs the two-pass 4x4 inverse transform on both
; blocks at once (block 0 in the low 4 words of each register, block 1 in the
; high 4 words), adds the result to the prediction and stores it to dst with
; unsigned saturation. The 64 bytes of qcoeff that were read are zeroed.
;
; qcoeff and dequant are accessed with movdqa / SSE memory operands and so
; must be 16-byte aligned.
;
; NOTE(review): xmm6 and xmm7 are overwritten and never saved in this routine.
;   On ABIs where they are callee-saved (e.g. Microsoft x64) the caller's
;   values are lost unless the ABI support macros deal with it -- confirm.
; NOTE(review): only arg(0)..arg(5) are referenced, yet 7 arguments are
;   shadowed below -- confirm whether the count is intentional.
sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; full transform path: every coefficient of both blocks is loaded and
    ; dequantized; only coefficient 0 is replaced later by the dc argument
    mov         rax,            arg(0)      ; qcoeff
    mov         rsi,            arg(2)      ; pre
    mov         rdi,            arg(3)      ; dst

    ; Zero out xmm7, used here to clear the coefficient buffer
    pxor        xmm7,           xmm7

    mov         rdx,            arg(1)      ; dequant

    ; note the swap of xmm1 and xmm2 relative to memory order: xmm0/xmm1 get
    ;   rows 0-1 of block 0/block 1 and xmm2/xmm3 get rows 2-3, which is what
    ;   the repack shuffles below need to produce sensible data
    movdqa      xmm0,           [rax]
    movdqa      xmm2,           [rax+16]
    movdqa      xmm1,           [rax+32]
    movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
    movdqa      [rax],          xmm7
    movdqa      [rax+16],       xmm7
    movdqa      [rax+32],       xmm7
    movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same factors for both blocks)
    pmullw      xmm0,           [rdx]
    pmullw      xmm2,           [rdx+16]
    pmullw      xmm1,           [rdx]
    pmullw      xmm3,           [rdx+16]

    ; DC component; rdx is free again now that dequant has been consumed
    mov         rdx,            arg(5)

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4,           xmm0
    punpckldq   xmm0,           xmm1
    punpckhdq   xmm4,           xmm1

    pshufd      xmm0,           xmm0,       11011000b
    pshufd      xmm1,           xmm4,       11011000b

    movdqa      xmm4,           xmm2
    punpckldq   xmm2,           xmm3
    punpckhdq   xmm4,           xmm3

    pshufd      xmm2,           xmm2,       11011000b
    pshufd      xmm3,           xmm4,       11011000b

    ; insert DC component: word 0 is block 0 coeff 0, word 4 is block 1 coeff 0
    pinsrw      xmm0,           [rdx],      0
    pinsrw      xmm0,           [rdx+2],    4

    ; first pass
    ; x_s1sqr2 (0x8A8C) is negative as a signed word, so pmulhw followed by
    ; adding the input back yields x * 0x8A8C / 65536. x_c1sqr2less1 holds the
    ; factor minus one, so the same add-back restores the missing 1.0 * x.
    psubw       xmm0,           xmm2        ; b1 = 0-2
    paddw       xmm2,           xmm2        ;

    movdqa      xmm5,           xmm1
    paddw       xmm2,           xmm0        ; a1 = 0+2

    pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,           xmm3
    pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,           xmm5        ; c1

    movdqa      xmm5,           xmm1
    movdqa      xmm4,           xmm3

    pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,           xmm1

    pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    paddw       xmm3,           xmm4

    paddw       xmm3,           xmm5        ; d1
    movdqa      xmm6,           xmm2        ; a1

    movdqa      xmm4,           xmm0        ; b1
    paddw       xmm2,           xmm3        ;0

    paddw       xmm4,           xmm7        ;1
    psubw       xmm0,           xmm7        ;2

    psubw       xmm6,           xmm3        ;3

    ; transpose for the second pass
    movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,           xmm2,       11011000b
    pshufd      xmm2,           xmm1,       11011000b

    pshufd      xmm1,           xmm5,       11011000b
    pshufd      xmm3,           xmm7,       11011000b

    ; second pass
    psubw       xmm0,           xmm2        ; b1 = 0-2
    paddw       xmm2,           xmm2

    movdqa      xmm5,           xmm1
    paddw       xmm2,           xmm0        ; a1 = 0+2

    pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
    paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,           xmm3
    pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,           xmm5        ; c1

    movdqa      xmm5,           xmm1
    movdqa      xmm4,           xmm3

    pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,           xmm1

    pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
    paddw       xmm3,           xmm4

    ; the +4 rounding term is folded into a1 and b1, so all four outputs
    ; carry it into the arithmetic shift right by 3 below
    paddw       xmm3,           xmm5        ; d1
    paddw       xmm0,           [GLOBAL(fours)]

    paddw       xmm2,           [GLOBAL(fours)]
    movdqa      xmm6,           xmm2        ; a1

    movdqa      xmm4,           xmm0        ; b1
    paddw       xmm2,           xmm3        ;0

    paddw       xmm4,           xmm7        ;1
    psubw       xmm0,           xmm7        ;2

    psubw       xmm6,           xmm3        ;3
    psraw       xmm2,           3

    psraw       xmm0,           3
    psraw       xmm4,           3

    psraw       xmm6,           3

    ; transpose to save
    movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,           xmm2,       11011000b
    pshufd      xmm2,           xmm1,       11011000b

    pshufd      xmm1,           xmm5,       11011000b
    pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; zero it again for the byte unpacks
    pxor        xmm7,           xmm7

    ; Load up predict blocks (8 bytes per row, rows 16 bytes apart)
    movq        xmm4,           [rsi]
    movq        xmm5,           [rsi+16]

    punpcklbw   xmm4,           xmm7
    punpcklbw   xmm5,           xmm7

    paddw       xmm0,           xmm4
    paddw       xmm1,           xmm5

    movq        xmm4,           [rsi+32]
    movq        xmm5,           [rsi+48]

    punpcklbw   xmm4,           xmm7
    punpcklbw   xmm5,           xmm7

    paddw       xmm2,           xmm4
    paddw       xmm3,           xmm5

.finish:

    ; pack up before storing (words -> bytes with unsigned saturation)
    packuswb    xmm0,           xmm7
    packuswb    xmm1,           xmm7
    packuswb    xmm2,           xmm7
    packuswb    xmm3,           xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx,            dword ptr arg(4) ; dst_stride

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,            [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; Constant tables, 16-byte aligned because they are used directly as SSE
; memory operands.
SECTION_RODATA
align 16
fours:                          ; rounding term added before the >> 3
    times 8 dw 0x0004
align 16
x_s1sqr2:                       ; sin(pi/8) * sqrt(2), scaled by 65536
    times 8 dw 0x8A8C
align 16
x_c1sqr2less1:                  ; cos(pi/8) * sqrt(2) - 1, scaled by 65536
    times 8 dw 0x4E7B