;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; STACK_FRAME_CREATE
;   Prologue shared by both transforms in this file.  It binds the symbolic
;   names `input`, `output` and `pitch` to the registers that hold the three
;   arguments for the ABI being assembled:
;
;     32-bit       : rsi / rdi / rax.  A frame is built, rsi and rdi are
;                    saved, and the arguments are loaded from arg(0..2);
;                    pitch is sign-extended with movsxd.
;     Win64 (yasm) : rcx / rdx / r8, plus SAVE_XMM 7, u (the functions below
;                    use xmm0-xmm6).
;     other 64-bit : rdi / rsi / rdx, no stack frame.
;
;   `pitch` is declared as a C `int`.  On the 64-bit ABIs the upper half of
;   the register carrying a 32-bit argument is not specified, and the code
;   below uses `pitch` as a full-width index, so it is sign-extended here in
;   the same way the 32-bit path already does.
;
;   GET_GOT, SAVE_XMM and arg() come from x86_abi_support.asm; their exact
;   expansion is not visible in this file.
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u
    movsxd      r8, r8d                 ; int pitch: upper 32 bits unspecified
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      rdx, edx                ; int pitch: upper 32 bits unspecified
  %endif
%endif
%endmacro

; STACK_FRAME_DESTROY
;   Epilogue matching STACK_FRAME_CREATE.  Clears the `input` / `output` /
;   `pitch` aliases, undoes the prologue in reverse order for the active ABI
;   and returns to the caller.
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Forward 4x4 transform of one block of 16-bit samples.
;
;   input  - first of four rows of four words; consecutive rows are `pitch`
;            bytes apart (pitch is added to the pointer unscaled).  Rows are
;            read with movq, so no alignment is required.
;   output - receives 16 words through two movdqa stores, so it must be
;            16-byte aligned.
;
; First pass, applied along each row (ip = one row):
;   a1 = (ip[0] + ip[3]) << 3        b1 = (ip[1] + ip[2]) << 3
;   c1 = (ip[1] - ip[2]) << 3        d1 = (ip[0] - ip[3]) << 3
;   op[0] = a1 + b1                  op[2] = a1 - b1
;   op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
;   op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12
;
; Second pass, applied down each column (ip = first-pass result):
;   a1 = ip[0] + ip[12]              b1 = ip[4] + ip[8]
;   c1 = ip[4] - ip[8]               d1 = ip[0] - ip[12]
;   op[0]  = (a1 + b1 + 7) >> 4      op[8] = (a1 - b1 + 7) >> 4
;   op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0)
;   op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16
;
; Sums and products are formed as 32-bit values with pmaddwd and narrowed
; back to words with packssdw (signed saturation).
;
; Lane comments list words from the highest lane down to lane 0; "rc" means
; row r, column c.
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,           [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    ; Gather columns 0/1 of every row in xmm0 and columns 3/2 (swapped) in
    ; xmm1, so one paddw / psubw yields a1,b1 / d1,c1 for all four rows.
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; First-pass results, still "rc" = row r, coefficient c:
    ;   xmm0 = 32 22 12 02 30 20 10 00
    ;   xmm3 = 33 23 13 03 31 21 11 01

    ; Transpose back so that each 64-bit half holds one complete row.
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;31 21 11 01 30 20 10 00
    punpckhqdq  xmm2, xmm3                      ;33 23 13 03 32 22 12 02

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh                ;swap halves: row 2 | row 3
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff where the word == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper (c1) half,
                                                ;and keep bit 0 of lower:
                                                ;1 where d1 != 0, else 0

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Forward transform of two horizontally adjacent 4x4 blocks in one go.  The
; per-block arithmetic is the same as documented for
; vp8_short_fdct4x4_sse2, except that a1 +/- b1 is computed with 16-bit
; paddw / psubw / psraw here instead of going through pmaddwd.
;
;   input  - first of four rows of eight words; consecutive rows are `pitch`
;            bytes apart.  Rows are read with movdqa, so every row must be
;            16-byte aligned.
;   output - 64 bytes, 16-byte aligned.  Bytes 0-31 receive the 16
;            coefficients of the left block (columns 0-3), bytes 32-63 those
;            of the right block (columns 4-7).
;
; Lane comments in this function list words from lane 0 upwards; "rc" means
; row r, column c.
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; Each register now holds one column of both blocks:
        ; xmm0 column 0 (and 4)
        ; xmm1 column 1 (and 5)
        ; xmm2 column 2 (and 6)
        ; xmm3 column 3 (and 7)

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,        3
        psllw       xmm4,        3

        psllw       xmm0,        3
        psllw       xmm1,        3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1 (d1 in the low word of each pair)
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; first stage done
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; Each register now holds one full row:
        ; xmm0 row 0
        ; xmm4 row 1
        ; xmm1 row 2
        ; xmm3 row 3

        ; second stage
        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1        ; keep row 2 for c1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; 0xffff where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; 1 where d1 != 0,
                                                                    ; else 0

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1 (d1 in the low word of each pair)
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; Regroup from "one row of both blocks per register" into
        ; "two rows of one block per register" for the stores.
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1        ; left block:  op[0..3]  op[4..7]
        punpckhqdq  xmm4,       xmm1        ; right block: op[0..3]  op[4..7]

        punpcklqdq  xmm2,       xmm3        ; left block:  op[8..11] op[12..15]
        punpckhqdq  xmm5,       xmm3        ; right block: op[8..11] op[12..15]

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

; Constant tables, each 16-byte aligned so they can be used directly as
; XMMWORD memory operands.
SECTION_RODATA
; pmaddwd multipliers for (c1, d1) word pairs with d1 in the low word:
; d1 * 5352 + c1 * 2217
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
; pmaddwd multipliers for the same pairs: d1 * 2217 - c1 * 5352
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; pmaddwd multipliers producing the sum of each word pair
align 16
_mult_add:
    times 8 dw 1
; 4x4 (d1 != 0) mask: bit 0 of the four low words only
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
; 8x4 (d1 != 0) mask: bit 0 of all eight words
align 16
_cmp_mask8x4:
    times 8 dw 1
; pmaddwd multipliers producing low word minus high word of each pair
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; second-pass rounding term for op[0] / op[8], dword form (4x4)
align 16
_7:
    times 4 dd 7
; second-pass rounding term for op[0] / op[8], word form (8x4)
align 16
_7w:
    times 8 dw 7
; first-pass rounding terms, used before >> 12
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
; second-pass rounding terms, used before >> 16
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000