;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

;-----------------------------------------------------------------------
; LFH_FILTER_AND_HEV_MASK %1
;
; Computes the loop-filter mask and the high-edge-variance (hev) mask for
; 16 pixel columns across a horizontal edge.
;
; %1 != 0 : 16 columns of one plane, read with aligned 16-byte loads.
;           In:  rsi -> q0 row, rdi -> q1 row, rax = +pitch.
;           rax is negated inside the macro and stays negated on exit.
; %1 == 0 : 8 columns addressed through rsi (low half of each register)
;           and 8 columns addressed through rdi (high half).
;           In:  rax = -pitch, rcx = +pitch, rsi/rdi -> q1 row.
;           rsi and rdi are moved by 4*rax inside the macro.
;           q2, q1, p2, p1 are spilled to [rsp], [rsp+16], [rsp+32],
;           [rsp+48]; the invoking function must have reserved that
;           space, and its q1 %define must name [rsp+16].
;
; Common inputs:
;   xmm7           = limit vector (loaded by the caller)
;   arg(2), arg(4) = pointers to flimit and thresh (16 bytes each)
;   t0, t1         = 16-byte scratch slots %defined by the caller
;
; Outputs:
;   xmm1 = filter mask (0xFF in lanes where every tested difference is
;          within its limit, 0 elsewhere)
;   xmm4 = hev mask
;   xmm0 = q0, xmm6 = p0 (unsigned, unmodified)
;   t0   = abs(q1 - q0), t1 = abs(p1 - p0)
;
; Clobbers: xmm2, xmm3, xmm5, xmm7 (zero on exit), rdx.
;-----------------------------------------------------------------------
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]
        movhps      xmm1,                   [rdi + rcx]
        movhps      xmm4,                   [rdi]
        movhps      xmm5,                   [rdi + rax]

        ; step both plane pointers four rows in the rax direction so the
        ; p-side rows can be addressed with the same rax/rcx offsets
        lea         rsi,                    [rsi + rax*4]
        lea         rdi,                    [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
%endif

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        ; abs(a - b) on unsigned bytes = (a -sat b) | (b -sat a):
        ; one of the two saturating differences is always zero
        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4              ; xmm1 = running max of abs differences

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      t0,                     xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        ; here the two one-sided differences are folded straight into the
        ; running max, which gives the same result as por + pmaxub
        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   q1                ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get flimit

        movdqa      t1,                     xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7              ; non-zero where max abs difference > limit
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3,                   xmm0              ; q0
        ; psrlw below shifts whole words, so bit 0 of each byte is cleared
        ; first to keep it from sliding into the neighbouring byte
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx,                    arg(4)            ; hev get thresh

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)

        movdqa      xmm3,                   t1                ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        psubusb     xmm4,                   xmm2              ; hev

        psubusb     xmm3,                   xmm2              ; hev
        por         xmm1,                   xmm5              ; combine both break-out conditions

        pxor        xmm7,                   xmm7
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        ; NOTE: the compare is against xmm5, not a zero register. xmm5 is
        ; zero in every lane where the filter mask (xmm1) ends up set, so
        ; in those lanes this is a compare against zero.
        pcmpeqb     xmm4,                   xmm5              ; hev
        pcmpeqb     xmm3,                   xmm3              ; hev (all ones)

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
        pxor        xmm4,                   xmm3              ; hev (invert the compare result)
%endmacro

;-----------------------------------------------------------------------
; B_FILTER %1
;
; Applies the filter to p1, p0, q0, q1 using the masks produced by
; LFH_FILTER_AND_HEV_MASK.
;
; In:  xmm1 = filter mask, xmm4 = hev mask
;      %1 == 0 or 1: xmm6 = p0, xmm0 = q0 (unsigned)
;
; %1 == 0 : p1/q1 come from the caller's p1/q1 stack %defines. rsi/rdi
;           are advanced by 2*rcx and the results are written as 8-byte
;           halves (low half through rsi, high half through rdi).
; %1 == 1 : p1/q1 come from [rsi+2*rax] and [rdi]; results are written
;           back with aligned 16-byte stores.
; %1 == 2 : p1, p0, q0, q1 are loaded from the buffer named by `srct`
;           (rdx is set to its address). This macro does no write-back
;           for this variant; results stay in registers.
;
; Out: xmm1 = p1, xmm6 = p0, xmm3 = q0, xmm7 = q1 (unsigned)
; Clobbers: xmm0, xmm2, xmm4, xmm5 (and rdx when %1 == 2).
;-----------------------------------------------------------------------
%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2,                   p1                ; p1
        movdqa      xmm7,                   q1                ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx]             ; p1
        movdqa      xmm7,                   [rdx+48]          ; q1
        movdqa      xmm6,                   [rdx+16]          ; p0
        movdqa      xmm0,                   [rdx+32]          ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0

        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1

        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; Per-byte arithmetic shift right by 3: unpack each byte into the
        ; HIGH byte of a word (the low byte is a don't-care), then psraw
        ; by 8 + 3 = 11 leaves the sign-extended byte >> 3 in the word.
        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        ; add `ones` then shift right by 1 on the word lanes
        ; (presumably `ones` holds 1 in every word -- defined elsewhere)
        paddsw      xmm5,                   [GLOBAL(ones)]

        paddsw      xmm1,                   [GLOBAL(ones)]
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6,                   xmm2              ; p0+= p0 add
        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1,                   p1                ; p1
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rdx]             ; p1
%endif
        ; xmm4 = ~hev & xmm5: the p1/q1 adjustment is zero where hev is set
        pandn       xmm4,                   xmm5              ; high edge variance additive
        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi,                    [rsi + rcx*2]
        lea         rdi,                    [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6              ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
        movhps      MMWORD PTR [rdi + rcx*2],xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back
        movdqa      [rsi+2*rax],            xmm1              ; write back
        movdqa      [rsi],                  xmm3              ; write back
        movdqa      [rdi],                  xmm7              ; write back
%endif

%endmacro


;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
;
; Filters 16 pixel columns across the horizontal edge at src_ptr.
; Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK.
; `count` (arg(5)) is not read by this routine.
;
; Every pixel row and the flimit/limit/thresh vectors are accessed with
; movdqa, so they must be 16-byte aligned.
;
; Register roles:
;   rsi = src_ptr (q0 row), rdi = src_ptr + pitch (q1 row)
;   rax = pitch; negated inside LFH_FILTER_AND_HEV_MASK
;   xmm7 = limit on entry to the mask macro
; Stack: 32 bytes below the 16-byte-aligned rsp (t0, t1).
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32
    pop rsp                 ; restore the rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,        ; u plane
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v               ; v plane
;)
;
; Filters 8 columns of the u plane and 8 columns of the v plane in one
; pass: u pixels occupy the low half of each xmm register (movlps / movq)
; and v pixels the high half (movhps).
;
; arg(5) is loaded into rdi and dereferenced as the v-plane pointer; it
; is not a count.
;
; Register roles:
;   rsi = u + pitch, rdi = v + pitch on entry to the mask macro
;   rax = -pitch, rcx = +pitch
;   xmm7 = limit on entry to the mask macro
; Stack: 96 bytes below the 16-byte-aligned rsp. The mask macro stores
; q2/q1/p2/p1 at the literal offsets [rsp], [rsp+16], [rsp+32],
; [rsp+48], so the first four %defines below must keep those offsets.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96       ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit
        movdqa      xmm7,                   XMMWORD PTR [rdx]

        lea         rsi,                    [rsi + rcx]
        lea         rdi,                    [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96
    pop rsp                   ; restore the rsp saved by ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2,                   p1              ; p1
        movdqa      xmm7,                   q1              ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]     ; p1
        movdqa      xmm7,                   [rdi]           ; q1

        mov         rcx,                    rax
        neg         rcx
%elif %1 == 2
        lea         rdx,                    srct

        movdqa      xmm2,                   [rdx+32]        ; p1
        movdqa      xmm7,                   [rdx+80]        ; q1
        movdqa      xmm6,                   [rdx+48]        ; p0
        movdqa      xmm0,                   [rdx+64]        ; q0
%endif

        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
405538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 406538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values 407538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 409538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm2, xmm7 ; p1 - q1 410538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm3, xmm0 ; q0 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 412538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm0, xmm6 ; q0 - p0 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 414538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) 41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 416538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm2, xmm0 ; 2 * (q0 - p0) 41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 418538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) 41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 420538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand xmm1, xmm2 ; mask filter values we don't care about 42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 422538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm2, xmm1 ; vp8_filter 42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 424538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand xmm2, xmm4 ; Filter2 = vp8_filter & hev 425538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm0, xmm0 42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 427538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pandn xmm4, xmm1 ; vp8_filter&=~hev 42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm1, xmm1 
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 430538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpcklbw xmm0, xmm4 ; Filter 2 (hi) 431538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm5, xmm2 43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 433538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhbw xmm1, xmm4 ; Filter 2 (lo) 434538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) 43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 436538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm1, [GLOBAL(s9)] ; Filter 2 (lo) * 9 43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 438538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9 43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 440538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhbw xmm7, xmm5 ; axbxcxdx 441538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) 44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 443538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpcklbw xmm5, xmm5 ; exfxgxhx 444538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm7, 11 ; sign extended shift right by 3 44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 446538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm5, 11 ; sign extended shift right by 3 447538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhbw xmm4, xmm2 ; axbxcxdx 44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 449538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpcklbw xmm2, xmm2 ; exfxgxhx 450538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm4, 11 ; sign extended shift right by 3 451f71323e297a928af368937089d3ed71239786f86Andreas Huber 452538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber packsswb xmm5, xmm7 ; Filter2 >>=3; 
453538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm2, 11 ; sign extended shift right by 3 45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 455538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber packsswb xmm2, xmm4 ; Filter1 >>=3; 456538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm7, xmm1 45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 458538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 459538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm4, xmm1 46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 461538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 462538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm5, xmm0 46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 464538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm2, xmm5 465538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63 46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 467538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63 468538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm5, xmm5 ; Filter 2 (hi) * 18 46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 470538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm7, xmm7 ; Filter 2 (lo) * 18 471538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 473538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 474538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 476538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 
477538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 479538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 480538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 482538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) 483538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 485538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 486538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) 487538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 488538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 489f71323e297a928af368937089d3ed71239786f86Andreas Huber 490538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) 491538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 492538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) 493538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) 494538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 495538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0 496538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm5, q2 ; q2 497538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm1, q1 ; q1 498538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm4, p1 ; p1 499538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 
movdqa xmm7, p2 ; p2 500538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 501538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1 502538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2 503538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm1, XMMWORD PTR [rdi] ; q1 504538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1 505538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2 506538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2 507538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm5, XMMWORD PTR [rdx+96] ; q2 508538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm1, XMMWORD PTR [rdx+80] ; q1 509538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm4, XMMWORD PTR [rdx+32] ; p1 510538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm7, XMMWORD PTR [rdx+16] ; p2 511f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif 51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 513538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80 514538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm6, [GLOBAL(t80)] ; *oq0 = sp^0x80 51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 516538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm1, [GLOBAL(t80)] 517538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm4, [GLOBAL(t80)] 518f71323e297a928af368937089d3ed71239786f86Andreas Huber 519538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) 520538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) 521f71323e297a928af368937089d3ed71239786f86Andreas Huber 522538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80; 
523538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80; 524f71323e297a928af368937089d3ed71239786f86Andreas Huber 525538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm7, [GLOBAL(t80)] 526538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm5, [GLOBAL(t80)] 527f71323e297a928af368937089d3ed71239786f86Andreas Huber 528538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) 529538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) 530f71323e297a928af368937089d3ed71239786f86Andreas Huber 531538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80; 532538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80; 533f71323e297a928af368937089d3ed71239786f86Andreas Huber 534538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0 535538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber lea rsi, [rsi+rcx*2] 536538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber lea rdi, [rdi+rcx*2] 537f71323e297a928af368937089d3ed71239786f86Andreas Huber 538538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq MMWORD PTR [rsi], xmm6 ; p0 539538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi], xmm6 540538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq MMWORD PTR [rsi + rcx], xmm3 ; q0 541538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi + rcx], xmm3 542f71323e297a928af368937089d3ed71239786f86Andreas Huber 543538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1 544538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+rcx*2], xmm1 545f71323e297a928af368937089d3ed71239786f86Andreas Huber 546538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq MMWORD PTR [rsi + rax], xmm4 ; p1 
547538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi + rax], xmm4 548538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 549538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq MMWORD PTR [rsi+rax*2], xmm7 ; p2 550538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+rax*2], xmm7 551f71323e297a928af368937089d3ed71239786f86Andreas Huber 552f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi + rcx] 553f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rdi + rcx] 554538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2 555538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+rcx*2], xmm5 556538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1 557538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2 558538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdi], xmm1 ; q1 559538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rsi], xmm3 ; q0 560538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0 561538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1 562538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2 563538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2 564538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdx+80], xmm1 ; q1 565538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdx+64], xmm3 ; q0 566538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdx+48], xmm6 ; p0 567538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa XMMWORD PTR [rdx+32], xmm4 ; p1 568f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif 569538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 
570f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 571f71323e297a928af368937089d3ed71239786f86Andreas Huber 572f71323e297a928af368937089d3ed71239786f86Andreas Huber 573f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_horizontal_edge_sse2 574f71323e297a928af368937089d3ed71239786f86Andreas Huber;( 575f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *src_ptr, 576f71323e297a928af368937089d3ed71239786f86Andreas Huber; int src_pixel_step, 577f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *flimit, 578f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *limit, 579f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *thresh, 580f71323e297a928af368937089d3ed71239786f86Andreas Huber; int count 581f71323e297a928af368937089d3ed71239786f86Andreas Huber;) 582f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_horizontal_edge_sse2) 583f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_horizontal_edge_sse2): 584f71323e297a928af368937089d3ed71239786f86Andreas Huber push rbp 585f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rbp, rsp 586f71323e297a928af368937089d3ed71239786f86Andreas Huber SHADOW_ARGS_TO_STACK 6 587f71323e297a928af368937089d3ed71239786f86Andreas Huber SAVE_XMM 588f71323e297a928af368937089d3ed71239786f86Andreas Huber GET_GOT rbx 589f71323e297a928af368937089d3ed71239786f86Andreas Huber push rsi 590f71323e297a928af368937089d3ed71239786f86Andreas Huber push rdi 591f71323e297a928af368937089d3ed71239786f86Andreas Huber ; end prolog 592f71323e297a928af368937089d3ed71239786f86Andreas Huber 593f71323e297a928af368937089d3ed71239786f86Andreas Huber ALIGN_STACK 16, rax 594f71323e297a928af368937089d3ed71239786f86Andreas Huber sub rsp, 32 ; reserve 32 bytes 595f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 
596f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 597f71323e297a928af368937089d3ed71239786f86Andreas Huber 598f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ;src_ptr 599f71323e297a928af368937089d3ed71239786f86Andreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step 600f71323e297a928af368937089d3ed71239786f86Andreas Huber 601f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rdx, arg(3) ;limit 602f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, XMMWORD PTR [rdx] 603f71323e297a928af368937089d3ed71239786f86Andreas Huber 604f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing 605f71323e297a928af368937089d3ed71239786f86Andreas Huber 606538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; calculate breakout conditions and high edge variance 607538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber LFH_FILTER_AND_HEV_MASK 1 608538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; filter and write back the results 609538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber MB_FILTER_AND_WRITEBACK 1 610f71323e297a928af368937089d3ed71239786f86Andreas Huber 611f71323e297a928af368937089d3ed71239786f86Andreas Huber add rsp, 32 612f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsp 613f71323e297a928af368937089d3ed71239786f86Andreas Huber ; begin epilog 614f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rdi 615f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsi 616f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_GOT 617f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_XMM 618f71323e297a928af368937089d3ed71239786f86Andreas Huber UNSHADOW_ARGS 619f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rbp 620f71323e297a928af368937089d3ed71239786f86Andreas Huber ret 621f71323e297a928af368937089d3ed71239786f86Andreas Huber 

;-----------------------------------------------------------------------
;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
;
; Filters one horizontal macroblock edge of both chroma planes at once:
; 8 pixels of u occupy the low half of each vector and 8 pixels of v the
; high half. Arguments are read through arg(n) after
; SHADOW_ARGS_TO_STACK 6. This body reads arg(0), arg(1), arg(3) and
; arg(5) directly.
; NOTE(review): flimit and thresh are presumably consumed inside
; LFH_FILTER_AND_HEV_MASK - confirm.
;
; Stack: 96 bytes of 16-byte-aligned scratch (q2, q1, p2, p1, t0, t1).
; *limit is loaded with movdqa and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                             ; reserve 96 bytes
    %define q2  [rsp + 0]   ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]  ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]  ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]  ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]  ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]  ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)                     ; u
        mov         rdi, arg(5)                     ; v
        movsxd      rax, dword ptr arg(1)           ; src_pixel_step
        mov         rcx, rax                        ; rcx = +pitch
        neg         rax                             ; negate pitch to deal with above border

        mov         rdx, arg(3)                     ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]                ; one row below u
        lea         rdi, [rdi + rcx]                ; one row below v

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add         rsp, 96                             ; release the scratch slots
    pop         rsp                                 ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; TRANSPOSE_16X8 second_half_same_plane, keep_in_registers
;
; Reads 16 rows of 8 bytes and transposes them into eight 16-byte
; columns. In the comments "rc" is byte c of row r, listed from most to
; least significant.
;
; Rows 0-7 are read via rsi (even rows) and rdi (odd rows) at offsets
; 0, 2*rax, 4*rax and 2*rcx.
; NOTE(review): the row labels imply rcx = 3*rax on entry - confirm in
; the caller.
;
; %1 != 0: rows 8-15 are the next eight rows of the same plane (rsi and
;          rdi advance by rax*8) and rdx is pointed at `srct`.
; %1 == 0: rows 8-15 start at arg(5) - 4 (v_ptr), with rdi = rsi + rax.
;          rdx is NOT set here and must already address the output
;          buffer.
; %2 != 0: columns 2-5 are stored at [rdx+0], +16, +32, +48. Columns 0,
;          1, 6, 7 are returned in xmm2, xmm1, xmm6, xmm7 only.
; %2 == 0: all eight columns are stored at [rdx + 16*k], k = 0..7.
;
; Clobb: xmm0-xmm7, rsi, rdi, rdx (when %1 != 0), stack slot t0
;-----------------------------------------------------------------------
%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                      ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]                ; advance to row 8
%else
        mov         rsi, arg(5)                     ; v_ptr
%endif

        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi, [rdi+rax*8]                ; advance to row 9
%else
        lea         rsi, [rsi - 4]
%endif

        punpcklwd   xmm3, xmm0                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx, srct
%else
        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2, xmm3                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                        ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6                      ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1                      ;
        punpckhwd   xmm6, xmm5                      ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1                      ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                      ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                      ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                      ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                      ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                      ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                     ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4                  ; save 4
        movdqa      [rdx+48], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get

        movdqa      xmm2, xmm1                      ;
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112], xmm7                 ; save 7

        movdqa      [rdx+96], xmm6                  ; save 6

        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2                  ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4                  ; save 4
        movdqa      [rdx+80], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1                  ; save 1

        movdqa      [rdx], xmm2                     ; save 0
%endif
%endmacro

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6                      ; q2
        psubusb     xmm0, xmm7                      ; q2-q3

        psubusb     xmm7, xmm6                      ; q3-q2
        movdqa      xmm4, xmm5                      ; q1

        por         xmm7, xmm0                      ; abs (q3-q2)
        psubusb     xmm4, xmm6                      ; q1-q2
Huber 848f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm0, xmm1 849538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm6, xmm5 ; q2-q1 85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 851538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber por xmm6, xmm4 ; abs (q2-q1) 852f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm0, xmm2 ; p2 - p3; 85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 854538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm2, xmm1 ; p3 - p2; 855f71323e297a928af368937089d3ed71239786f86Andreas Huber por xmm0, xmm2 ; abs(p2-p3) 856f71323e297a928af368937089d3ed71239786f86Andreas Huber%if %1 857f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, [rdx] ; p1 858f71323e297a928af368937089d3ed71239786f86Andreas Huber%else 859f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, [rdx+32] ; p1 860f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif 861f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm2 ; p1 862538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmaxub xmm0, xmm7 86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 864f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm5, xmm1 ; p1-p2 865f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm1, xmm2 ; p2-p1 86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 867538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm7, xmm3 ; p0 868538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm7, xmm2 ; p0-p1 86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 870538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber por xmm1, xmm5 ; abs(p2-p1) 871538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmaxub xmm0, xmm6 87290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 873538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmaxub xmm0, xmm1 874f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, 
xmm2 ; p1 87590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 876f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm2, xmm3 ; p1-p0 877538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber lea rdx, srct 878538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 879f71323e297a928af368937089d3ed71239786f86Andreas Huber por xmm2, xmm7 ; abs(p1-p0) 88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 881f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa t0, xmm2 ; save abs(p1-p0) 88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 883538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmaxub xmm0, xmm2 884538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 885f71323e297a928af368937089d3ed71239786f86Andreas Huber%if %1 886f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, [rdx+32] ; q0 887f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, [rdx+48] ; q1 888f71323e297a928af368937089d3ed71239786f86Andreas Huber%else 889f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, [rdx+64] ; q0 890f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, [rdx+80] ; q1 891f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif 892538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber mov rdx, arg(3) ; limit 893538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 894f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm5 ; q0 895f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, xmm7 ; q1 896f71323e297a928af368937089d3ed71239786f86Andreas Huber 897538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm5, xmm7 ; q0-q1 898f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm7, xmm6 ; q1-q0 899538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 900f71323e297a928af368937089d3ed71239786f86Andreas Huber por xmm7, xmm5 ; abs(q1-q0) 901f71323e297a928af368937089d3ed71239786f86Andreas Huber 902f71323e297a928af368937089d3ed71239786f86Andreas 
Huber movdqa t1, xmm7 ; save abs(q1-q0) 903f71323e297a928af368937089d3ed71239786f86Andreas Huber 904538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm4, XMMWORD PTR [rdx]; limit 905538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 906538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmaxub xmm0, xmm7 907538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber mov rdx, arg(2) ; flimit 908f71323e297a928af368937089d3ed71239786f86Andreas Huber 909538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm0, xmm4 910f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm2 ; q1 911538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 912f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm5, xmm1 ; q1-=p1 913f71323e297a928af368937089d3ed71239786f86Andreas Huber psubusb xmm1, xmm2 ; p1-=q1 914f71323e297a928af368937089d3ed71239786f86Andreas Huber 915538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber por xmm5, xmm1 ; abs(p1-q1) 916f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, xmm3 ; p0 91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 918538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero 919538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm1, xmm6 ; p0-q0 92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 921538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psrlw xmm5, 1 ; abs(p1-q1)/2 922538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm6, xmm3 ; q0-p0 92390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 924538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm2, XMMWORD PTR [rdx]; flimit 92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 926538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber mov rdx, arg(4) ; get thresh 92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 928538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber por xmm1, xmm6 ; abs(q0-p0) 
929538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddb xmm2, xmm2 ; flimit*2 (less than 255) 93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 931538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm6, t0 ; get abs (q1 - q0) 93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 933538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddusb xmm1, xmm1 ; abs(q0-p0)*2 93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 935538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm3, t1 ; get abs (p1 - p0) 93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 937538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm7, XMMWORD PTR [rdx] 93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 939538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 940538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh 94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 942538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255) 943538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh 94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 945538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 946538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 948538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber por xmm1, xmm0 ; mask 949538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pcmpeqb xmm6, xmm0 95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 951538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm0, xmm0 952538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pcmpeqb xmm4, xmm4 
95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 954538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pcmpeqb xmm1, xmm0 955538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm4, xmm6 956f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 958f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro BV_TRANSPOSE 0 959f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 960f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 961f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 962f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 963f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 964f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 966f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 967f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 969f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 970538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 971f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 973f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 
974f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 976f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 977f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 979f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 980538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 981f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 982f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 983f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 984f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 985f71323e297a928af368937089d3ed71239786f86Andreas Huber ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 986f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 988f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro BV_WRITEBACK 2 989f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rsi+2], %1 990f71323e297a928af368937089d3ed71239786f86Andreas Huber psrldq %1, 4 99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 992f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rdi+2], %1 993f71323e297a928af368937089d3ed71239786f86Andreas Huber psrldq %1, 4 99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 995f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rsi+2*rax+2], %1 996f71323e297a928af368937089d3ed71239786f86Andreas 
Huber psrldq %1, 4 99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 998f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rdi+2*rax+2], %1 99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1000f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rsi+4*rax+2], %2 1001f71323e297a928af368937089d3ed71239786f86Andreas Huber psrldq %2, 4 100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1003f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rdi+4*rax+2], %2 1004f71323e297a928af368937089d3ed71239786f86Andreas Huber psrldq %2, 4 100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1006f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rsi+2*rcx+2], %2 1007f71323e297a928af368937089d3ed71239786f86Andreas Huber psrldq %2, 4 100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1009f71323e297a928af368937089d3ed71239786f86Andreas Huber movd [rdi+2*rcx+2], %2 1010f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1013f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_loop_filter_vertical_edge_sse2 1014f71323e297a928af368937089d3ed71239786f86Andreas Huber;( 1015f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *src_ptr, 1016f71323e297a928af368937089d3ed71239786f86Andreas Huber; int src_pixel_step, 1017f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *flimit, 1018f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *limit, 1019f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *thresh, 1020f71323e297a928af368937089d3ed71239786f86Andreas Huber; int count 1021f71323e297a928af368937089d3ed71239786f86Andreas Huber;) 1022f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_loop_filter_vertical_edge_sse2) 1023f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_loop_filter_vertical_edge_sse2): 
1024f71323e297a928af368937089d3ed71239786f86Andreas Huber push rbp 1025f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rbp, rsp 1026f71323e297a928af368937089d3ed71239786f86Andreas Huber SHADOW_ARGS_TO_STACK 6 1027f71323e297a928af368937089d3ed71239786f86Andreas Huber SAVE_XMM 1028f71323e297a928af368937089d3ed71239786f86Andreas Huber GET_GOT rbx 1029f71323e297a928af368937089d3ed71239786f86Andreas Huber push rsi 1030f71323e297a928af368937089d3ed71239786f86Andreas Huber push rdi 1031f71323e297a928af368937089d3ed71239786f86Andreas Huber ; end prolog 103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1033f71323e297a928af368937089d3ed71239786f86Andreas Huber ALIGN_STACK 16, rax 1034f71323e297a928af368937089d3ed71239786f86Andreas Huber sub rsp, 96 ; reserve 96 bytes 1035f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1036f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1037f71323e297a928af368937089d3ed71239786f86Andreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1039f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ; src_ptr 1040f71323e297a928af368937089d3ed71239786f86Andreas Huber movsxd rax, dword ptr arg(1) ; src_pixel_step 104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1042f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1043f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1044f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rcx, [rax*2+rax] 104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1046f71323e297a928af368937089d3ed71239786f86Andreas Huber ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
1047538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber TRANSPOSE_16X8 1, 1 104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1049538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; calculate filter mask and high edge variance 1050538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber LFV_FILTER_MASK_HEV_MASK 1 105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1052f71323e297a928af368937089d3ed71239786f86Andreas Huber ; start work on filters 1053538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber B_FILTER 2 105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1055f71323e297a928af368937089d3ed71239786f86Andreas Huber ; tranpose and write back - only work on q1, q0, p0, p1 1056f71323e297a928af368937089d3ed71239786f86Andreas Huber BV_TRANSPOSE 1057f71323e297a928af368937089d3ed71239786f86Andreas Huber ; store 16-line result 105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1059f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdx, [rax] 1060f71323e297a928af368937089d3ed71239786f86Andreas Huber neg rdx 106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1062f71323e297a928af368937089d3ed71239786f86Andreas Huber BV_WRITEBACK xmm1, xmm5 106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1064f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi+rdx*8] 1065f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rdi+rdx*8] 1066f71323e297a928af368937089d3ed71239786f86Andreas Huber BV_WRITEBACK xmm2, xmm6 106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1068f71323e297a928af368937089d3ed71239786f86Andreas Huber add rsp, 96 1069f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsp 1070f71323e297a928af368937089d3ed71239786f86Andreas Huber ; begin epilog 1071f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rdi 1072f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsi 1073f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_GOT 
1074f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_XMM 1075f71323e297a928af368937089d3ed71239786f86Andreas Huber UNSHADOW_ARGS 1076f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rbp 1077f71323e297a928af368937089d3ed71239786f86Andreas Huber ret 107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1080f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_loop_filter_vertical_edge_uv_sse2 1081f71323e297a928af368937089d3ed71239786f86Andreas Huber;( 1082f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *u, 1083f71323e297a928af368937089d3ed71239786f86Andreas Huber; int src_pixel_step, 1084f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *flimit, 1085f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *limit, 1086f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *thresh, 1087f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *v 1088f71323e297a928af368937089d3ed71239786f86Andreas Huber;) 1089f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_loop_filter_vertical_edge_uv_sse2) 1090f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_loop_filter_vertical_edge_uv_sse2): 1091f71323e297a928af368937089d3ed71239786f86Andreas Huber push rbp 1092f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rbp, rsp 1093f71323e297a928af368937089d3ed71239786f86Andreas Huber SHADOW_ARGS_TO_STACK 6 1094f71323e297a928af368937089d3ed71239786f86Andreas Huber SAVE_XMM 1095f71323e297a928af368937089d3ed71239786f86Andreas Huber GET_GOT rbx 1096f71323e297a928af368937089d3ed71239786f86Andreas Huber push rsi 1097f71323e297a928af368937089d3ed71239786f86Andreas Huber push rdi 1098f71323e297a928af368937089d3ed71239786f86Andreas Huber ; end prolog 109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1100f71323e297a928af368937089d3ed71239786f86Andreas Huber ALIGN_STACK 16, rax 
1101f71323e297a928af368937089d3ed71239786f86Andreas Huber sub rsp, 96 ; reserve 96 bytes 1102f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1103f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1104f71323e297a928af368937089d3ed71239786f86Andreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1106f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ; u_ptr 1107f71323e297a928af368937089d3ed71239786f86Andreas Huber movsxd rax, dword ptr arg(1) ; src_pixel_step 110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1109f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1110f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1111f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rcx, [rax+2*rax] 111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1113f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdx, srct 111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1115538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
1116538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber TRANSPOSE_16X8 0, 1 1117538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 1118538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; calculate filter mask and high edge variance 1119538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber LFV_FILTER_MASK_HEV_MASK 1 112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1121f71323e297a928af368937089d3ed71239786f86Andreas Huber ; start work on filters 1122538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber B_FILTER 2 112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1124f71323e297a928af368937089d3ed71239786f86Andreas Huber ; tranpose and write back - only work on q1, q0, p0, p1 1125f71323e297a928af368937089d3ed71239786f86Andreas Huber BV_TRANSPOSE 112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1127f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1129f71323e297a928af368937089d3ed71239786f86Andreas Huber ; store 16-line result 1130f71323e297a928af368937089d3ed71239786f86Andreas Huber BV_WRITEBACK xmm1, xmm5 113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1132f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ; u_ptr 1133f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1134f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1135f71323e297a928af368937089d3ed71239786f86Andreas Huber BV_WRITEBACK xmm2, xmm6 113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1137f71323e297a928af368937089d3ed71239786f86Andreas Huber add rsp, 96 1138f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsp 1139f71323e297a928af368937089d3ed71239786f86Andreas Huber ; begin epilog 1140f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rdi 1141f71323e297a928af368937089d3ed71239786f86Andreas Huber 
pop rsi 1142f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_GOT 1143f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_XMM 1144f71323e297a928af368937089d3ed71239786f86Andreas Huber UNSHADOW_ARGS 1145f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rbp 1146f71323e297a928af368937089d3ed71239786f86Andreas Huber ret 114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1148f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro MBV_TRANSPOSE 0 1149f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 1150f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1152538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 1153538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 1154f71323e297a928af368937089d3ed71239786f86Andreas Huber 1155f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1156f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1157f71323e297a928af368937089d3ed71239786f86Andreas Huber 1158f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 1159f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 1160f71323e297a928af368937089d3ed71239786f86Andreas Huber 1161538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 1162f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 
01 00 1163f71323e297a928af368937089d3ed71239786f86Andreas Huber 1164538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 1165f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 1166f71323e297a928af368937089d3ed71239786f86Andreas Huber 1167f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 1168f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 1169f71323e297a928af368937089d3ed71239786f86Andreas Huber 1170f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 1171f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 1172f71323e297a928af368937089d3ed71239786f86Andreas Huber 1173538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 1174f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 1175f71323e297a928af368937089d3ed71239786f86Andreas Huber 1176f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 1177f71323e297a928af368937089d3ed71239786f86Andreas Huber punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 1178f71323e297a928af368937089d3ed71239786f86Andreas Huber 1179f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 1180f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 1181f71323e297a928af368937089d3ed71239786f86Andreas Huber 
1182f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 1183f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 1184f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 1185f71323e297a928af368937089d3ed71239786f86Andreas Huber 1186f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro MBV_WRITEBACK_1 0 1187538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi], xmm0 1188538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi], xmm0 1189f71323e297a928af368937089d3ed71239786f86Andreas Huber 1190538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi+2*rax], xmm6 1191538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+2*rax], xmm6 1192f71323e297a928af368937089d3ed71239786f86Andreas Huber 1193538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 1194f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1196538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1198538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi+4*rax], xmm0 1199538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+4*rax], xmm0 120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1201538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi+2*rcx], xmm3 1202538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+2*rcx], xmm3 120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1204f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm2, 
[rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 1205f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1207538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 1208f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm0, xmm2 120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1210538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 1211538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1213538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 1214f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 121590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1216538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 1217f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 1218f71323e297a928af368937089d3ed71239786f86Andreas Huber 1219f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro MBV_WRITEBACK_2 0 1220538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi], xmm1 1221538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi], xmm1 1222f71323e297a928af368937089d3ed71239786f86Andreas Huber 1223538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi+2*rax], xmm5 1224538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+2*rax], xmm5 1225f71323e297a928af368937089d3ed71239786f86Andreas Huber 
1226f71323e297a928af368937089d3ed71239786f86Andreas Huber movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 1227f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 1228f71323e297a928af368937089d3ed71239786f86Andreas Huber punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 1229f71323e297a928af368937089d3ed71239786f86Andreas Huber 1230538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi+4*rax], xmm1 1231538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+4*rax], xmm1 1232f71323e297a928af368937089d3ed71239786f86Andreas Huber 1233538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq QWORD PTR [rsi+2*rcx], xmm4 1234538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movhps MMWORD PTR [rdi+2*rcx], xmm4 1235f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro 1236f71323e297a928af368937089d3ed71239786f86Andreas Huber 1237f71323e297a928af368937089d3ed71239786f86Andreas Huber 1238f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_vertical_edge_sse2 1239f71323e297a928af368937089d3ed71239786f86Andreas Huber;( 1240f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *src_ptr, 1241f71323e297a928af368937089d3ed71239786f86Andreas Huber; int src_pixel_step, 1242f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *flimit, 1243f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *limit, 1244f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *thresh, 1245f71323e297a928af368937089d3ed71239786f86Andreas Huber; int count 1246f71323e297a928af368937089d3ed71239786f86Andreas Huber;) 1247f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_vertical_edge_sse2) 1248f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_vertical_edge_sse2): 1249f71323e297a928af368937089d3ed71239786f86Andreas 
Huber push rbp 1250f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rbp, rsp 1251f71323e297a928af368937089d3ed71239786f86Andreas Huber SHADOW_ARGS_TO_STACK 6 1252f71323e297a928af368937089d3ed71239786f86Andreas Huber SAVE_XMM 1253f71323e297a928af368937089d3ed71239786f86Andreas Huber GET_GOT rbx 1254f71323e297a928af368937089d3ed71239786f86Andreas Huber push rsi 1255f71323e297a928af368937089d3ed71239786f86Andreas Huber push rdi 1256f71323e297a928af368937089d3ed71239786f86Andreas Huber ; end prolog 125790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1258f71323e297a928af368937089d3ed71239786f86Andreas Huber ALIGN_STACK 16, rax 1259f71323e297a928af368937089d3ed71239786f86Andreas Huber sub rsp, 160 ; reserve 160 bytes 1260f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1261f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1262f71323e297a928af368937089d3ed71239786f86Andreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; 126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1264f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ; src_ptr 1265f71323e297a928af368937089d3ed71239786f86Andreas Huber movsxd rax, dword ptr arg(1) ; src_pixel_step 126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1267f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1268f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1269f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rcx, [rax*2+rax] 127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1271f71323e297a928af368937089d3ed71239786f86Andreas Huber ; Transpose 1272538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber TRANSPOSE_16X8 1, 0 127390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1274538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; 
calculate filter mask and high edge variance 1275538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber LFV_FILTER_MASK_HEV_MASK 0 127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1277f71323e297a928af368937089d3ed71239786f86Andreas Huber neg rax 1278f71323e297a928af368937089d3ed71239786f86Andreas Huber ; start work on filters 1279538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber MB_FILTER_AND_WRITEBACK 2 128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1281f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi+rax*8] 1282f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rdi+rax*8] 128390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1284f71323e297a928af368937089d3ed71239786f86Andreas Huber ; transpose and write back 1285f71323e297a928af368937089d3ed71239786f86Andreas Huber MBV_TRANSPOSE 128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1287f71323e297a928af368937089d3ed71239786f86Andreas Huber neg rax 128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1289f71323e297a928af368937089d3ed71239786f86Andreas Huber MBV_WRITEBACK_1 129090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1291f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi+rax*8] 1292f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rdi+rax*8] 1293f71323e297a928af368937089d3ed71239786f86Andreas Huber MBV_WRITEBACK_2 129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1295f71323e297a928af368937089d3ed71239786f86Andreas Huber add rsp, 160 1296f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsp 1297f71323e297a928af368937089d3ed71239786f86Andreas Huber ; begin epilog 1298f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rdi 1299f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rsi 1300f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_GOT 1301f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_XMM 1302f71323e297a928af368937089d3ed71239786f86Andreas Huber 
UNSHADOW_ARGS 1303f71323e297a928af368937089d3ed71239786f86Andreas Huber pop rbp 1304f71323e297a928af368937089d3ed71239786f86Andreas Huber ret 130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1307f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_vertical_edge_uv_sse2 1308f71323e297a928af368937089d3ed71239786f86Andreas Huber;( 1309f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *u, 1310f71323e297a928af368937089d3ed71239786f86Andreas Huber; int src_pixel_step, 1311f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *flimit, 1312f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *limit, 1313f71323e297a928af368937089d3ed71239786f86Andreas Huber; const char *thresh, 1314f71323e297a928af368937089d3ed71239786f86Andreas Huber; unsigned char *v 1315f71323e297a928af368937089d3ed71239786f86Andreas Huber;) 1316f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_vertical_edge_uv_sse2) 1317f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_vertical_edge_uv_sse2): 1318f71323e297a928af368937089d3ed71239786f86Andreas Huber push rbp 1319f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rbp, rsp 1320f71323e297a928af368937089d3ed71239786f86Andreas Huber SHADOW_ARGS_TO_STACK 6 1321f71323e297a928af368937089d3ed71239786f86Andreas Huber SAVE_XMM 1322f71323e297a928af368937089d3ed71239786f86Andreas Huber GET_GOT rbx 1323f71323e297a928af368937089d3ed71239786f86Andreas Huber push rsi 1324f71323e297a928af368937089d3ed71239786f86Andreas Huber push rdi 1325f71323e297a928af368937089d3ed71239786f86Andreas Huber ; end prolog 132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1327f71323e297a928af368937089d3ed71239786f86Andreas Huber ALIGN_STACK 16, rax 1328f71323e297a928af368937089d3ed71239786f86Andreas Huber sub rsp, 160 ; reserve 160 bytes 
1329f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1330f71323e297a928af368937089d3ed71239786f86Andreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1331f71323e297a928af368937089d3ed71239786f86Andreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; 133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1333f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ; u_ptr 1334f71323e297a928af368937089d3ed71239786f86Andreas Huber movsxd rax, dword ptr arg(1) ; src_pixel_step 133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1336f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1337f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1338f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rcx, [rax+2*rax] 133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1340f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdx, srct 134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1342538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; Transpose 1343538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber TRANSPOSE_16X8 0, 0 1344538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber 1345538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber ; calculate filter mask and high edge variance 1346538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber LFV_FILTER_MASK_HEV_MASK 0 134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1348f71323e297a928af368937089d3ed71239786f86Andreas Huber ; start work on filters 1349538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber MB_FILTER_AND_WRITEBACK 2 135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1351f71323e297a928af368937089d3ed71239786f86Andreas Huber ; transpose and write back 1352f71323e297a928af368937089d3ed71239786f86Andreas Huber MBV_TRANSPOSE 
135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1354f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(0) ;u_ptr 1355f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1356f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] 1357f71323e297a928af368937089d3ed71239786f86Andreas Huber MBV_WRITEBACK_1 1358f71323e297a928af368937089d3ed71239786f86Andreas Huber mov rsi, arg(5) ;v_ptr 1359f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rsi, [rsi - 4] 1360f71323e297a928af368937089d3ed71239786f86Andreas Huber lea rdi, [rsi + rax] 1361f71323e297a928af368937089d3ed71239786f86Andreas Huber MBV_WRITEBACK_2 136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 160 136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 136890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 1369f71323e297a928af368937089d3ed71239786f86Andreas Huber RESTORE_XMM 137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_horizontal_edge_sse2 137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 137990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *flimit, 138090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
;    const char *limit,
;    const char *thresh,
;    int          count
;)
;
; Simple loop filter across a horizontal edge, one 16-byte vector per row.
;
; Register roles in the body:
;   rsi -> q0 row (src_ptr)
;   rdi -> q1 row (src_ptr + src_pixel_step)
;   rax =  -src_pixel_step after the neg, so [rsi + rax] is the p0 row and
;          [rsi + 2*rax] is the p1 row
;
; Pixel rows are accessed with unaligned 16-byte moves; flimit and limit are
; fetched with aligned 16-byte loads. Only the p0 and q0 rows are stored
; back. The thresh and count arguments are not read by this routine.
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; ---- row pointers ----------------------------------------------------
    mov         rsi, arg(0)                     ; src_ptr -> q0 row
    movsxd      rax, dword ptr arg(1)           ; src_pixel_step (row pitch)
    lea         rdi, [rsi + rax]                ; rdi -> q1 row
    neg         rax                             ; rax = -pitch, to reach the p rows

    ; ---- edge limit: flimit * 2 + limit ----------------------------------
    mov         rdx, arg(2)                     ; flimit
    movdqa      xmm3, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                     ; limit
    movdqa      xmm7, XMMWORD PTR [rdx]
    paddb       xmm3, xmm3                      ; flimit * 2 (wrapping add, as before)
    paddb       xmm3, xmm7                      ; flimit * 2 + limit

    ; ---- filter mask -----------------------------------------------------
    movdqu      xmm1, [rsi + 2*rax]             ; p1
    movdqu      xmm0, [rdi]                     ; q1
    movdqa      xmm2, xmm1                      ; xmm2 keeps p1 for the filter
    movdqa      xmm7, xmm0                      ; xmm7 keeps q1 for the filter
    movdqa      xmm4, xmm0                      ; scratch copy of q1
    psubusb     xmm0, xmm1                      ; sat(q1 - p1)
    psubusb     xmm1, xmm4                      ; sat(p1 - q1)
    por         xmm1, xmm0                      ; |p1 - q1|
    pand        xmm1, [GLOBAL(tfe)]             ; drop each byte's lsb before the word shift
    psrlw       xmm1, 1                         ; |p1 - q1| / 2

    movdqu      xmm5, [rsi + rax]               ; p0
    movdqu      xmm4, [rsi]                     ; q0
    movdqa      xmm0, xmm4                      ; xmm0 keeps q0 for the filter
    movdqa      xmm6, xmm5                      ; xmm6 keeps p0 for the filter
    psubusb     xmm5, xmm4                      ; sat(p0 - q0)
    psubusb     xmm4, xmm6                      ; sat(q0 - p0)
    por         xmm5, xmm4                      ; |p0 - q0|
    paddusb     xmm5, xmm5                      ; |p0 - q0| * 2
    paddusb     xmm5, xmm1                      ; |p0 - q0| * 2 + |p1 - q1| / 2

    psubusb     xmm5, xmm3                      ; non-zero where the sum exceeds the edge limit
    pxor        xmm3, xmm3
    pcmpeqb     xmm5, xmm3                      ; xmm5 = 0xff in lanes to be filtered

    ; ---- filter value ----------------------------------------------------
    ; xmm4 is not needed for pixel data past this point, so it carries the
    ; t80 sign-bias constant for every remaining signed<->unsigned flip.
    movdqa      xmm4, [GLOBAL(t80)]
    pxor        xmm2, xmm4                      ; p1 as signed
    pxor        xmm7, xmm4                      ; q1 as signed
    psubsb      xmm2, xmm7                      ; p1 - q1

    pxor        xmm6, xmm4                      ; p0 as signed
    pxor        xmm0, xmm4                      ; q0 as signed
    movdqa      xmm3, xmm0                      ; xmm3 = signed q0, updated below
    psubsb      xmm0, xmm6                      ; q0 - p0
    paddsb      xmm2, xmm0                      ; (p1 - q1) + 1 * (q0 - p0)
    paddsb      xmm2, xmm0                      ; (p1 - q1) + 2 * (q0 - p0)
    paddsb      xmm2, xmm0                      ; (p1 - q1) + 3 * (q0 - p0)
    pand        xmm5, xmm2                      ; zero the lanes the mask rejects

    ; ---- q0 side: q0 -= (filter + 4) >> 3 --------------------------------
    paddsb      xmm5, [GLOBAL(t4)]              ; filter + 4

    ; There is no per-byte arithmetic shift, so the odd and even bytes of
    ; each word are shifted separately and then merged.
    movdqa      xmm1, xmm5
    psraw       xmm1, 11                        ; odd bytes: >> 3, result lands in the low byte
    psllw       xmm1, 8                         ; ...moved back up to the high byte
    movdqa      xmm0, xmm5
    psllw       xmm0, 8                         ; even bytes: lift into the high byte
    psraw       xmm0, 3                         ; signed >> 3
    psrlw       xmm0, 8                         ; ...and back down to the low byte
    por         xmm0, xmm1                      ; per-byte (filter + 4) >> 3

    psubsb      xmm3, xmm0                      ; q0 -= adjustment
    pxor        xmm3, xmm4                      ; back to unsigned
    movdqu      [rsi], xmm3                     ; store q0 row

    ; ---- p0 side: p0 += (filter + 3) >> 3 --------------------------------
    psubsb      xmm5, [GLOBAL(t1s)]             ; (filter + 4) - 1 = filter + 3

    movdqa      xmm0, xmm5
    psraw       xmm5, 11                        ; odd bytes (xmm5 is consumed here)
    psllw       xmm5, 8
    psllw       xmm0, 8                         ; even bytes
    psraw       xmm0, 3
    psrlw       xmm0, 8
    por         xmm0, xmm5                      ; per-byte (filter + 3) >> 3

    paddsb      xmm6, xmm0                      ; p0 += adjustment
    pxor        xmm6, xmm4                      ; back to unsigned
    movdqu      [rsi + rax], xmm6               ; store p0 row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
149090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 149190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 149290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_vertical_edge_sse2 149390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 149490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 149590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 149690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *flimit, 149790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *limit, 149890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *thresh, 149990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int count 150090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 150190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_simple_vertical_edge_sse2) 150290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_vertical_edge_sse2): 150390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp ; save old base pointer value. 150490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp ; set new base pointer value. 
150590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 1506f71323e297a928af368937089d3ed71239786f86Andreas Huber SAVE_XMM 150790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx ; save callee-saved reg 150890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 150990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 151090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 151190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 151290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 151390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 32 ; reserve 32 bytes 151490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 151590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 151690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 151790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 151890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
151990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 152090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi - 2 ] 152190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax] 152290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, [rsi + rax*4] 152390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rcx, [rdx + rax] 152490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 152590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 152690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 152790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm2, [rdi] ; 13 12 11 10 152890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm3, [rcx] ; 53 52 51 50 152990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 153090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 153190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 153290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm4, [rsi + rax*2] ; 23 22 21 20 153390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm5, [rdx + rax*2] ; 63 62 61 60 153490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm6, [rdi + rax*2] ; 33 32 31 30 153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm7, [rcx + rax*2] ; 73 72 71 70 153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm4, 
xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm1, xmm0 154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm2, xmm0 154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa t0, xmm0 ; save to t0 155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa t1, xmm2 ; save to t1 155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*8] 155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax] 155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, [rsi + rax*4] 155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rcx, [rdx + rax] 155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm4, [rsi] ; 83 82 81 80 155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm1, [rdx] ; c3 c2 c1 c0 156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm6, [rdi] ; 93 92 91 90 156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm3, [rcx] ; d3 d2 d1 d0 156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 
156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0 156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0 156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0 156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0 156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm1, xmm4 157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm6, xmm4 158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm0, 
t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 158590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm1, xmm0 158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm3, xmm2 158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate mask 159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm6, xmm0 ; p1 159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm7, xmm3 ; q1 159690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb xmm7, xmm0 ; q1-=p1 159790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb xmm6, xmm3 ; p1-=q1 159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por xmm6, xmm7 ; abs(p1-q1) 1599538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero 160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw xmm6, 1 ; abs(p1-q1)/2 160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm5, xmm1 ; p0 160390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm4, xmm2 ; q0 160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb xmm5, xmm2 ; p0-=q0 
160590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb xmm4, xmm1 ; q0-=p0 160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por xmm5, xmm4 ; abs(p0 - q0) 160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb xmm5, xmm5 ; abs(p0-q0)*2 160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(2) ;flimit 161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm7, XMMWORD PTR [rdx] 161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ; get limit 161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm6, XMMWORD PTR [rdx] 161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb xmm7, xmm7 ; flimit*2 (less than 255) 161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255) 161690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm7, xmm7 161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb xmm5, xmm7 ; mm5 = mask 162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa t0, xmm0 162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa t1, xmm3 162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1625538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values 1626538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values 162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb xmm0, xmm3 ; p1 - q1 162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm6, xmm1 ; p0 163090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm7, xmm2 ; q0 1632538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values 163390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1634538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values 163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm3, xmm7 ; offseted ; q0 163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb xmm7, xmm6 ; q0 - p0 163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) 163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) 164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) 164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand xmm5, xmm0 ; mask filter values we don't care about 164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1646538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm0, xmm5 ; get a copy of filters 164990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw xmm0, 8 ; shift left 8 165090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw xmm0, 3 ; arithmetic 
shift right 11 165290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw xmm0, 8 165390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm7, xmm5 ; get a copy of filters 165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw xmm7, 11 ; arithmetic shift right 11 165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw xmm7, 8 ; shift left 8 to put it back 165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por xmm0, xmm7 ; put the two together to get result 165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb xmm3, xmm0 ; q0-= q0sz add 1661538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm3, [GLOBAL(t80)] ; unoffset q0 166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; now do +3 side 1664538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm0, xmm5 ; get a copy of filters 166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw xmm0, 8 ; shift left 8 166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw xmm0, 3 ; arithmetic shift right 11 166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw xmm0, 8 167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw xmm5, 11 ; arithmetic shift right 11 167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw xmm5, 8 ; shift left 8 to put it back 167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por xmm0, xmm5 ; put the two together to get result 167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber 167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb xmm6, xmm0 ; p0+= p0 add 1677538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor xmm6, [GLOBAL(t80)] ; unoffset p0 167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm0, t0 ; p1 168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm4, t1 ; q1 168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; transpose back to write out 168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm1, xmm0 168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm5, xmm3 169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm2, xmm0 169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 
169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 169890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm3, xmm1 170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; write out order: xmm0 xmm2 xmm1 xmm3 170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, [rsi + rax*4] 170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi], xmm1 ; write the second 8-line result 170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm1, 4 170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi], xmm1 170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm1, 4 171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi + rax*2], xmm1 171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm1, 4 171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi + rax*2], xmm1 171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdx], xmm3 171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm3, 4 171690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rcx], xmm3 171790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm3, 4 171890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdx + rax*2], xmm3 171990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm3, 4 172090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rcx + rax*2], xmm3 
172190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 172290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 172390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*8] 172490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 172590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax] 172690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, [rsi + rax*4] 172790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rcx, [rdx + rax] 172890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 172990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi], xmm0 ; write the first 8-line result 173090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm0, 4 173190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi], xmm0 173290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm0, 4 173390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi + rax*2], xmm0 173490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm0, 4 173590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi + rax*2], xmm0 173690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 173790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdx], xmm2 173890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm2, 4 173990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rcx], xmm2 174090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm2, 4 174190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdx + rax*2], xmm2 174290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm2, 4 174390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rcx + rax*2], xmm2 174490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 174590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 32 174690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 174790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 
; Remainder of the epilog of the routine whose body precedes this point.
; The matching pushes and setup macros are outside this view (presumably in
; the routine's prologue -- confirm there); the order below is unchanged.
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; ---------------------------------------------------------------------------
; Read-only constant tables.
; Every table is 16 bytes long (16 x byte or 8 x word) and starts on a
; 16-byte boundary because of the `align 16` in front of it.
; ---------------------------------------------------------------------------
SECTION_RODATA

; 0xfe in every byte: ANDing with it clears bit 0 of each byte.
align 16
tfe:
    times 16 db 0xfe

; 0x80 in every byte: XORing with it toggles the top bit of each byte.
align 16
t80:
    times 16 db 0x80

; 0x01 in every byte.
align 16
t1s:
    times 16 db 0x01

; 0x03 in every byte.
align 16
t3:
    times 16 db 0x03

; 0x04 in every byte.
align 16
t4:
    times 16 db 0x04

; 0x0001 in every 16-bit word.
align 16
ones:
    times 8 dw 0x0001

; 0x0900 in every 16-bit word.
align 16
s9:
    times 8 dw 0x0900

; 0x003f (decimal 63) in every 16-bit word.
align 16
s63:
    times 8 dw 0x003f