;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Byte offsets, relative to rsp, of the ten 16-byte scratch slots used by
; the loop-filter macros: two temporaries followed by the eight pixel rows
; p3..q3.  lf_var_size (10 * 16 = 160) is the total each function reserves.
; The offsets expand to unparenthesised sums, so use them only inside
; additive address expressions such as [rsp+_q1].
%define _t0 0
%define _t1 _t0 + 16
%define _p3 _t1 + 16
%define _p2 _p3 + 16
%define _p1 _p2 + 16
%define _p0 _p1 + 16
%define _q0 _p0 + 16
%define _q1 _q0 + 16
%define _q2 _q1 + 16
%define _q3 _q2 + 16
%define lf_var_size 160

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

;-----------------------------------------------------------------------
; LFH_FILTER_AND_HEV_MASK <aligned>
;
; Builds the filter mask and the high-edge-variance (hev) mask for the
; 16 byte lanes of a horizontal edge.
;
;   %1 != 0  Rows are read with movdqa (16 bytes each, so every row
;            address must be 16-byte aligned).  On entry rsi = q0 row,
;            rdi = q1 row, rax = pitch.  The macro negates rax and
;            leaves it negated.
;   %1 == 0  Each row is assembled from 8 bytes at rsi (low half) and
;            8 bytes at rdi (high half).  On entry rsi/rdi = q1 row,
;            rcx = pitch, rax = -pitch.  rsi/rdi are moved back four
;            rows (to the p2 row) and q2, q1, p2, p1 are spilled to
;            their stack slots.
;
; In:   rdx      -> limit  (16 bytes, read with movdqa)
;       arg(2)   -> blimit (16 bytes, read with movdqa)
;       arg(4)   -> thresh (16 bytes, read with movdqa)
;       rsp      -> 16-byte aligned scratch addressed via _t0 .. _q3
; Out:  xmm0 = q0, xmm6 = p0
;       xmm1 = 0xff in every lane where all limit/blimit tests pass,
;              0 elsewhere
;       xmm4 = hev mask (meaningful in lanes where xmm1 is set, see the
;              note at the end of the macro)
;       xmm7 = 0
;       [rsp+_t0] = abs(q1 - q0), [rsp+_t1] = abs(p1 - p0)
; Clobbers: xmm2, xmm3, xmm5, rdx (plus rax / rsi / rdi as described)
;
; tfe is defined outside this macro; presumably 0xfe in every byte,
; going by the original "set lsb of each byte to zero" comment.
;-----------------------------------------------------------------------
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]           ; q3
        movdqa      xmm1, [rsi+2*rax]           ; q2
        movdqa      xmm4, [rsi+rax]             ; q1
        movdqa      xmm5, [rsi]                 ; q0
        neg         rax                         ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]         ; q3
        movlps      xmm1, [rsi + rcx]           ; q2
        movlps      xmm4, [rsi]                 ; q1
        movlps      xmm5, [rsi + rax]           ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]          ; step back four rows: q1 row -> p2 row
        lea         rdi, [rdi + rax*4]

        movdqa      [rsp+_q2], xmm1             ; store q2
        movdqa      [rsp+_q1], xmm4             ; store q1
%endif
        movdqa      xmm7, [rdx]                 ; limit

        movdqa      xmm6, xmm1                  ; q2
        movdqa      xmm3, xmm4                  ; q1

        ; abs(a - b) is formed as (a -sat b) | (b -sat a); one of the two
        ; unsigned saturating differences is always zero.
        psubusb     xmm1, xmm2                  ; q2-=q3
        psubusb     xmm2, xmm6                  ; q3-=q2

        psubusb     xmm4, xmm6                  ; q1-=q2
        psubusb     xmm6, xmm3                  ; q2-=q1

        por         xmm4, xmm6                  ; abs(q2-q1)
        por         xmm1, xmm2                  ; abs(q3-q2)

        movdqa      xmm0, xmm5                  ; q0
        pmaxub      xmm1, xmm4                  ; xmm1 accumulates the running max

        psubusb     xmm5, xmm3                  ; q0-=q1
        psubusb     xmm3, xmm0                  ; q1-=q0

        por         xmm5, xmm3                  ; abs(q0-q1)
        movdqa      [rsp+_t0], xmm5             ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]           ; p3
        movdqa      xmm4, [rdi+4*rax]           ; p2
        movdqa      xmm6, [rsi+2*rax]           ; p1
%else
        movlps      xmm2, [rsi + rax]           ; p3
        movlps      xmm4, [rsi]                 ; p2
        movlps      xmm6, [rsi + rcx]           ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      [rsp+_p2], xmm4             ; store p2
        movdqa      [rsp+_p1], xmm6             ; store p1
%endif

        movdqa      xmm5, xmm4                  ; p2
        movdqa      xmm3, xmm6                  ; p1

        ; Here both one-sided differences are fed to pmaxub directly
        ; instead of being OR-ed first; the maximum is the same.
        psubusb     xmm4, xmm2                  ; p2-=p3
        psubusb     xmm2, xmm5                  ; p3-=p2

        psubusb     xmm3, xmm5                  ; p1-=p2
        pmaxub      xmm1, xmm4                  ; abs(p3 - p2)

        psubusb     xmm5, xmm6                  ; p2-=p1
        pmaxub      xmm1, xmm2                  ; abs(p3 - p2)

        pmaxub      xmm1, xmm5                  ; abs(p2 - p1)
        movdqa      xmm2, xmm6                  ; p1

        pmaxub      xmm1, xmm3                  ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]             ; p0
        movdqa      xmm3, [rdi]                 ; q1
%else
        movlps      xmm4, [rsi + rcx*2]         ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, [rsp+_q1]             ; q1
%endif

        movdqa      xmm5, xmm4                  ; p0
        psubusb     xmm4, xmm6                  ; p0-=p1

        psubusb     xmm6, xmm5                  ; p1-=p0

        por         xmm6, xmm4                  ; abs(p1 - p0)
        mov         rdx, arg(2)                 ; get blimit

        movdqa      [rsp+_t1], xmm6             ; save to t1

        movdqa      xmm4, xmm3                  ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2                  ; q1-=p1
        psubusb     xmm2, xmm4                  ; p1-=q1

        psubusb     xmm1, xmm7                  ; max difference - limit; zero when within limit
        por         xmm2, xmm3                  ; abs(p1-q1)

        movdqa      xmm7, [rdx]                 ; blimit
        mov         rdx, arg(4)                 ; hev get thresh

        movdqa      xmm3, xmm0                  ; q0
        pand        xmm2, [GLOBAL(tfe)]         ; set lsb of each byte to zero

        movdqa      xmm6, xmm5                  ; p0
        psrlw       xmm2, 1                     ; abs(p1-q1)/2 (word shift; lsb cleared above
                                                ; so no bit crosses into the neighbouring byte)

        psubusb     xmm5, xmm3                  ; p0-=q0
        psubusb     xmm3, xmm6                  ; q0-=p0
        por         xmm5, xmm3                  ; abs(p0 - q0)

        paddusb     xmm5, xmm5                  ; abs(p0-q0)*2

        movdqa      xmm4, [rsp+_t0]             ; hev get abs (q1 - q0)
        movdqa      xmm3, [rsp+_t1]             ; get abs (p1 - p0)

        paddusb     xmm5, xmm2                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, [rdx]                 ; hev

        psubusb     xmm5, xmm7                  ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        psubusb     xmm4, xmm2                  ; hev

        psubusb     xmm3, xmm2                  ; hev
        por         xmm1, xmm5                  ; nonzero if either the limit or the blimit test failed

        pxor        xmm7, xmm7
        ; OR, not a byte-wise add: the sum of the two excesses can wrap to
        ; zero (e.g. 0x80 + 0x80) and hide a lane where both exceed thresh;
        ; the OR is zero only when both excesses are zero.
        por         xmm4, xmm3                  ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        ; xmm5 is zero in every lane that passed the blimit test, so in
        ; those lanes this is a compare against zero.  Lanes that failed
        ; have xmm1 == 0 below; NOTE(review): consumers are expected to
        ; AND their filter value with xmm1 before using xmm4 -- confirm
        ; for any new user of this macro.
        pcmpeqb     xmm4, xmm5                  ; hev
        pcmpeqb     xmm3, xmm3                  ; hev (all ones)

        pcmpeqb     xmm1, xmm7                  ; mask xmm1
        pxor        xmm4, xmm3                  ; hev (invert: 0xff where an excess was found)
%endmacro

;-----------------------------------------------------------------------
; B_FILTER <mode>
;
; Filters p1, p0, q0 and q1 using the masks produced by
; LFH_FILTER_AND_HEV_MASK.
;
;   %1 == 0  p1/q1 are reloaded from the stack; rsi/rdi are advanced two
;            rows (rcx*2) and every result row is stored as two 8-byte
;            halves: low half through rsi, high half through rdi.
;   %1 == 1  p1/q1 are reloaded from [rsi+2*rax] / [rdi]; four full rows
;            are written back with movdqa.
;   %1 == 2  p1, p0, q0, q1 are all reloaded from the stack and nothing
;            is stored by this macro.
;
; In:   xmm1 = filter mask, xmm4 = hev mask
;       xmm6 = p0, xmm0 = q0 (modes 0 and 1 only)
; Out:  xmm1 = p1, xmm6 = p0, xmm3 = q0, xmm7 = q1 (filtered, unsigned)
; Clobbers: xmm0, xmm2, xmm4, xmm5 (and rsi/rdi in mode 0)
;
; t80, t3, t4 and ones are defined outside this macro; presumably
; 0x80 / 3 / 4 per byte and 1 per word, going by their names and the
; original comments.
;-----------------------------------------------------------------------
%macro B_FILTER 1
        movdqa      xmm3, [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2, [rsp+_p1]             ; p1
        movdqa      xmm7, [rsp+_q1]             ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]           ; p1
        movdqa      xmm7, [rdi]                 ; q1
%elif %1 == 2
        movdqa      xmm2, [rsp+_p1]             ; p1
        movdqa      xmm6, [rsp+_p0]             ; p0
        movdqa      xmm0, [rsp+_q0]             ; q0
        movdqa      xmm7, [rsp+_q1]             ; q1
%endif

        pxor        xmm2, xmm3                  ; p1 offset to convert to signed values
        pxor        xmm7, xmm3                  ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7                  ; p1 - q1
        pxor        xmm6, xmm3                  ; offset to convert to signed values

        pand        xmm2, xmm4                  ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, xmm3                  ; offset to convert to signed values

        movdqa      xmm3, xmm0                  ; q0
        psubsb      xmm0, xmm6                  ; q0 - p0
        paddsb      xmm2, xmm0                  ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2, xmm0                  ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2, xmm0                  ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1, xmm2                  ; mask filter values we don't care about

        movdqa      xmm2, xmm1
        paddsb      xmm1, [GLOBAL(t4)]          ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]          ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; Per-byte arithmetic >>3: each byte is placed in the high half
        ; of a word (the low half is don't-care) and the word is shifted
        ; right by 8 + 3 = 11 with sign extension.
        punpckhbw   xmm5, xmm2                  ; axbxcxdx
        punpcklbw   xmm2, xmm2                  ; exfxgxhx

        punpcklbw   xmm0, xmm1                  ; exfxgxhx
        psraw       xmm5, 11                    ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1                  ; axbxcxdx
        psraw       xmm2, 11                    ; sign extended shift right by 3

        packsswb    xmm2, xmm5                  ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0, 11                    ; sign extended shift right by 3

        psraw       xmm1, 11                    ; sign extended shift right by 3
        movdqa      xmm5, xmm0                  ; save results

        packsswb    xmm0, xmm1                  ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3

        paddsb      xmm6, xmm2                  ; p0+= p0 add

        movdqa      xmm2, [GLOBAL(ones)]
        paddsw      xmm5, xmm2
        paddsw      xmm1, xmm2
        psraw       xmm5, 1                     ; partial shifted one more time for 2nd tap
        psraw       xmm1, 1                     ; partial shifted one more time for 2nd tap
        packsswb    xmm5, xmm1                  ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3) + 1) >> 1
        movdqa      xmm2, [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1, [rsp+_p1]             ; p1
        lea         rsi, [rsi + rcx*2]          ; advance two rows for the write-back below
        lea         rdi, [rdi + rcx*2]
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]           ; p1
%elif %1 == 2
        movdqa      xmm1, [rsp+_p1]             ; p1
%endif

        pandn       xmm4, xmm5                  ; high edge variance additive
                                                ; (second tap kept only where hev is clear)
        pxor        xmm6, xmm2                  ; unoffset

        pxor        xmm1, xmm2                  ; reoffset
        psubsb      xmm3, xmm0                  ; q0-= q0 add

        paddsb      xmm1, xmm4                  ; p1+= p1 add
        pxor        xmm3, xmm2                  ; unoffset

        pxor        xmm1, xmm2                  ; unoffset
        psubsb      xmm7, xmm4                  ; q1-= q1 add

        pxor        xmm7, xmm2                  ; unoffset
%if %1 == 0
        movq        [rsi], xmm6                 ; p0
        movhps      [rdi], xmm6
        movq        [rsi + rax], xmm1           ; p1
        movhps      [rdi + rax], xmm1
        movq        [rsi + rcx], xmm3           ; q0
        movhps      [rdi + rcx], xmm3
        movq        [rsi + rcx*2], xmm7         ; q1
        movhps      [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6             ; write back
        movdqa      [rsi+2*rax], xmm1           ; write back
        movdqa      [rsi], xmm3                 ; write back
        movdqa      [rdi], xmm7                 ; write back
%endif

%endmacro

%if ABI_IS_32BIT

;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
;
; Filters 16 pixels across one horizontal edge in place.  Only assembled
; when ABI_IS_32BIT is set.
;
; Rows are accessed with movdqa, so src_ptr and every row reached from
; it by a multiple of src_pixel_step must be 16-byte aligned; blimit,
; limit and thresh are each read as 16 aligned bytes.
; Rows touched: four above src_ptr (read) through three below (read);
; two rows on each side of the edge are rewritten.
; Stack: lf_var_size bytes of 16-byte aligned scratch.
; Argument access, register save/restore and GOT handling go through the
; macros pulled in from x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)                 ;src_ptr
        movsxd      rax, dword ptr arg(1)       ;src_pixel_step

        mov         rdx, arg(3)                 ;limit

        lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

        add         rsp, lf_var_size
        pop         rsp                         ; undo ALIGN_STACK
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

%endif

;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v_ptr
;)
;
; Filters one horizontal edge of two 8-pixel-wide planes at once: the
; plane at src_ptr (u) occupies the low 8 lanes and the plane at the
; sixth argument (v) the high 8 lanes.  Both use src_pixel_step as pitch.
;
; NOTE: the sixth argument was previously documented as "int count"; the
; code loads arg(5) into rdi and dereferences it as the second plane's
; base pointer.
;
; Plane rows are accessed 8 bytes at a time (movlps/movhps/movq), which
; carries no alignment requirement; blimit, limit and thresh are each
; read as 16 aligned bytes.
; Stack: lf_var_size bytes of 16-byte aligned scratch.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size

        mov         rsi, arg(0)                 ; u
        mov         rdi, arg(5)                 ; v
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        mov         rcx, rax                    ; rcx = +pitch
        neg         rax                         ; negate pitch to deal with above border

        mov         rdx, arg(3)                 ;limit

        lea         rsi, [rsi + rcx]            ; start one row below the edge (q1 row)
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

        add         rsp, lf_var_size
        pop         rsp                         ; undo ALIGN_STACK
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


%macro MB_FILTER_AND_WRITEBACK 1
        movdqa      xmm3, [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2, [rsp+_p1]             ; p1
        movdqa      xmm7, [rsp+_q1]             ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]           ; p1
        movdqa      xmm7, [rdi]                 ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        movdqa      xmm2, [rsp+_p1]             ; p1
        movdqa      xmm6, [rsp+_p0]             ; p0
        movdqa      xmm0, [rsp+_q0]             ; q0
        movdqa      xmm7, [rsp+_q1]             ; q1
%endif

        pxor        xmm2, xmm3                  ; p1 offset to convert to signed values
        pxor        xmm7, xmm3                  ; q1 offset to convert to signed values
        pxor        xmm6, xmm3                  ; offset to convert to signed values
        pxor        xmm0, xmm3                  ; offset to convert to signed values

        psubsb      xmm2, xmm7                  ; p1 - q1

404233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm3, xmm0 ; q0 405233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm0, xmm6 ; q0 - p0 406233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) 407233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm2, xmm0 ; 2 * (q0 - p0) 408233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) 409233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm1, xmm2 ; mask filter values we don't care about 410233d2500723e5594f3e7c70896ffeeef32b9c950ywan 411233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, xmm1 ; vp8_filter 412233d2500723e5594f3e7c70896ffeeef32b9c950ywan 413233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm2, xmm4 ; Filter2 = vp8_filter & hev 414233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm0, xmm0 415233d2500723e5594f3e7c70896ffeeef32b9c950ywan 416233d2500723e5594f3e7c70896ffeeef32b9c950ywan pandn xmm4, xmm1 ; vp8_filter&=~hev 417233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm1, xmm1 418233d2500723e5594f3e7c70896ffeeef32b9c950ywan 419233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm0, xmm4 ; Filter 2 (hi) 420233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm1, xmm4 ; Filter 2 (lo) 421233d2500723e5594f3e7c70896ffeeef32b9c950ywan 422233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm2 423233d2500723e5594f3e7c70896ffeeef32b9c950ywan 424233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, [GLOBAL(s9)] 425233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) 426233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) 427233d2500723e5594f3e7c70896ffeeef32b9c950ywan 428233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9 429233d2500723e5594f3e7c70896ffeeef32b9c950ywan pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9 430233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm7, xmm5 ; axbxcxdx 432233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm5, xmm5 ; exfxgxhx 433233d2500723e5594f3e7c70896ffeeef32b9c950ywan 434233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm7, 11 ; sign extended shift right by 3 435233d2500723e5594f3e7c70896ffeeef32b9c950ywan 436233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm5, 11 ; sign extended shift right by 3 437233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm4, xmm2 ; axbxcxdx 438233d2500723e5594f3e7c70896ffeeef32b9c950ywan 439233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm2, xmm2 ; exfxgxhx 440233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm4, 11 ; sign extended shift right by 3 441233d2500723e5594f3e7c70896ffeeef32b9c950ywan 442233d2500723e5594f3e7c70896ffeeef32b9c950ywan packsswb xmm5, xmm7 ; Filter2 >>=3; 443233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm2, 11 ; sign extended shift right by 3 444233d2500723e5594f3e7c70896ffeeef32b9c950ywan 445233d2500723e5594f3e7c70896ffeeef32b9c950ywan packsswb xmm2, xmm4 ; Filter1 >>=3; 446233d2500723e5594f3e7c70896ffeeef32b9c950ywan 447233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 448233d2500723e5594f3e7c70896ffeeef32b9c950ywan 449233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 450233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm1 451233d2500723e5594f3e7c70896ffeeef32b9c950ywan 452233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, [GLOBAL(s63)] 453233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm0 454233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, xmm5 455233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63 456233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63 457233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm7 458233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm5, xmm5 ; Filter 2 (hi) * 18 460233d2500723e5594f3e7c70896ffeeef32b9c950ywan 461233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm7, xmm7 ; Filter 2 (lo) * 18 462233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 463233d2500723e5594f3e7c70896ffeeef32b9c950ywan 464233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 465233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 466233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 467233d2500723e5594f3e7c70896ffeeef32b9c950ywan 468233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 469233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 470233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 471233d2500723e5594f3e7c70896ffeeef32b9c950ywan 472233d2500723e5594f3e7c70896ffeeef32b9c950ywan packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) 473233d2500723e5594f3e7c70896ffeeef32b9c950ywan 474233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 475233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 476233d2500723e5594f3e7c70896ffeeef32b9c950ywan psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 477233d2500723e5594f3e7c70896ffeeef32b9c950ywan 478233d2500723e5594f3e7c70896ffeeef32b9c950ywan packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) 479233d2500723e5594f3e7c70896ffeeef32b9c950ywan packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) 480233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, [GLOBAL(t80)] 481233d2500723e5594f3e7c70896ffeeef32b9c950ywan 482233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 0 483233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, 
[rsp+_q1] ; q1 484233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, [rsp+_p1] ; p1 485233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rsi, [rsi+rcx*2] 486233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rdi+rcx*2] 487233d2500723e5594f3e7c70896ffeeef32b9c950ywan 488233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 1 489233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, [rdi] ; q1 490233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, [rsi+rax*2] ; p1 491233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 2 492233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, [rsp+_p1] ; p1 493233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, [rsp+_q1] ; q1 494233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif 495233d2500723e5594f3e7c70896ffeeef32b9c950ywan 496233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm1, xmm7 497233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm4, xmm7 498233d2500723e5594f3e7c70896ffeeef32b9c950ywan 499233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) 500233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) 501233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) 502233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) 503233d2500723e5594f3e7c70896ffeeef32b9c950ywan 504233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 1 505233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, [rdi+rax*4] ; p2 506233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, [rdi+rcx] ; q2 507233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else 508233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, [rsp+_p2] ; p2 509233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, [rsp+_q2] ; q2 510233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif 511233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm1, xmm7 ; *oq1 = sq^0x80; 513233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm4, xmm7 ; *op1 = sp^0x80; 514233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm2, xmm7 515233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm5, xmm7 516233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) 517233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) 518233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm2, xmm7 ; *op2 = sp^0x80; 519233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm5, xmm7 ; *oq2 = sq^0x80; 520233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm3, xmm7 ; *oq0 = sq^0x80 521233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm6, xmm7 ; *oq0 = sp^0x80 522233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 0 523233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi], xmm6 ; p0 524233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi], xmm6 525233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi + rcx], xmm3 ; q0 526233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi + rcx], xmm3 527233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdx, [rcx + rcx*2] 528233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+rcx*2], xmm1 ; q1 529233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+rcx*2], xmm1 530233d2500723e5594f3e7c70896ffeeef32b9c950ywan 531233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi + rax], xmm4 ; p1 532233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi + rax], xmm4 533233d2500723e5594f3e7c70896ffeeef32b9c950ywan 534233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+rax*2], xmm2 ; p2 535233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+rax*2], xmm2 536233d2500723e5594f3e7c70896ffeeef32b9c950ywan 537233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+rdx], xmm5 ; q2 538233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+rdx], xmm5 
%elif %1 == 1
        movdqa      [rdi+rcx], xmm5             ; q2
        movdqa      [rdi], xmm1                 ; q1
        movdqa      [rsi], xmm3                 ; q0
        movdqa      [rsi+rax ], xmm6            ; p0
        movdqa      [rsi+rax*2], xmm4           ; p1
        movdqa      [rdi+rax*4], xmm2           ; p2
%elif %1 == 2
        movdqa      [rsp+_p1], xmm4             ; p1
        movdqa      [rsp+_p0], xmm6             ; p0
        movdqa      [rsp+_q0], xmm3             ; q0
        movdqa      [rsp+_q1], xmm1             ; q1
%endif

%endmacro


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
;
; Macroblock loop filter for one horizontal edge, 16 pixels wide.
; Register roles set up here for the two macros:
;   rsi = src_ptr, rax = src_pixel_step (sign-extended), rdx = limit,
;   rdi = src_ptr + src_pixel_step.
; The "1" variants of the macros access the rows with movdqa, so every row
; touched must be 16-byte aligned.
; lf_var_size bytes of scratch are reserved below the aligned stack pointer.
; NOTE(review): "pop rsp" relies on ALIGN_STACK (x86_abi_support.asm) having
; saved the caller's rsp on the stack -- confirm there.
global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size            ; scratch area addressed as [rsp+_xx]

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        mov         rdx, arg(3)                 ; limit

        lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

        add         rsp, lf_var_size            ; release scratch area
        pop         rsp                         ; undo ALIGN_STACK
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
;
; Same filter as above, applied to the U and V planes together: the "0"
; variants of the macros load 8 bytes from u (rsi) into the low half and
; 8 bytes from v (rdi) into the high half of each xmm register.
; Register roles set up here:
;   rcx = src_pixel_step, rax = -src_pixel_step, rdx = limit,
;   rsi = u + src_pixel_step, rdi = v + src_pixel_step.
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size            ; scratch area addressed as [rsp+_xx]

        mov         rsi, arg(0)                 ; u
        mov         rdi, arg(5)                 ; v
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        mov         rcx, rax                    ; rcx keeps the positive pitch
        neg         rax                         ; negate pitch to deal with above border
        mov         rdx, arg(3)                 ; limit

        lea         rsi, [rsi + rcx]            ; both planes: start one row below the edge
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

        add         rsp, lf_var_size            ; release scratch area
        pop         rsp                         ; undo ALIGN_STACK
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


; TRANSPOSE_16X8 %1, %2
;
; Loads 8 bytes from each of 16 rows and transposes them into eight 16-byte
; columns. Column k corresponds to p3 p2 p1 p0 q0 q1 q2 q3 for k = 0..7.
; In the byte maps below "RC" means row R (0..f), column C (0..7); the
; rightmost entry is the lowest byte of the register.
;
; In:   rsi = row 0, rdi = row 1, rax = pitch, rcx = 3 * pitch
; %1:   non-zero -> rows 8..15 directly follow rows 0..7
;                   (rsi and rdi are advanced by 8 * rax)
;       zero     -> rows 8..15 are read from arg(5) (v_ptr) - 4
; %2:   zero     -> additionally store q3, q2, p2 and p3 on the stack
; Out:  xmm2 = p3, xmm1 = p2, xmm3 = p0, xmm4 = q0, xmm5 = q1,
;       xmm6 = q2, xmm7 = q3;
;       [rsp+_p1], [rsp+_p0], [rsp+_q0], [rsp+_q1] are always written.
; Scratch: xmm0, [rsp+_t0].
%macro TRANSPOSE_16X8 2
        movq        xmm4, [rsi]                 ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, [rdi]                 ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, [rsi+2*rax]           ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, [rdi+2*rax]           ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, [rsi+4*rax]           ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, [rdi+4*rax]           ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                  ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, [rdi+2*rcx]           ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                  ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                  ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, [rsi+2*rcx]           ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                  ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]            ; move on to rows 8..15
        lea         rdi, [rdi+rax*8]
%else
        mov         rsi, arg(5)                 ; v_ptr
%endif

        movdqa      xmm6, xmm5                  ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                  ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
        punpcklwd   xmm5, xmm7                  ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
        punpckhwd   xmm6, xmm7                  ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
        punpcklwd   xmm3, xmm0                  ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

%if %1 == 0
        lea         rdi, [rsi + rax - 4]        ; rdi points to row +1 for indirect addressing
        lea         rsi, [rsi - 4]
%endif

        movdqa      xmm2, xmm3                  ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                  ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                  ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                  ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                  ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                  ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                  ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      [rsp+_t0], xmm2             ; save to free XMM2

        movq        xmm2, [rsi]                 ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, [rdi]                 ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, [rsi+2*rax]           ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, [rdi+2*rax]           ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, [rsi+4*rax]           ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                  ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, [rdi+4*rax]           ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, [rsi+2*rcx]           ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6                  ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, [rdi+2*rcx]           ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                  ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1                  ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
        punpckhwd   xmm6, xmm5                  ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                  ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                  ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                  ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                  ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5                  ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckldq   xmm0, xmm1                  ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                  ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                  ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                  ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                  ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                  ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                  ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                  ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07

%if %2 == 0
        movdqa      [rsp+_q3], xmm7             ; save 7
        movdqa      [rsp+_q2], xmm6             ; save 6
%endif
        movdqa      xmm2, xmm3                  ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                  ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        punpcklqdq  xmm2, xmm5                  ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      [rsp+_p1], xmm2             ; save 2

        movdqa      xmm5, xmm4                  ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                  ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        movdqa      [rsp+_p0], xmm3             ; save 3

        punpckhqdq  xmm5, xmm1                  ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rsp+_q0], xmm4             ; save 4
        movdqa      [rsp+_q1], xmm5             ; save 5
        movdqa      xmm1, [rsp+_t0]             ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      xmm2, xmm1                  ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1, xmm0                  ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm0                  ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

%if %2 == 0
        movdqa      [rsp+_p2], xmm1
        movdqa      [rsp+_p3], xmm2
%endif

%endmacro

; LFV_FILTER_MASK_HEV_MASK
;
; Builds the filter mask and the high-edge-variance mask from the columns
; left behind by TRANSPOSE_16X8.
;
; In:   xmm2 = p3, xmm1 = p2, xmm3 = p0, xmm5 = q1, xmm6 = q2, xmm7 = q3,
;       [rsp+_p1], [rsp+_q0], [rsp+_q1];
;       arg(2) = blimit, arg(3) = limit, arg(4) = thresh.
; Out:  xmm1 = mask: 0xff in lanes where every neighbour difference is
;              <= limit and abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
;       xmm4 = complement of the "neither abs(p1-p0) nor abs(q1-q0)
;              exceeds thresh" compare (see the note at that compare)
;       xmm0 = 0
; Clobbers: rdx, xmm2, xmm3, xmm5, xmm6, xmm7.
%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0, xmm6                  ; q2
        psubusb     xmm0, xmm7                  ; q2-q3

        psubusb     xmm7, xmm6                  ; q3-q2
        movdqa      xmm4, xmm5                  ; q1

        por         xmm7, xmm0                  ; abs (q3-q2)
        psubusb     xmm4, xmm6                  ; q1-q2

        movdqa      xmm0, xmm1                  ; p2
        psubusb     xmm6, xmm5                  ; q2-q1

        por         xmm6, xmm4                  ; abs (q2-q1)
        psubusb     xmm0, xmm2                  ; p2 - p3;

        psubusb     xmm2, xmm1                  ; p3 - p2;
        por         xmm0, xmm2                  ; abs(p2-p3)

        movdqa      xmm5, [rsp+_p1]             ; p1
        pmaxub      xmm0, xmm7                  ; running max of the differences

        movdqa      xmm2, xmm5                  ; p1
        psubusb     xmm5, xmm1                  ; p1-p2
        psubusb     xmm1, xmm2                  ; p2-p1

        movdqa      xmm7, xmm3                  ; p0
        psubusb     xmm7, xmm2                  ; p0-p1

        por         xmm1, xmm5                  ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2                  ; p1

        psubusb     xmm2, xmm3                  ; p1-p0

        por         xmm2, xmm7                  ; abs(p1-p0)

        pmaxub      xmm0, xmm2

        movdqa      xmm5, [rsp+_q0]             ; q0
        movdqa      xmm7, [rsp+_q1]             ; q1

        mov         rdx, arg(3)                 ; limit

        movdqa      xmm6, xmm5                  ; q0
        movdqa      xmm4, xmm7                  ; q1

        psubusb     xmm5, xmm7                  ; q0-q1
        psubusb     xmm7, xmm6                  ; q1-q0

        por         xmm7, xmm5                  ; abs(q1-q0)

        pmaxub      xmm0, xmm7

        psubusb     xmm0, [rdx]                 ; limit (non-zero where a difference exceeds it)

        mov         rdx, arg(2)                 ; blimit
        movdqa      xmm5, xmm4                  ; q1

        psubusb     xmm5, xmm1                  ; q1-=p1
        psubusb     xmm1, xmm4                  ; p1-=q1

        por         xmm5, xmm1                  ; abs(p1-q1)
        movdqa      xmm1, xmm3                  ; p0

        pand        xmm5, [GLOBAL(tfe)]         ; set lsb of each byte to zero
        psubusb     xmm1, xmm6                  ; p0-q0

        movdqa      xmm4, [rdx]                 ; blimit
        mov         rdx, arg(4)                 ; get thresh

        psrlw       xmm5, 1                     ; abs(p1-q1)/2
        psubusb     xmm6, xmm3                  ; q0-p0

        por         xmm1, xmm6                  ; abs(q0-p0)
        paddusb     xmm1, xmm1                  ; abs(q0-p0)*2
        movdqa      xmm3, [rdx]                 ; thresh

        paddusb     xmm1, xmm5                  ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2, xmm3                  ; abs(p1 - p0) > thresh

        psubusb     xmm7, xmm3                  ; abs(q1 - q0) > thresh

        psubusb     xmm1, xmm4                  ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        por         xmm2, xmm7                  ; abs(p1 - p0) > thresh || abs(q1 - q0) > thresh

        por         xmm1, xmm0                  ; mask
        ; xmm0 has not been cleared yet: it still holds the limit residue,
        ; which is 0 in every lane that passed the limit test, so in those
        ; lanes this is a compare against zero.
        ; NOTE(review): the other lanes presumably do not matter because
        ; the mask is 0 there -- confirm against the filter macros.
        pcmpeqb     xmm2, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4                  ; all ones

        pcmpeqb     xmm1, xmm0                  ; mask = 0xff where nothing exceeded its limit
        pxor        xmm4, xmm2                  ; hev = ~(no variance above thresh)
%endmacro

; BV_TRANSPOSE
;
; Transposes the four filtered columns (2..5 of each row) back into row
; order, four bytes per row. Input and output layouts are given in the
; comments at the top and bottom of the macro. Clobbers xmm3, xmm4.
%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1                  ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6                  ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3                  ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6                  ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7                  ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7                  ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2                  ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4                  ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4                  ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1                  ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3                  ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3                  ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

; BV_WRITEBACK %1, %2
;
; Stores eight 4-byte groups at byte offset +2 of eight rows.
; %1 supplies the rows at rsi, rdi, rsi+2*rax, rdi+2*rax and
; %2 the rows at rsi+4*rax, rdi+4*rax, rsi+2*rcx, rdi+2*rcx.
; With rdi = rsi + rax and rcx = 3 * rax (as set up by the caller below)
; these are rows 0..3 and 4..7 relative to rsi.
; Both registers are shifted right by 4 bytes between stores and are
; therefore destroyed.
%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        movd        [rsi+4*rax+2], %2
        psrldq      %1, 4
        psrldq      %2, 4
        movd        [rdi+2], %1
        movd        [rdi+4*rax+2], %2
        psrldq      %1, 4
        psrldq      %2, 4
        movd        [rsi+2*rax+2], %1
        movd        [rsi+2*rcx+2], %2
        psrldq      %1, 4
        psrldq      %2, 4
        movd        [rdi+2*rax+2], %1
        movd        [rdi+2*rcx+2], %2
%endmacro

%if ABI_IS_32BIT

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;)
;
; Only assembled when ABI_IS_32BIT is set.
; Loop filter for one vertical edge over 16 rows: the 8 bytes starting at
; src_ptr - 4 in each row are transposed, filtered, and the 4 bytes at
; src_ptr - 2 .. src_ptr + 1 of each row are written back.
; Register roles: rsi = src_ptr - 4, rdi = rsi + pitch, rax = pitch,
; rcx = 3 * pitch.
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, lf_var_size            ; scratch area addressed as [rsp+_xx]

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        lea         rsi, [rsi - 4]              ; 4 pixels left of the edge
        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]            ; rcx = 3 * pitch

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        ; (leaves rsi/rdi pointing at rows 8 and 9)
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx, [rax]
        neg         rdx                         ; rdx = -pitch

        BV_WRITEBACK xmm1, xmm5                 ; rows 8..15

        lea         rsi, [rsi+rdx*8]            ; step back to rows 0 and 1
        lea         rdi, [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6                 ; rows 0..7

        add         rsp, lf_var_size            ; release scratch area
        pop         rsp                         ; undo ALIGN_STACK
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

%endif

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM 7
        GET_GOT     rbx
1003233d2500723e5594f3e7c70896ffeeef32b9c950ywan push rsi 1004233d2500723e5594f3e7c70896ffeeef32b9c950ywan push rdi 1005233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; end prolog 1006233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1007233d2500723e5594f3e7c70896ffeeef32b9c950ywan ALIGN_STACK 16, rax 1008233d2500723e5594f3e7c70896ffeeef32b9c950ywan sub rsp, lf_var_size 1009233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1010233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rsi, arg(0) ; u_ptr 1011233d2500723e5594f3e7c70896ffeeef32b9c950ywan movsxd rax, dword ptr arg(1) ; src_pixel_step 1012233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1013233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rsi, [rsi - 4] 1014233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1015233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rcx, [rax+2*rax] 1016233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1017233d2500723e5594f3e7c70896ffeeef32b9c950ywan ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
1018233d2500723e5594f3e7c70896ffeeef32b9c950ywan TRANSPOSE_16X8 0, 1 1019233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1020233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; calculate filter mask and high edge variance 1021233d2500723e5594f3e7c70896ffeeef32b9c950ywan LFV_FILTER_MASK_HEV_MASK 1022233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1023233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; start work on filters 1024233d2500723e5594f3e7c70896ffeeef32b9c950ywan B_FILTER 2 1025233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1026233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; transpose and write back - only work on q1, q0, p0, p1 1027233d2500723e5594f3e7c70896ffeeef32b9c950ywan BV_TRANSPOSE 1028233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1029233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1030233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1031233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; store 16-line result 1032233d2500723e5594f3e7c70896ffeeef32b9c950ywan BV_WRITEBACK xmm1, xmm5 1033233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1034233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rsi, arg(0) ; u_ptr 1035233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rsi, [rsi - 4] 1036233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 1037233d2500723e5594f3e7c70896ffeeef32b9c950ywan BV_WRITEBACK xmm2, xmm6 1038233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1039233d2500723e5594f3e7c70896ffeeef32b9c950ywan add rsp, lf_var_size 1040233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rsp 1041233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; begin epilog 1042233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rdi 1043233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rsi 1044233d2500723e5594f3e7c70896ffeeef32b9c950ywan RESTORE_GOT 1045233d2500723e5594f3e7c70896ffeeef32b9c950ywan RESTORE_XMM 1046233d2500723e5594f3e7c70896ffeeef32b9c950ywan UNSHADOW_ARGS 
1047233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rbp 1048233d2500723e5594f3e7c70896ffeeef32b9c950ywan ret 1049233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1050233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro MBV_TRANSPOSE 0 1051233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 1052233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 1053233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1054233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 1055233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 1056233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1057233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1058233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1059233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1060233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 1061233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 1062233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1063233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 1064233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 1065233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1066233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 1067233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 1068233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
1069233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 1070233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 1071233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1072233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 1073233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 1074233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1075233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 1076233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 1077233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1078233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 1079233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 1080233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1081233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 1082233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 1083233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1084233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 1085233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 1086233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro 1087233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1088233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro MBV_WRITEBACK_1 0 1089233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi], xmm0 1090233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi], 
xmm0 1091233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1092233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+2*rax], xmm6 1093233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+2*rax], xmm6 1094233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1095233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 1096233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 1097233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 1098233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1099233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+4*rax], xmm0 1100233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+4*rax], xmm0 1101233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1102233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+2*rcx], xmm3 1103233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+2*rcx], xmm3 1104233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1105233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 1106233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 1107233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 1108233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1109233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm0, xmm7 1110233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 1111233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 1112233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1113233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 1114233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq 
xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 1115233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 1116233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro 1117233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1118233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro MBV_WRITEBACK_2 0 1119233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi], xmm1 1120233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi], xmm1 1121233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1122233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+2*rax], xmm5 1123233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+2*rax], xmm5 1124233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1125233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 1126233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 1127233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 1128233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1129233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+4*rax], xmm1 1130233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+4*rax], xmm1 1131233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1132233d2500723e5594f3e7c70896ffeeef32b9c950ywan movq [rsi+2*rcx], xmm4 1133233d2500723e5594f3e7c70896ffeeef32b9c950ywan movhps [rdi+2*rcx], xmm4 1134233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro 1135233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1136233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1137233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_mbloop_filter_vertical_edge_sse2 1138233d2500723e5594f3e7c70896ffeeef32b9c950ywan;( 1139233d2500723e5594f3e7c70896ffeeef32b9c950ywan; unsigned char *src_ptr, 1140233d2500723e5594f3e7c70896ffeeef32b9c950ywan; int src_pixel_step, 1141233d2500723e5594f3e7c70896ffeeef32b9c950ywan; const char *blimit, 
;    const char    *limit,
;    const char    *thresh
;)
; Register roles: rsi/rdi = even/odd row base (src_ptr - 4), rcx = 3*pitch,
; rax = pitch, negated around the filter so the same lea can step the row
; pointers back eight rows, then restored before the writeback.
; Stack: lf_var_size bytes of 16-byte aligned scratch at [rsp].
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        lf_var_size             ; scratch slots (_t0 .. _q3)

    mov         rsi,        arg(0)                  ; src_ptr
    movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

    lea         rsi,        [rsi - 4]               ; start 4 bytes left of the edge
    lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
    lea         rcx,        [rax*2+rax]             ; rcx = 3 * pitch

    ; Transpose
    TRANSPOSE_16X8 1, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK

    neg         rax                                 ; rax = -pitch
    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    lea         rsi,        [rsi+rax*8]             ; rax is negative here: back up 8 rows
    lea         rdi,        [rdi+rax*8]

    ; transpose and write back
    MBV_TRANSPOSE

    neg         rax                                 ; rax = +pitch again

    MBV_WRITEBACK_1                                 ; rows 0-7


    lea         rsi,        [rsi+rax*8]             ; advance 8 rows
    lea         rdi,        [rdi+rax*8]
    MBV_WRITEBACK_2                                 ; rows 8-f

    add         rsp,        lf_var_size
    pop         rsp                                 ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
; Register roles: rsi/rdi = even/odd row base, rax = pitch, rcx = 3*pitch.
; The first eight transposed rows are written to the u plane (arg(0)) and
; the last eight to the v plane (arg(5)).
; Stack: lf_var_size bytes of 16-byte aligned scratch at [rsp].
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        lf_var_size             ; scratch slots (_t0 .. _q3)

    mov         rsi,        arg(0)                  ; u_ptr
    movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

    lea         rsi,        [rsi - 4]               ; start 4 bytes left of the edge
    lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
    lea         rcx,        [rax+2*rax]             ; rcx = 3 * pitch

    ; Transpose
    TRANSPOSE_16X8 0, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK

    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    ; transpose and write back
    MBV_TRANSPOSE

    mov         rsi,        arg(0)                  ; u_ptr
    lea         rsi,        [rsi - 4]
    lea         rdi,        [rsi + rax]             ; row +1
    MBV_WRITEBACK_1                                 ; rows 0-7 -> u plane
    mov         rsi,        arg(5)                  ; v_ptr
    lea         rsi,        [rsi - 4]
    lea         rdi,        [rsi + rax]             ; row +1
    MBV_WRITEBACK_2                                 ; rows 8-f -> v plane

    add         rsp,        lf_var_size
    pop         rsp                                 ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
; Filters 16 pixels across a horizontal edge, modifying only p0 and q0.
; Register roles: rcx = src_ptr (q0 row), rax = -pitch after the initial
; neg, rdx = q1 row pointer, then reused for blimit.
; All row accesses use movdqa, so src_ptr, the pitch and blimit must keep
; every access 16-byte aligned.
; Constants tfe, t80, t3, t4, te0 and t1f are defined elsewhere in the file;
; the per-line comments assume their names describe their byte values.
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

    mov         rcx,        arg(0)                  ; src_ptr
    movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
    movdqa      xmm6,       [GLOBAL(tfe)]
    lea         rdx,        [rcx + rax]             ; rdx = q1 row
    neg         rax                                 ; rax = -pitch, to reach p0 / p1

    ; calculate mask
    movdqa      xmm0,       [rdx]                   ; q1
    mov         rdx,        arg(2)                  ; blimit
    movdqa      xmm1,       [rcx+2*rax]             ; p1

    movdqa      xmm2,       xmm1                    ; keep p1 for the filter
    movdqa      xmm3,       xmm0                    ; keep q1 for the filter

    psubusb     xmm0,       xmm1                    ; q1-=p1
    psubusb     xmm1,       xmm3                    ; p1-=q1
    por         xmm1,       xmm0                    ; abs(p1-q1)
    pand        xmm1,       xmm6                    ; set lsb of each byte to zero
    psrlw       xmm1,       1                       ; abs(p1-q1)/2

    movdqa      xmm7,       XMMWORD PTR [rdx]       ; blimit

    movdqa      xmm5,       [rcx+rax]               ; p0
    movdqa      xmm4,       [rcx]                   ; q0
    movdqa      xmm0,       xmm4                    ; q0
    movdqa      xmm6,       xmm5                    ; p0
    psubusb     xmm5,       xmm4                    ; p0-=q0
    psubusb     xmm4,       xmm6                    ; q0-=p0
    por         xmm5,       xmm4                    ; abs(p0 - q0)

    movdqa      xmm4,       [GLOBAL(t80)]

    paddusb     xmm5,       xmm5                    ; abs(p0-q0)*2
    paddusb     xmm5,       xmm1                    ; abs (p0 - q0) *2 + abs(p1-q1)/2
    psubusb     xmm5,       xmm7                    ; non-zero where abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
    pxor        xmm7,       xmm7
    pcmpeqb     xmm5,       xmm7                    ; mask = 0xff where the sum is <= blimit


    ; start work on filters
    pxor        xmm2,       xmm4                    ; p1 offset to convert to signed values
    pxor        xmm3,       xmm4                    ; q1 offset to convert to signed values
    psubsb      xmm2,       xmm3                    ; p1 - q1

    pxor        xmm6,       xmm4                    ; p0 offset to convert to signed values
    pxor        xmm0,       xmm4                    ; q0 offset to convert to signed values
    movdqa      xmm3,       xmm0                    ; q0
    psubsb      xmm0,       xmm6                    ; q0 - p0
    paddsb      xmm2,       xmm0                    ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm2,       xmm0                    ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm2,       xmm0                    ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5,       xmm2                    ; mask filter values we don't care about

    movdqa      xmm0,       xmm5
    paddsb      xmm5,       [GLOBAL(t3)]            ; 3* (q0 - p0) + (p1 - q1) + 3  (added to p0 below)
    paddsb      xmm0,       [GLOBAL(t4)]            ; 3* (q0 - p0) + (p1 - q1) + 4  (subtracted from q0 below)

    movdqa      xmm1,       [GLOBAL(te0)]
    movdqa      xmm2,       [GLOBAL(t1f)]

    ; Per-byte arithmetic >> 3 built from a word shift plus sign fix-up.
;   pxor        xmm7, xmm7                          ; not needed: xmm7 is still zero from the mask step
    pcmpgtb     xmm7,       xmm0                    ; save sign
    pand        xmm7,       xmm1                    ; preserve the upper 3 bits
    psrlw       xmm0,       3
    pand        xmm0,       xmm2                    ; clear out upper 3 bits
    por         xmm0,       xmm7                    ; add sign
    psubsb      xmm3,       xmm0                    ; q0-= q0sz add

    pxor        xmm7,       xmm7
    pcmpgtb     xmm7,       xmm5                    ; save sign
    pand        xmm7,       xmm1                    ; preserve the upper 3 bits
    psrlw       xmm5,       3
    pand        xmm5,       xmm2                    ; clear out upper 3 bits
    por         xmm5,       xmm7                    ; add sign
    paddsb      xmm6,       xmm5                    ; p0+= p0 add

    pxor        xmm3,       xmm4                    ; unoffset
    movdqa      [rcx],      xmm3                    ; write back q0

    pxor        xmm6,       xmm4                    ; unoffset
    movdqa      [rcx+rax],  xmm6                    ; write back p0

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                                 ; save old base pointer value.
    mov         rbp,        rsp                     ; set new base pointer value.
1381233d2500723e5594f3e7c70896ffeeef32b9c950ywan SHADOW_ARGS_TO_STACK 3 1382233d2500723e5594f3e7c70896ffeeef32b9c950ywan SAVE_XMM 7 1383233d2500723e5594f3e7c70896ffeeef32b9c950ywan GET_GOT rbx ; save callee-saved reg 1384233d2500723e5594f3e7c70896ffeeef32b9c950ywan push rsi 1385233d2500723e5594f3e7c70896ffeeef32b9c950ywan push rdi 1386233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; end prolog 1387233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1388233d2500723e5594f3e7c70896ffeeef32b9c950ywan ALIGN_STACK 16, rax 1389233d2500723e5594f3e7c70896ffeeef32b9c950ywan sub rsp, 32 ; reserve 32 bytes 1390233d2500723e5594f3e7c70896ffeeef32b9c950ywan %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; 1391233d2500723e5594f3e7c70896ffeeef32b9c950ywan %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; 1392233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1393233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rsi, arg(0) ;src_ptr 1394233d2500723e5594f3e7c70896ffeeef32b9c950ywan movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
1395233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1396233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rsi, [rsi - 2 ] 1397233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rsi + rax] 1398233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdx, [rsi + rax*4] 1399233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rcx, [rdx + rax] 1400233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1401233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 1402233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 1403233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm2, [rdi] ; 13 12 11 10 1404233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm3, [rcx] ; 53 52 51 50 1405233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 1406233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 1407233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1408233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm4, [rsi + rax*2] ; 23 22 21 20 1409233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm5, [rdx + rax*2] ; 63 62 61 60 1410233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm6, [rdi + rax*2] ; 33 32 31 30 1411233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm7, [rcx + rax*2] ; 73 72 71 70 1412233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 1413233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 1414233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1415233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 1416233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 1417233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1418233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, xmm0 
1419233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 1420233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 1421233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1422233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, xmm0 1423233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 1424233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 1425233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1426233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rsi, [rsi + rax*8] 1427233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rsi + rax] 1428233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdx, [rsi + rax*4] 1429233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rcx, [rdx + rax] 1430233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1431233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm4, [rsi] ; 83 82 81 80 1432233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm1, [rdx] ; c3 c2 c1 c0 1433233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm6, [rdi] ; 93 92 91 90 1434233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm3, [rcx] ; d3 d2 d1 d0 1435233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 1436233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 1437233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1438233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 1439233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 1440233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 1441233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 1442233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 
1443233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 1444233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1445233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 1446233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 1447233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1448233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm4 1449233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 1450233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 1451233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1452233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm4 1453233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 1454233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 1455233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1456233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm1, xmm0 1457233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm3, xmm2 1458233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1459233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 1460233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 1461233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 1462233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 1463233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1464233d2500723e5594f3e7c70896ffeeef32b9c950ywan mov rdx, arg(2) ;blimit 1465233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
1466233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; calculate mask 1467233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm0 ; p1 1468233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, xmm3 ; q1 1469233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubusb xmm7, xmm0 ; q1-=p1 1470233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubusb xmm6, xmm3 ; p1-=q1 1471233d2500723e5594f3e7c70896ffeeef32b9c950ywan por xmm6, xmm7 ; abs(p1-q1) 1472233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero 1473233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrlw xmm6, 1 ; abs(p1-q1)/2 1474233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1475233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm7, [rdx] 1476233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1477233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm1 ; p0 1478233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, xmm2 ; q0 1479233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubusb xmm5, xmm2 ; p0-=q0 1480233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubusb xmm4, xmm1 ; q0-=p0 1481233d2500723e5594f3e7c70896ffeeef32b9c950ywan por xmm5, xmm4 ; abs(p0 - q0) 1482233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddusb xmm5, xmm5 ; abs(p0-q0)*2 1483233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 1484233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1485233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, [GLOBAL(t80)] 1486233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1487233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit 1488233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm7, xmm7 1489233d2500723e5594f3e7c70896ffeeef32b9c950ywan pcmpeqb xmm5, xmm7 ; mm5 = mask 1490233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1491233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; start work on filters 1492233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa t0, xmm0 
1493233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa t1, xmm3 1494233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1495233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm0, xmm4 ; p1 offset to convert to signed values 1496233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm3, xmm4 ; q1 offset to convert to signed values 1497233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm0, xmm3 ; p1 - q1 1498233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1499233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm1, xmm4 ; offset to convert to signed values 1500233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm2, xmm4 ; offset to convert to signed values 1501233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1502233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm3, xmm2 ; offseted ; q0 1503233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm2, xmm1 ; q0 - p0 1504233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) 1505233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) 1506233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) 1507233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm5, xmm0 ; mask filter values we don't care about 1508233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1509233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm0, xmm5 1510233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 1511233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 1512233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1513233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, [GLOBAL(te0)] 1514233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, [GLOBAL(t1f)] 1515233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1516233d2500723e5594f3e7c70896ffeeef32b9c950ywan; pxor xmm7, xmm7 1517233d2500723e5594f3e7c70896ffeeef32b9c950ywan pcmpgtb xmm7, xmm0 ;save sign 
1518233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm7, xmm6 ;preserve the upper 3 bits 1519233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrlw xmm0, 3 1520233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm0, xmm2 ;clear out upper 3 bits 1521233d2500723e5594f3e7c70896ffeeef32b9c950ywan por xmm0, xmm7 ;add sign 1522233d2500723e5594f3e7c70896ffeeef32b9c950ywan psubsb xmm3, xmm0 ; q0-= q0sz add 1523233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1524233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm7, xmm7 1525233d2500723e5594f3e7c70896ffeeef32b9c950ywan pcmpgtb xmm7, xmm5 ;save sign 1526233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm7, xmm6 ;preserve the upper 3 bits 1527233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrlw xmm5, 3 1528233d2500723e5594f3e7c70896ffeeef32b9c950ywan pand xmm5, xmm2 ;clear out upper 3 bits 1529233d2500723e5594f3e7c70896ffeeef32b9c950ywan por xmm5, xmm7 ;add sign 1530233d2500723e5594f3e7c70896ffeeef32b9c950ywan paddsb xmm1, xmm5 ; p0+= p0 add 1531233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1532233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm3, xmm4 ; unoffset q0 1533233d2500723e5594f3e7c70896ffeeef32b9c950ywan pxor xmm1, xmm4 ; unoffset p0 1534233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1535233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm0, t0 ; p1 1536233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm4, t1 ; q1 1537233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1538233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; write out order: xmm0 xmm2 xmm1 xmm3 1539233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdx, [rsi + rax*4] 1540233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1541233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; transpose back to write out 1542233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 1543233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 1544233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; q0 f2 e2 d2 c2 b2 
a2 92 82 72 62 52 42 32 22 12 02 1545233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 1546233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm6, xmm0 1547233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 1548233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 1549233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1550233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm5, xmm3 1551233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 1552233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 1553233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1554233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm2, xmm0 1555233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 1556233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 1557233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1558233d2500723e5594f3e7c70896ffeeef32b9c950ywan movdqa xmm3, xmm6 1559233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 1560233d2500723e5594f3e7c70896ffeeef32b9c950ywan punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 1561233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1562233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rsi], xmm6 ; write the second 8-line result 1563233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdx], xmm3 1564233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm6, 4 1565233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm3, 4 1566233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdi], xmm6 1567233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rcx], xmm3 
1568233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm6, 4 1569233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm3, 4 1570233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rsi + rax*2], xmm6 1571233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdx + rax*2], xmm3 1572233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm6, 4 1573233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm3, 4 1574233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdi + rax*2], xmm6 1575233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rcx + rax*2], xmm3 1576233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1577233d2500723e5594f3e7c70896ffeeef32b9c950ywan neg rax 1578233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rsi, [rsi + rax*8] 1579233d2500723e5594f3e7c70896ffeeef32b9c950ywan neg rax 1580233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdi, [rsi + rax] 1581233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rdx, [rsi + rax*4] 1582233d2500723e5594f3e7c70896ffeeef32b9c950ywan lea rcx, [rdx + rax] 1583233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1584233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rsi], xmm0 ; write the first 8-line result 1585233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdx], xmm2 1586233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm0, 4 1587233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm2, 4 1588233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdi], xmm0 1589233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rcx], xmm2 1590233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm0, 4 1591233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm2, 4 1592233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rsi + rax*2], xmm0 1593233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdx + rax*2], xmm2 1594233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm0, 4 1595233d2500723e5594f3e7c70896ffeeef32b9c950ywan psrldq xmm2, 4 1596233d2500723e5594f3e7c70896ffeeef32b9c950ywan movd [rdi + rax*2], xmm0 1597233d2500723e5594f3e7c70896ffeeef32b9c950ywan 
movd [rcx + rax*2], xmm2 1598233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1599233d2500723e5594f3e7c70896ffeeef32b9c950ywan add rsp, 32 1600233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rsp 1601233d2500723e5594f3e7c70896ffeeef32b9c950ywan ; begin epilog 1602233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rdi 1603233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rsi 1604233d2500723e5594f3e7c70896ffeeef32b9c950ywan RESTORE_GOT 1605233d2500723e5594f3e7c70896ffeeef32b9c950ywan RESTORE_XMM 1606233d2500723e5594f3e7c70896ffeeef32b9c950ywan UNSHADOW_ARGS 1607233d2500723e5594f3e7c70896ffeeef32b9c950ywan pop rbp 1608233d2500723e5594f3e7c70896ffeeef32b9c950ywan ret 1609233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1610233d2500723e5594f3e7c70896ffeeef32b9c950ywanSECTION_RODATA 1611233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1612233d2500723e5594f3e7c70896ffeeef32b9c950ywantfe: 1613233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0xfe 1614233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1615233d2500723e5594f3e7c70896ffeeef32b9c950ywant80: 1616233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0x80 1617233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1618233d2500723e5594f3e7c70896ffeeef32b9c950ywant1s: 1619233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0x01 1620233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1621233d2500723e5594f3e7c70896ffeeef32b9c950ywant3: 1622233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0x03 1623233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1624233d2500723e5594f3e7c70896ffeeef32b9c950ywant4: 1625233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0x04 1626233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1627233d2500723e5594f3e7c70896ffeeef32b9c950ywanones: 1628233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 8 dw 0x0001 1629233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1630233d2500723e5594f3e7c70896ffeeef32b9c950ywans9: 1631233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 8 
dw 0x0900 1632233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1633233d2500723e5594f3e7c70896ffeeef32b9c950ywans63: 1634233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 8 dw 0x003f 1635233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1636233d2500723e5594f3e7c70896ffeeef32b9c950ywante0: 1637233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0xe0 1638233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16 1639233d2500723e5594f3e7c70896ffeeef32b9c950ywant1f: 1640233d2500723e5594f3e7c70896ffeeef32b9c950ywan times 16 db 0x1f 1641