;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
;
; Filters across a horizontal edge, 8 pixels (one MMX register) per loop
; iteration, for `count` iterations; src_ptr advances by 8 bytes each time.
;
; Row naming used in the comments below (pitch = src_pixel_step):
;   q0 = [src_ptr]            q1 = [src_ptr + 1*pitch]
;   q2 = [src_ptr + 2*pitch]  q3 = [src_ptr + 3*pitch]
;   p0 = [src_ptr - 1*pitch]  p1 = [src_ptr - 2*pitch]
;   p2 = [src_ptr - 3*pitch]  p3 = [src_ptr - 4*pitch]
;
; All eight rows are read; only p1, p0, q0 and q1 are written back.
; blimit, limit and thresh are each loaded as 8 bytes (movq), i.e. one
; threshold byte per pixel column.
;
; Register roles inside the loop:
;   rsi = q0 row, rdi = q1 row, rax = +pitch (negated while addressing the
;   p rows and restored at the bottom of the loop), rcx = remaining count.
;   mm1 accumulates the filter mask, mm4 the high-edge-variance (hev) mask.
;   t0 / t1 are stack scratch slots holding abs(q1-q0) / abs(p1-p0).
global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

        movsxd      rcx, dword ptr arg(5) ;count
.next8_h:
        mov         rdx, arg(3) ;limit
        movq        mm7, [rdx]
        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions
        ; Each abs() below is built from two saturating subtractions OR-ed
        ; together; subtracting the limit (saturating) then leaves a non-zero
        ; byte exactly where the difference exceeds the limit.
        movq        mm2, [rdi+2*rax]      ; q3
        movq        mm1, [rsi+2*rax]      ; q2
        movq        mm6, mm1              ; q2
        psubusb     mm1, mm2              ; q2-=q3
        psubusb     mm2, mm6              ; q3-=q2
        por         mm1, mm2              ; abs(q3-q2)
        psubusb     mm1, mm7              ; non-zero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]        ; q1
        movq        mm3, mm4              ; q1
        psubusb     mm4, mm6              ; q1-=q2
        psubusb     mm6, mm3              ; q2-=q1
        por         mm4, mm6              ; abs(q2-q1)

        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0
        psubusb     mm4, mm3              ; q0-=q1
        psubusb     mm3, mm0              ; q1-=q0
        por         mm4, mm3              ; abs(q0-q1)
        movq        t0, mm4               ; save to t0
        psubusb     mm4, mm7
        por         mm1, mm4


        neg         rax                   ; negate pitch to deal with above border

        movq        mm2, [rsi+4*rax]      ; p3
        movq        mm4, [rdi+4*rax]      ; p2
        movq        mm5, mm4              ; p2
        psubusb     mm4, mm2              ; p2-=p3
        psubusb     mm2, mm5              ; p3-=p2
        por         mm4, mm2              ; abs(p3 - p2)
        psubusb     mm4, mm7
        por         mm1, mm4


        movq        mm4, [rsi+2*rax]      ; p1
        movq        mm3, mm4              ; p1
        psubusb     mm4, mm5              ; p1-=p2
        psubusb     mm5, mm3              ; p2-=p1
        por         mm4, mm5              ; abs(p2 - p1)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm2, mm3              ; p1

        movq        mm4, [rsi+rax]        ; p0
        movq        mm5, mm4              ; p0
        psubusb     mm4, mm3              ; p0-=p1
        psubusb     mm3, mm5              ; p1-=p0
        por         mm4, mm3              ; abs(p1 - p0)
        movq        t1, mm4               ; save to t1
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm3, [rdi]            ; q1
        movq        mm4, mm3              ; q1
        psubusb     mm3, mm2              ; q1-=p1
        psubusb     mm2, mm4              ; p1-=q1
        por         mm2, mm3              ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
        psrlw       mm2, 1                ; abs(p1-q1)/2

        movq        mm6, mm5              ; p0
        movq        mm3, [rsi]            ; q0
        psubusb     mm5, mm3              ; p0-=q0
        psubusb     mm3, mm6              ; q0-=p0
        por         mm5, mm3              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2) ;blimit   ; get blimit
        movq        mm7, [rdx]            ; blimit

        psubusb     mm5, mm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1, mm5
        pxor        mm5, mm5
        pcmpeqb     mm1, mm5              ; mask mm1: 0xff where every test passed

        ; calculate high edge variance
        mov         rdx, arg(4) ;thresh   ; get thresh
        movq        mm7, [rdx]            ;
        movq        mm4, t0               ; get abs (q1 - q0)
        psubusb     mm4, mm7
        movq        mm3, t1               ; get abs (p1 - p0)
        psubusb     mm3, mm7
        ; Combine with OR, not a wrapping byte add: two non-zero bytes can
        ; sum to 256 (e.g. 0x80 + 0x80) and wrap to 0, which would read as
        ; "no high variance". OR keeps a byte non-zero whenever either input
        ; is non-zero, matching the || below and the vertical-edge routine.
        por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     mm4, mm5              ; 0xff where neither difference exceeds thresh

        pcmpeqb     mm5, mm5              ; all ones
        pxor        mm4, mm5              ; invert: mm4 = hev mask


        ; start work on filters
        movq        mm2, [rsi+2*rax]      ; p1
        movq        mm7, [rdi]            ; q1
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1
        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2              ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; Signed per-byte >>3: unpack each byte into the high half of a
        ; word, arithmetic-shift the word by 8+3 = 11, then repack.
        pxor        mm0, mm0              ;
        pxor        mm5, mm5
        punpcklbw   mm0, mm2              ;
        punpckhbw   mm5, mm2              ;
        psraw       mm0, 11               ;
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0              ; 0
        movq        mm5, mm1              ; abcdefgh
        punpcklbw   mm0, mm1              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        pxor        mm1, mm1              ; 0
        punpckhbw   mm1, mm5              ; a0b0c0d0
        psraw       mm1, 11               ; sign extended shift right by 3
        movq        mm5, mm0              ; save results

        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [GLOBAL(ones)]   ; presumably +1 per word (constant defined elsewhere)
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + ones) >>1
        pandn       mm4, mm5              ; keep the outer-tap adjustment only where hev is clear

        paddsb      mm6, mm2              ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+rax], mm6        ; write back

        movq        mm6, [rsi+2*rax]      ; p1
        pxor        mm6, [GLOBAL(t80)]    ; reoffset
        paddsb      mm6, mm4              ; p1+= p1 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+2*rax], mm6      ; write back

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]    ; unoffset
        movq        [rsi], mm3            ; write back

        psubsb      mm7, mm4              ; q1-= q1 add
        pxor        mm7, [GLOBAL(t80)]    ; unoffset
        movq        [rdi], mm7            ; write back

        add         rsi, 8                ; next 8 pixel columns
        neg         rax                   ; restore positive pitch for the next iteration
        dec         rcx
        jnz         .next8_h

    add rsp, 32
    pop rsp                               ; pairs with ALIGN_STACK in the prologue
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 64 ; reserve 64 bytes 24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; 25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*4 - 4] 25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 2581b362b15af34006e6a11974088a46d42b903418eJohann.next8_v: 25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, rsi ; rdi points to row +1 for indirect addressing 26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, rax 26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ;transpose 26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; 77 76 75 74 73 72 71 70 26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; 47 46 45 44 43 42 41 40 27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 57 47 56 46 55 45 54 44 27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm4 ; 53 43 52 42 51 41 50 40 28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm6 ; 27 26 25 24 23 22 21 20 28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm7 ; 17 07 16 06 15 05 14 04 29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm7 ; 37 27 17 07 36 26 16 06 
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm6 ; 76 66 56 46 36 26 16 06 30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q2-q3 30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q3-q2 30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5; ; mm7=abs (q3-q2) 31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; 35 25 15 05 34 24 14 04 31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm6 ; q1-q2 31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm5 ; q2-q1 31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm6, mm3 ; mm6=abs(q2-q1) 32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+24], mm5 ; save q1 32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+16], mm0 ; save q0 32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm3 ; 13 03 12 02 11 01 10 00 33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 31 21 11 01 30 20 10 00 33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm0 ; p2-p3 34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm1 ; p3-p2 34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm2 ; mm0=abs(p3-p2) 34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; 33 23 13 03 32 22 12 02 34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+8], mm3 ; save p0 
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx], mm2 ; save p1 35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; mm5 = p1 35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm1 ; p1-p2 35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm5 ; p2-p1 35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm2 ; mm1=abs(p2-p1) 35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ;limit 35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rdx] ; mm4 = limit 36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm4 36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm4 36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm4 36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm6 36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm1 36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm5 ; p1 37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm3 ; mm3=mm7=p0 37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm5 ; p0 - p1 37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p1 - p0 37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm7 ; abs(p1-p0) 37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm5 ; save abs(p1-p0) 38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm4 38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm5 ; mm0=mask 38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, [rdx+16] ; mm5=q0 38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+24] ; mm7=q1 38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm5 ; mm6=q0 38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm7 ; q1 39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q0-q1 39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q1-q0 39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5 ; abs(q1-q0) 39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm7 ; save abs(q1-q0) 39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; mask 39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; q1 40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm1 ; q1-=p1 40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm2 ; p1-=q1 
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm1 ; abs(p1-q1) 404538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero 40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm5, 1 ; abs(p1-q1)/2 40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4071b362b15af34006e6a11974088a46d42b903418eJohann mov rdx, arg(2) ;blimit ; 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4091b362b15af34006e6a11974088a46d42b903418eJohann movq mm4, [rdx] ;blimit 41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm3 ; mm1=mm3=p0 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; mm7=mm6=q0 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm7 ; p0-q0 41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm3 ; q0-p0 41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm7 ; abs(q0-p0) 41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm1 ; abs(q0-p0)*2 41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4201b362b15af34006e6a11974088a46d42b903418eJohann psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit 42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm0; ; mask 42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm1, mm0 42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate high edge variance 42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(4) ;thresh ; get thresh 
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; 43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t0 ; get abs (q1 - q0) 43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, t1 ; get abs (p1 - p0) 43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm7 43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm4, mm0 43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm0, mm0 44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm4, mm0 44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx] ; p1 44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+24] ; q1 44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx+8] ; p0 45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdx+16] ; q0 45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 453538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values 454538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm7, [GLOBAL(t80)] ; q1 offset to 
convert to signed values 45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm2, mm7 ; p1 - q1 45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, mm4 ; high var mask (hvm)(p1 - q1) 45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 459538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 460538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm6 ; q0 - p0 46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) 46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) 46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) 46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm1, mm2 ; mask filter values we don't care about 47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 472538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 474538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm5, mm5 47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw 
mm0, mm2 ; 47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm5, mm2 ; 48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; 48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm5, 11 48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm5 48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; 48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; abcdefgh 49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm1 ; e0f0g0h0 49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 ; 0 49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm5 ; a0b0c0d0 49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 11 ; sign extended shift right by 3 49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; save results 49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 501538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsw mm5, [GLOBAL(ones)] 50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 503538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsw mm1, [GLOBAL(ones)] 50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm5, 1 ; partial 
shifted one more time for 2nd tap 50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 1 ; partial shifted one more time for 2nd tap 50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pandn mm4, mm5 ; high edge variance additive 51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm2 ; p0+= p0 add 512538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; unoffset 51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6=p0 ; 51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rdx] ; p1 516538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm1, [GLOBAL(t80)] ; reoffset 51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm1, mm4 ; p1+= p1 add 519538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm1, [GLOBAL(t80)] ; unoffset 52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = p0 mm1 = p1 52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; q0-= q0 add 523538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] ; unoffset 52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3 = q0 52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm7, mm4 ; q1-= q1 add 527538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm7, [GLOBAL(t80)] ; unoffset 52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm7 = q1 52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian ; transpose and write back 53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = 72 62 52 42 32 22 12 02 53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = 73 63 53 43 33 23 13 03 53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3 = 74 64 54 44 34 24 14 04 53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm7 = 75 65 55 45 35 25 15 05 53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 72 62 52 42 32 22 12 02 53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; 74 64 54 44 34 24 14 04 54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; 33 32 23 22 13 12 03 02 54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; 73 72 63 62 53 52 43 42 55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 
55390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm2 = 15 14 13 12 05 04 03 02 55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = 35 34 33 32 25 24 23 22 55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm5 = 55 54 53 52 45 44 43 42 55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = 75 74 73 72 65 64 63 62 55990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*4+2], mm2 56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm2, 32 56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax*4+2], mm2 56690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*2+2], mm6 56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax+2],mm6 57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+2], mm1 57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm1, 32 57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+2], mm1 57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax+2],mm5 57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm5, 32 57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax*2+2], mm5 
58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi+rax*8] 58390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 5841b362b15af34006e6a11974088a46d42b903418eJohann jnz .next8_v 58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 64 58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 59590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_horizontal_edge_mmx 59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 6011b362b15af34006e6a11974088a46d42b903418eJohann; const char *blimit, 60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *limit, 60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *thresh, 60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int count 60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 6061b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE 60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_horizontal_edge_mmx): 60890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 
60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 61190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 32 ; reserve 32 bytes 61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 6251b362b15af34006e6a11974088a46d42b903418eJohann.next8_mbh: 62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ;limit 62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, rsi ; rdi points to row +1 for indirect addressing 62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, rax 63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate breakout conditions 63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdi+2*rax] ; q3 63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rsi+2*rax] ; q2 63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm1 ; q2 63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm2 ; q2-=q3 63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm6 ; q3-=q2 63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm2 ; abs(q3-q2) 63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm7 64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit 64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi+rax] ; q1 64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm4 ; q1 64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm6 ; q1-=q2 64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm3 ; q2-=q1 64790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm6 ; abs(q2-q1) 64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
psubusb mm4, mm7 64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 65090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm3=q1, mm7 = limit 65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi] ; q0 65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm4 ; q0 65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm3 ; q0-=q1 65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm0 ; q1-=q0 65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(q0-q1) 65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm4 ; save to t0 66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax ; negate pitch to deal with above border 66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rsi+4*rax] ; p3 66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rdi+4*rax] ; p2 67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; p2 67190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm2 ; p2-=p3 67290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm5 ; p3-=p2 67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm2 ; abs(p3 - p2) 67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 
67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi+2*rax] ; p1 67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm4 ; p1 68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm5 ; p1-=p2 68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p2-=p1 68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm5 ; abs(p2 - p1) 68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; p1 68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi+rax] ; p0 69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; p0 69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm3 ; p0-=p1 69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm5 ; p1-=p0 69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(p1 - p0) 69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm4 ; save to t1 69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) 
70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm5 = p0 70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi] ; q1 70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; q1 70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm2 ; q1-=p1 70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm4 ; p1-=q1 70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm2, mm3 ; abs(p1-q1) 706538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero 70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm2, 1 ; abs(p1-q1)/2 70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm5 ; p0 71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p0-=q0 71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm6 ; q0-=p0 71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm3 ; abs(p0 - q0) 71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm5 ; abs(p0-q0)*2 71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 7171b362b15af34006e6a11974088a46d42b903418eJohann mov rdx, arg(2) ;blimit ; get blimit 7181b362b15af34006e6a11974088a46d42b903418eJohann movq mm7, [rdx] ; blimit 71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 7201b362b15af34006e6a11974088a46d42b903418eJohann psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit 72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm5 72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm5, mm5 72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm1, mm5 ; mask mm1 72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
7251b362b15af34006e6a11974088a46d42b903418eJohann ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) 72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = p0, 72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate high edge variance 72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(4) ;thresh ; get thresh 73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] ; 73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t0 ; get abs (q1 - q0) 73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, t1 ; get abs (p1 - p0) 73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm7 73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm4, mm5 73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm5, mm5 74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm4, mm5 74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) 74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = p0, mm4=hev 74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rsi+2*rax] ; p1 74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdi] ; q1 749538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm2, [GLOBAL(t80)] ; p1 offset to 
convert to signed values 750538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm2, mm7 ; p1 - q1 75290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 753538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 754538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 75590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm6 ; q0 - p0 75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) 75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 2 * (q0 - p0) 75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) 76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm1, mm2 ; mask filter values we don't care about 76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; vp8_filter 76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, mm4; ; Filter2 = vp8_filter & hev 76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; 768538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm5, [GLOBAL(t3)]; 76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm5 ; e0f0g0h0 77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm5 ; a0b0c0d0 77690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 77790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; Filter2 78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 781538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) 78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm2 ; e0f0g0h0 78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm2 ; a0b0c0d0 78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; qs0 =qs0 - filter1 79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 
79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; vp8_filter &= ~hev; 79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Filter2 = vp8_filter; 79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pandn mm4, mm1 ; vp8_filter&=~hev 79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3=qs0, mm4=filter2, mm6=ps0 80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs0 - u); 80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq0 = s^0x80; 80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps0 + u); 80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op0 = s^0x80; 80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 814538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm1, [GLOBAL(s27)] 815538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm2, [GLOBAL(s27)] 816538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(s63)] 817538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(s63)] 81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 825538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 826538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax], mm6 82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi], mm3 82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 2/7th difference across boundary 83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs1 - u); 83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq1 = s^0x80; 83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps1 + u); 83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op1 = s^0x80; 83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 840538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm1, [GLOBAL(s18)] 841538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm2, [GLOBAL(s18)] 842538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(s63)] 843538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(s63)] 84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 
84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi] 84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+rax*2] ; p1 85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 851538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 852538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 85490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 85590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 85690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 857538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 858538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 85990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi], mm3 86090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*2], mm6 86190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 86290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 1/7th difference across boundary 86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 86490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs2 - u); 86590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq2 = s^0x80; 86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps2 + u); 86790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op2 = s^0x80; 86890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 87090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 87190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 872538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm1, [GLOBAL(s9)] 
873538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm2, [GLOBAL(s9)] 874538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(s63)] 875538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(s63)] 87690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 87790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 87890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 87990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 88190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdi+rax*4] 88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 88390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi+rax ] 88490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 885538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 886538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 88790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 88890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 88990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 89090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 891538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 892538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 89390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax ], mm3 89490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 89590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*4], mm6 89690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 89790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;EARLY_BREAK_OUT: 89890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 89990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi,8 90090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 
9011b362b15af34006e6a11974088a46d42b903418eJohann jnz .next8_mbh 90290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 90390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 32 90490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 90590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 90690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 90790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 90890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 90990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 91090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 91190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 91290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 91390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 91490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_vertical_edge_mmx 91590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 91690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 9181b362b15af34006e6a11974088a46d42b903418eJohann; const char *blimit, 91990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *limit, 92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *thresh, 92190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int count 92290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 9231b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE 92490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_vertical_edge_mmx): 92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 92690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 92890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 
92990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 93190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 93390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 96 ; reserve 96 bytes 93590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 93790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 93990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 94090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*4 - 4] 94390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 9451b362b15af34006e6a11974088a46d42b903418eJohann.next8_mbv: 94690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ;transpose 94990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 95190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 95290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; 67 66 65 64 63 62 61 60 95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 95490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 95590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 95690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 95890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 95990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; 47 46 45 44 43 42 41 40 96090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 96190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 96290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 96390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 96490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 57 47 56 46 55 45 54 44 
96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 96690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 96790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm4 ; 53 43 52 42 51 41 50 40 96990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 97190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 97490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 97690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 97790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm6 ; 27 26 25 24 23 22 21 20 97990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm6, mm7 ; 37 27 36 26 35 25 34 24 98090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 98290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 98490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 98590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm7 ; 17 07 16 06 15 05 14 04 98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 
98890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 99090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm7 ; 37 27 17 07 36 26 16 06 99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 99390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 99690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm6 ; 76 66 56 46 36 26 16 06 99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+56], mm7 99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q2-q3 100090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+48], mm6 100390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q3-q2 100490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5; ; mm7=abs (q3-q2) 100690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; 35 25 15 05 34 24 14 04 100790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 100990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0 101090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm6 
; q1-q2 101390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm5 ; q2-q1 101590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm6, mm3 ; mm6=abs(q2-q1) 101690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+40], mm5 ; save q1 101890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+32], mm0 ; save q0 101990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 102190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 102290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm3 ; 13 03 12 02 11 01 10 00 102490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 102590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 102790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 31 21 11 01 30 20 10 00 102890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 103090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 103190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx], mm0 ; save p3 103390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+8], mm1 ; save p2 103490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 103690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm0 ; 
p2-p3 103790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm1 ; p3-p2 103990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm2 ; mm0=abs(p3-p2) 104090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; 33 23 13 03 32 22 12 02 104290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 104390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+24], mm3 ; save p0 104690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+16], mm2 ; save p1 104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; mm5 = p1 104990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm1 ; p1-p2 105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm5 ; p2-p1 105290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm2 ; mm1=abs(p2-p1) 105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ;limit 105590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rdx] ; mm4 = limit 105790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 ; abs(q3-q2) > limit 105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm4 ; abs(p3-p2) > limit 106090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm4 ; abs(p2-p1) > limit 106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
106290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm4 ; abs(q2-q1) > limit 106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm6 ; or 106490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm1 ; 106690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm5 ; p1 106990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm3 ; mm3=mm7=p0 107190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm5 ; p0 - p1 107290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p1 - p0 107490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm7 ; abs(p1-p0) 107590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm5 ; save abs(p1-p0) 107790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit 108090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm5 ; mm0=mask 108190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, [rdx+32] ; mm5=q0 108390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+40] ; mm7=q1 108490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm5 ; mm6=q0 108690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm7 ; q1 
108790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q0-q1 108890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q1-q0 109090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5 ; abs(q1-q0) 109190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm7 ; save abs(q1-q0) 109390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit 109490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; mask 109690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; q1 109890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm1 ; q1-=p1 109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm2 ; p1-=q1 110090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm1 ; abs(p1-q1) 1101538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero 110290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm5, 1 ; abs(p1-q1)/2 110390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11041b362b15af34006e6a11974088a46d42b903418eJohann mov rdx, arg(2) ;blimit ; 110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11061b362b15af34006e6a11974088a46d42b903418eJohann movq mm4, [rdx] ;blimit 110790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm3 ; mm1=mm3=p0 110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 110990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; mm7=mm6=q0 111090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm7 ; p0-q0 111190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm3 ; q0-p0 
111390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm7 ; abs(q0-p0) 111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm1 ; abs(q0-p0)*2 111590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 111690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11171b362b15af34006e6a11974088a46d42b903418eJohann psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit 111890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm0; ; mask 111990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 112190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm1, mm0 112290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate high edge variance 112490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(4) ;thresh ; get thresh 112590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; 112790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t0 ; get abs (p1 - p0) 112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 ; abs(p1 - p0) > thresh 112990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, t1 ; get abs (q1 - q0) 113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm7 ; abs(q1 - q0)> thresh 113290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 113490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm4, mm0 113590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm0, mm0 113790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor 
mm4, mm0 113890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 114390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 114490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 114690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+16] ; p1 114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+40] ; q1 1148538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values 1149538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 115090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm2, mm7 ; p1 - q1 115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 115290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx+24] ; p0 115390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdx+32] ; q0 1154538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 1155538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 115690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 115790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 115890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm6 ; q0 - p0 115990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) 116090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 2 * (q0 - p0) 116190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) 
116290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm1, mm2 ; mask filter values we don't care about 116390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 116490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 116590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; vp8_filter 116690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, mm4; ; Filter2 = vp8_filter & hev 116790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 116890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; 1169538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm5, [GLOBAL(t3)]; 117090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 117190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 117290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 117390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 117490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm5 ; e0f0g0h0 117590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 117690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm5 ; a0b0c0d0 117790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 117890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 117990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 118090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; Filter2 118190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1182538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) 118390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 118490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 118590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
118690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm2 ; e0f0g0h0 118790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 118890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm2 ; a0b0c0d0 118990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 119090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 119190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 119290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 119390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; qs0 =qs0 - filter1 119490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm5 ; ps0 =ps0 + Filter2 119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 119690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; vp8_filter &= ~hev; 119890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Filter2 = vp8_filter; 119990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pandn mm4, mm1 ; vp8_filter&=~hev 120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 120190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 120290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3=qs0, mm4=filter2, mm6=ps0 120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 120490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 120590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs0 - u); 120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq0 = s^0x80; 120790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps0 + u); 120890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op0 = s^0x80; 
120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 121090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 121190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 121390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 121490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 1215538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm1, [GLOBAL(s27)] 1216538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm2, [GLOBAL(s27)] 1217538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(s63)] 1218538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(s63)] 121990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 122090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 122190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 122290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 122390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 122490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 122590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1226538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 1227538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 122890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+24], mm6 122990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+32], mm3 123090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 123190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 2/7th difference across boundary 123290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 123390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs1 - u); 123490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq1 = 
s^0x80; 123590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps1 + u); 123690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op1 = s^0x80; 123790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 123890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 123990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 124090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 1241538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm1, [GLOBAL(s18)] 1242538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm2, [GLOBAL(s18)] 1243538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(s63)] 1244538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(s63)] 124590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 124690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 124790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 124890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 124990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdx + 40] 125090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx + 16] ; p1 1251538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 1252538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 125390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 125490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 125590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 125690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1257538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 1258538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 125990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx + 40], mm3 126090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx + 16], mm6 
126190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 126290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 1/7th difference across boundary 126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 126490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs2 - u); 126590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq2 = s^0x80; 126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps2 + u); 126790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op2 = s^0x80; 126890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 126990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 127190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 1272538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm1, [GLOBAL(s9)] 1273538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pmulhw mm2, [GLOBAL(s9)] 1274538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(s63)] 1275538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(s63)] 127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 127790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 127890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 127990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx+ 8] 128190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdx+48] 128290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1283538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] 1284538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] 128590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 128790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1289538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 1290538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 129190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian ; transpose and write back 129390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 129590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 129690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 129790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 129890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 129990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 130090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 130190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 130390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 130490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 130790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
130890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 130990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 131090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 131290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 131390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 131590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 131690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 131890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 131990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 132190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 132290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 132490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 132590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 132790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 132890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*4], 
mm0 ; write out 133090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*4], mm6 ; write out 133190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 133390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 133490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 133690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*2], mm0 ; write out 133790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*2], mm5 ; write out 133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 134090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 134290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 134390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 134490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 134590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 134690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 134890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 134990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 135190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 
135290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 135490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi], mm0 ; write out 135590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi], mm1 ; write out 135790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 135890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 136090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 136190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*2], mm3 136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*2], mm4 136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi+rax*8] 136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 13681b362b15af34006e6a11974088a46d42b903418eJohann jnz .next8_mbv 136990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 96 137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 
;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
;
; Simple loop filter for a horizontal edge.  Two passes of eight bytes are
; made.  In each pass the rows at src_ptr - 2*step, src_ptr - step, src_ptr
; and src_ptr + step are read as p1, p0, q0 and q1.  p0 and q0 are adjusted
; in place for every byte lane where
;     abs(p0 - q0) * 2 + abs(p1 - q1) / 2
; does not exceed the matching byte of the eight bytes loaded from blimit.
; t80, tfe, t4 and t1s are constants reached through GLOBAL(); they are
; defined outside this function.
global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src_ptr (the q0 row)
    movsxd      rax, dword ptr arg(1)       ; rax = src_pixel_step, sign-extended

    mov         rcx, 2                      ; pass counter: 2 x 8 bytes
.next_8_bytes:
    ; ---- row addressing ----
    mov         rdi, rsi
    add         rdi, rax                    ; rdi = q1 row (one step below q0)
    neg         rax                         ; rax = -step, used to reach p0/p1

    mov         rdx, arg(2)                 ; rdx = blimit
    movq        mm3, [rdx]                  ; mm3 = eight blimit bytes

    ; ---- edge mask: abs(p1 - q1) / 2 ----
    movq        mm1, [rsi+2*rax]            ; mm1 = p1
    movq        mm0, [rdi]                  ; mm0 = q1
    movq        mm2, mm1                    ; mm2 = p1, kept for the filter
    movq        mm7, mm0                    ; mm7 = q1, kept for the filter
    movq        mm4, mm0                    ; mm4 = scratch copy of q1
    psubusb     mm0, mm1                    ; saturating q1 - p1
    psubusb     mm1, mm4                    ; saturating p1 - q1
    por         mm1, mm0                    ; one side is zero, so OR = abs(p1 - q1)
    pand        mm1, [GLOBAL(tfe)]          ; drop bit 0 of each byte (tfe presumably 0xFE per byte)
    psrlw       mm1, 1                      ; word shift now acts as a per-byte halve

    ; ---- edge mask: abs(p0 - q0) * 2, then compare with blimit ----
    movq        mm5, [rsi+rax]              ; mm5 = p0
    movq        mm4, [rsi]                  ; mm4 = q0
    movq        mm0, mm4                    ; mm0 = q0, kept for the filter
    movq        mm6, mm5                    ; mm6 = p0, kept for the filter
    psubusb     mm5, mm4                    ; saturating p0 - q0
    psubusb     mm4, mm6                    ; saturating q0 - p0
    por         mm5, mm4                    ; abs(p0 - q0)
    paddusb     mm5, mm5                    ; abs(p0 - q0) * 2
    paddusb     mm5, mm1                    ;   + abs(p1 - q1) / 2

    psubusb     mm5, mm3                    ; zero in lanes that are within blimit
    pxor        mm3, mm3
    pcmpeqb     mm5, mm3                    ; mm5 = 0xFF in lanes to filter, else 0

    ; ---- filter value: (p1 - q1) + 3 * (q0 - p0), saturating ----
    pxor        mm2, [GLOBAL(t80)]          ; p1 -> signed domain
    pxor        mm7, [GLOBAL(t80)]          ; q1 -> signed domain
    pxor        mm6, [GLOBAL(t80)]          ; p0 -> signed domain
    pxor        mm0, [GLOBAL(t80)]          ; q0 -> signed domain
    psubsb      mm2, mm7                    ; p1 - q1
    movq        mm3, mm0                    ; mm3 = signed q0, updated below
    psubsb      mm0, mm6                    ; q0 - p0
    paddsb      mm2, mm0                    ; p1 - q1 + 1 * (q0 - p0)
    paddsb      mm2, mm0                    ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm2, mm0                    ; p1 - q1 + 3 * (q0 - p0)
    pand        mm5, mm2                    ; zero the lanes rejected by the mask

    ; ---- q0 side: q0 -= (filter + 4) >> 3 ----
    paddsb      mm5, [GLOBAL(t4)]           ; filter + 4

    ; MMX has no per-byte arithmetic shift, so the odd and even bytes of
    ; each word are shifted separately and merged.
    movq        mm1, mm5
    psraw       mm1, 11                     ; high byte >> 3, sign kept, lands in low byte
    psllw       mm1, 8                      ; move it back to the high byte
    movq        mm0, mm5
    psllw       mm0, 8                      ; lift the low byte into the high byte
    psraw       mm0, 3                      ; signed shift right by 3
    psrlw       mm0, 8                      ; return it to the low byte, high byte cleared
    por         mm0, mm1                    ; mm0 = per-byte (filter + 4) >> 3

    psubsb      mm3, mm0                    ; q0 - adjustment
    pxor        mm3, [GLOBAL(t80)]          ; back to unsigned
    movq        [rsi], mm3                  ; store filtered q0

    ; ---- p0 side: p0 += (filter + 3) >> 3 ----
    psubsb      mm5, [GLOBAL(t1s)]          ; take one off the "+4" value -> "+3"

    movq        mm0, mm5                    ; copy before mm5 is shifted in place
    psraw       mm5, 11                     ; high byte >> 3, sign kept
    psllw       mm5, 8                      ; back to the high byte
    psllw       mm0, 8                      ; low byte handled as above
    psraw       mm0, 3
    psrlw       mm0, 8
    por         mm0, mm5                    ; mm0 = per-byte (filter + 3) >> 3

    paddsb      mm6, mm0                    ; p0 + adjustment
    pxor        mm6, [GLOBAL(t80)]          ; back to unsigned
    movq        [rsi+rax], mm6              ; store filtered p0 (rax is still -step)

    ; ---- advance to the next eight bytes ----
    neg         rax                         ; restore +step for the next pass
    add         rsi, 8
    dec         rcx
    jnz         .next_8_bytes

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void
vp8_loop_filter_simple_vertical_edge_mmx 149890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 149990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 150090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 15011b362b15af34006e6a11974088a46d42b903418eJohann; const char *blimit 150290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 15031b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE 150490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_vertical_edge_mmx): 150590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 150690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 15071b362b15af34006e6a11974088a46d42b903418eJohann SHADOW_ARGS_TO_STACK 3 150890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 150990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 151090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 151190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 151290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 151390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 151490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 32 ; reserve 32 bytes 151590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 151690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 151790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 151890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 151990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
152090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 152190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*4- 2]; ; 15221b362b15af34006e6a11974088a46d42b903418eJohann mov rcx, 2 ; count 15231b362b15af34006e6a11974088a46d42b903418eJohann.nexts8_v: 152490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 152590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax]; 152690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 152790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 152890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 152990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 153090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 153190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 153290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 153390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 153490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; 53 43 52 42 51 41 50 40 153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 
154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm0 ; 13 03 12 02 11 01 10 00 155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 13 03 12 02 11 01 10 00 155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm2 ; 33 23 13 03 32 22 12 02 155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate mask 
156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm0 ; p1 156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm3 ; q1 156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q1-=p1 157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm3 ; p1-=q1 157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm6, mm7 ; abs(p1-q1) 1572538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero 157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm6, 1 ; abs(p1-q1)/2 157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; p0 157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm2 ; q0 157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm2 ; p0-=q0 157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm1 ; q0-=p0 158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm4 ; abs(p0 - q0) 158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm5 ; abs(p0-q0)*2 158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 15851b362b15af34006e6a11974088a46d42b903418eJohann mov rdx, arg(2) ;blimit ; get blimit 158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 15881b362b15af34006e6a11974088a46d42b903418eJohann psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit 158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm5, mm7 ; mm5 = mask 159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm0 159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm3 159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1596538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values 1597538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values 159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 159990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm3 ; p1 - q1 160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm1 ; p0 160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm2 ; q0 1603538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1605538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values 160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm7 ; offseted ; q0 160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm7, mm6 ; q0 - p0 160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) 161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) 161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) 161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm5, mm0 ; mask filter values we don't care about 161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
1616538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm5 ; get a copy of filters 161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm0, 8 ; shift left 8 162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 3 ; arithmetic shift right 11 162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm0, 8 162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm5 ; get a copy of filters 162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; arithmetic shift right 11 162590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm7, 8 ; shift left 8 to put it back 162690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; put the two together to get result 162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; q0-= q0sz add 1630538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm3, [GLOBAL(t80)] ; unoffset 163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; now do +3 side 1633538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 163490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm5 ; get a copy of filters 163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm0, 8 ; shift left 8 163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 3 ; arithmetic shift right 11 163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm0, 8 163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm5, 11 ; arithmetic shift right 11 164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm5, 8 ; shift left 8 to put it back 164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm5 ; put the two together to get result 164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm0 ; p0+= p0 add 1645538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber pxor mm6, [GLOBAL(t80)] ; unoffset 164690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, t0 164990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t1 165090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm0 = 70 60 50 40 30 20 10 00 165290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = 71 61 51 41 31 21 11 01 165390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3 = 72 62 52 42 32 22 12 02 165490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm4 = 73 63 53 43 33 23 13 03 165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; transpose back to write out 165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 166190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; 166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 166490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber movq mm5, mm1 ; 71 70 61 60 51 50 41 40 166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm0 ; 31 30 21 20 11 10 01 00 166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*4], mm0 ; write 03 02 01 00 167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 167790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax*4], mm0 ; write 13 12 11 10 167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*2], mm6 ; write 23 22 21 20 168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 ; 33 32 31 30 168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi], mm1 ; write 43 42 41 40 168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi + rax], mm6 ; write 33 32 31 30 168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi + rax*2], mm5 ; write 63 62 61 60 
168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm1, 32 ; 53 52 51 50 168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi], mm1 ; write out 53 52 51 50 169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm5, 32 ; 73 72 71 70 169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi + rax*2], mm5 ; write 73 72 71 70 169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi+rax*8] ; next 8 169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 16981b362b15af34006e6a11974088a46d42b903418eJohann jnz .nexts8_v 169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 32 170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, 171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int y_stride, 171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; loop_filter_info *lfi) 171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
;{
;
;
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;}

; ---------------------------------------------------------------------------
; Read-only constants shared by the MMX loop-filter routines in this file.
;
; Every entry is exactly 8 bytes (one MMX quadword) and starts on a 16-byte
; boundary. The routines reach them through [GLOBAL(label)], so the label
; names are part of this file's internal interface and must not change.
; ---------------------------------------------------------------------------
SECTION_RODATA

; 0xfe in every byte lane.
; NOTE(review): presumably masks off the low bit of each byte before a
; word-wide shift so bits cannot leak between lanes -- confirm at use sites.
align 16
tfe:
    times 8 db 0xfe

; 0x80 in every byte lane. XORing pixel bytes with this flips the top bit,
; converting between unsigned pixels and the signed values that the
; saturating psubsb/paddsb filter arithmetic works on (and back again).
align 16
t80:
    times 8 db 0x80

; 0x01 in every byte lane. Subtracted from the "+4" filter value to obtain
; the "+3" variant.
align 16
t1s:
    times 8 db 0x01

; 0x03 in every byte lane.
; NOTE(review): presumably the "+3" rounding term for routines that add it
; directly -- confirm at use sites.
align 16
t3:
    times 8 db 0x03

; 0x04 in every byte lane. The "+4" rounding term added to
; 3 * (q0 - p0) + (p1 - q1) before the arithmetic shift right by 3.
align 16
t4:
    times 8 db 0x04

; 0x0001 in every 16-bit lane.
align 16
ones:
    times 4 dw 0x0001

; 16-bit lanes holding 27, 18 and 9 in the HIGH byte (27 << 8, 18 << 8,
; 9 << 8).
; NOTE(review): presumably multiplier taps for a pmulhw-based filter elsewhere
; in the file -- confirm at use sites.
align 16
s27:
    times 4 dw 0x1b00
align 16
s18:
    times 4 dw 0x1200
align 16
s9:
    times 4 dw 0x0900

; 63 in every 16-bit lane.
; NOTE(review): presumably the rounding term paired with the taps above --
; confirm at use sites.
align 16
s63:
    times 4 dw 0x003f