; loopfilter_mmx.asm revision f71323e297a928af368937089d3ed71239786f86
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
;
; Filters across a horizontal edge, 8 pixels (one MMX register) per loop
; iteration, `count` iterations. Each iteration reads the four rows above
; src_ptr (p3..p0) and the four rows starting at src_ptr (q0..q3), and
; writes back p1, p0, q0 and q1.
;
; Arguments are reached through the arg(n) / SHADOW_ARGS_TO_STACK macros
; of the included ABI support file.
;
; Register roles inside the loop:
;   rsi  = pointer to the q0 row of the current 8-pixel group
;   rdi  = rsi + pitch (the q1 row)
;   rax  = pitch; negated part-way through the body to reach the p rows
;          and negated back before the next iteration
;   rcx  = remaining iteration count
;   rdx  = scratch pointer (limit / flimit / thresh; 8 bytes read from each)
;   mm1  = filter mask (0xff in lanes that are filtered)
;   mm4  = high-edge-variance (hev) mask
; Stack: 32 bytes of 16-byte-aligned scratch (t0, t1).
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_mmx)
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];  abs(q1-q0)
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];  abs(p1-p0)

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step (row pitch)

        movsxd      rcx, dword ptr arg(5)   ; count
next8_h:
        mov         rdx, arg(3)             ; limit
        movq        mm7, [rdx]              ; mm7 = limit
        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions:
        ; mm1 accumulates (abs diff -sat- limit); any non-zero byte means
        ; "limit exceeded" for that lane.
        movq        mm2, [rdi+2*rax]        ; q3
        movq        mm1, [rsi+2*rax]        ; q2
        movq        mm6, mm1                ; q2
        psubusb     mm1, mm2                ; q2-=q3
        psubusb     mm2, mm6                ; q3-=q2
        por         mm1, mm2                ; abs(q3-q2)
        psubusb     mm1, mm7                ; non-zero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]          ; q1
        movq        mm3, mm4                ; q1
        psubusb     mm4, mm6                ; q1-=q2
        psubusb     mm6, mm3                ; q2-=q1
        por         mm4, mm6                ; abs(q2-q1)

        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm4, [rsi]              ; q0
        movq        mm0, mm4                ; q0 (kept in mm0 for the filter stage)
        psubusb     mm4, mm3                ; q0-=q1
        psubusb     mm3, mm0                ; q1-=q0
        por         mm4, mm3                ; abs(q0-q1)
        movq        t0, mm4                 ; save to t0 for the hev test
        psubusb     mm4, mm7
        por         mm1, mm4


        neg         rax                     ; negate pitch to deal with above border

        movq        mm2, [rsi+4*rax]        ; p3
        movq        mm4, [rdi+4*rax]        ; p2
        movq        mm5, mm4                ; p2
        psubusb     mm4, mm2                ; p2-=p3
        psubusb     mm2, mm5                ; p3-=p2
        por         mm4, mm2                ; abs(p3 - p2)
        psubusb     mm4, mm7
        por         mm1, mm4


        movq        mm4, [rsi+2*rax]        ; p1
        movq        mm3, mm4                ; p1
        psubusb     mm4, mm5                ; p1-=p2
        psubusb     mm5, mm3                ; p2-=p1
        por         mm4, mm5                ; abs(p2 - p1)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm2, mm3                ; p1

        movq        mm4, [rsi+rax]          ; p0
        movq        mm5, mm4                ; p0
        psubusb     mm4, mm3                ; p0-=p1
        psubusb     mm3, mm5                ; p1-=p0
        por         mm4, mm3                ; abs(p1 - p0)
        movq        t1, mm4                 ; save to t1 for the hev test
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm3, [rdi]              ; q1
        movq        mm4, mm3                ; q1
        psubusb     mm3, mm2                ; q1-=p1
        psubusb     mm2, mm4                ; p1-=q1
        por         mm2, mm3                ; abs(p1-q1)
        pand        mm2, [tfe GLOBAL]       ; set lsb of each byte to zero so the
                                            ; word shift below cannot leak a bit
                                            ; into the neighbouring byte
        psrlw       mm2, 1                  ; abs(p1-q1)/2

        movq        mm6, mm5                ; p0 (kept in mm6 for the filter stage)
        movq        mm3, [rsi]              ; q0
        psubusb     mm5, mm3                ; p0-=q0
        psubusb     mm3, mm6                ; q0-=p0
        por         mm5, mm3                ; abs(p0 - q0)
        paddusb     mm5, mm5                ; abs(p0-q0)*2
        paddusb     mm5, mm2                ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)             ; flimit
        movq        mm2, [rdx]              ; flimit mm2
        paddb       mm2, mm2                ; flimit*2 (less than 255)
        paddb       mm7, mm2                ; flimit * 2 + limit (less than 255)

        psubusb     mm5, mm7                ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
        por         mm1, mm5
        pxor        mm5, mm5
        pcmpeqb     mm1, mm5                ; mask mm1: 0xff where no limit was exceeded

        ; calculate high edge variance
        mov         rdx, arg(4)             ; thresh
        movq        mm7, [rdx]              ; mm7 = thresh
        movq        mm4, t0                 ; get abs (q1 - q0)
        psubusb     mm4, mm7
        movq        mm3, t1                 ; get abs (p1 - p0)
        psubusb     mm3, mm7
        ; Combine with OR, as the vertical-edge routine in this file does.
        ; A wrapping byte add would yield zero whenever the two saturated
        ; differences sum to exactly 256, hiding a lane where both
        ; thresholds were exceeded.
        por         mm4, mm3                ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     mm4, mm5                ; 0xff where neither exceeded (mm5 == 0)

        pcmpeqb     mm5, mm5                ; all ones
        pxor        mm4, mm5                ; invert: mm4 = hev mask


        ; start work on filters
        movq        mm2, [rsi+2*rax]        ; p1
        movq        mm7, [rdi]              ; q1
        pxor        mm2, [t80 GLOBAL]       ; p1 offset to convert to signed values
        pxor        mm7, [t80 GLOBAL]       ; q1 offset to convert to signed values
        psubsb      mm2, mm7                ; p1 - q1
        pand        mm2, mm4                ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [t80 GLOBAL]       ; p0 offset to convert to signed values
        pxor        mm0, [t80 GLOBAL]       ; q0 offset to convert to signed values
        movq        mm3, mm0                ; q0
        psubsb      mm0, mm6                ; q0 - p0
        paddsb      mm2, mm0                ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [t4 GLOBAL]        ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [t3 GLOBAL]        ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; Per-byte arithmetic >>3: unpack each byte into the HIGH half of a
        ; word (low half zero), then shift the word right by 8+3 = 11.
        pxor        mm0, mm0
        pxor        mm5, mm5
        punpcklbw   mm0, mm2
        punpckhbw   mm5, mm2
        psraw       mm0, 11
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0                ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3

        pxor        mm0, mm0                ; 0
        movq        mm5, mm1                ; abcdefgh
        punpcklbw   mm0, mm1                ; e0f0g0h0
        psraw       mm0, 11                 ; sign extended shift right by 3
        pxor        mm1, mm1                ; 0
        punpckhbw   mm1, mm5                ; a0b0c0d0
        psraw       mm1, 11                 ; sign extended shift right by 3
        movq        mm5, mm0                ; save low-half words for the 2nd tap

        packsswb    mm0, mm1                ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [ones GLOBAL]
        paddsw      mm1, [ones GLOBAL]
        psraw       mm5, 1                  ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                  ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1                ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + ones) >>1
        pandn       mm4, mm5                ; keep 2nd-tap value only where hev is NOT set

        paddsb      mm6, mm2                ; p0+= p0 add
        pxor        mm6, [t80 GLOBAL]       ; unoffset
        movq        [rsi+rax], mm6          ; write back

        movq        mm6, [rsi+2*rax]        ; p1
        pxor        mm6, [t80 GLOBAL]       ; reoffset
        paddsb      mm6, mm4                ; p1+= p1 add
        pxor        mm6, [t80 GLOBAL]       ; unoffset
        movq        [rsi+2*rax], mm6        ; write back

        psubsb      mm3, mm0                ; q0-= q0 add
        pxor        mm3, [t80 GLOBAL]       ; unoffset
        movq        [rsi], mm3              ; write back

        psubsb      mm7, mm4                ; q1-= q1 add
        pxor        mm7, [t80 GLOBAL]       ; unoffset
        movq        [rdi], mm7              ; write back

        add         rsi, 8                  ; next group of 8 pixels along the edge
        neg         rax                     ; restore positive pitch for the next pass
        dec         rcx
        jnz         next8_h

    add         rsp, 32                     ; release t0/t1
    pop         rsp                         ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_vertical_edge_mmx)
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 64 ; reserve 64 bytes 25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; 25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*4 - 4] 25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_v: 26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, rsi ; rdi points to row +1 for indirect addressing 26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, rax 26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ;transpose 26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; 77 76 75 74 73 72 71 70 26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; 47 46 45 44 43 42 41 40 27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 57 47 56 46 55 45 54 44 27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm4 ; 53 43 52 42 51 41 50 40 28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm6 ; 27 26 25 24 23 22 21 20 29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm7 ; 17 07 16 06 15 05 14 04 29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm7 ; 37 27 17 07 36 26 16 06 
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm6 ; 76 66 56 46 36 26 16 06 30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q2-q3 30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q3-q2 31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5; ; mm7=abs (q3-q2) 31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; 35 25 15 05 34 24 14 04 31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm6 ; q1-q2 32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm5 ; q2-q1 32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm6, mm3 ; mm6=abs(q2-q1) 32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+24], mm5 ; save q1 32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+16], mm0 ; save q0 32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm3 ; 13 03 12 02 11 01 10 00 33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 31 21 11 01 30 20 10 00 33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm0 ; p2-p3 34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm1 ; p3-p2 34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm2 ; mm0=abs(p3-p2) 34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; 33 23 13 03 32 22 12 02 34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+8], mm3 ; save p0 
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx], mm2 ; save p1 35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; mm5 = p1 35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm1 ; p1-p2 35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm5 ; p2-p1 35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm2 ; mm1=abs(p2-p1) 35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ;limit 36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rdx] ; mm4 = limit 36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm4 36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm4 36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm4 36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm6 36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm1 37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm5 ; p1 37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm3 ; mm3=mm7=p0 37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm5 ; p0 - p1 37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p1 - p0 37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm7 ; abs(p1-p0) 38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm5 ; save abs(p1-p0) 38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm4 38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm5 ; mm0=mask 38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, [rdx+16] ; mm5=q0 38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+24] ; mm7=q1 38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm5 ; mm6=q0 39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm7 ; q1 39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q0-q1 39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q1-q0 39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5 ; abs(q1-q0) 39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm7 ; save abs(q1-q0) 39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; mask 40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; q1 40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm1 ; q1-=p1 40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm2 ; p1-=q1 
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm1 ; abs(p1-q1) 40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm5, [tfe GLOBAL] ; set lsb of each byte to zero 40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm5, 1 ; abs(p1-q1)/2 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(2) ;flimit ; 41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx] ;flimit mm2 41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm3 ; mm1=mm3=p0 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; mm7=mm6=q0 41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm7 ; p0-q0 41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm3 ; q0-p0 41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm7 ; abs(q0-p0) 41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm1 ; abs(q0-p0)*2 42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm2, mm2 ; flimit*2 (less than 255) 42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm4, mm2 ; flimit * 2 + limit (less than 255) 42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm0; ; mask 42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber pcmpeqb mm1, mm0 43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate high edge variance 43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(4) ;thresh ; get thresh 43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; 43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t0 ; get abs (q1 - q0) 43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, t1 ; get abs (p1 - p0) 43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm7 44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm4, mm0 44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm0, mm0 44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm4, mm0 44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx] ; p1 45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+24] ; q1 45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx+8] ; p0 45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdx+16] ; q0 
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values 45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values 46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm2, mm7 ; p1 - q1 46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, mm4 ; high var mask (hvm)(p1 - q1) 46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; offset to convert to signed values 46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, [t80 GLOBAL] ; offset to convert to signed values 46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm6 ; q0 - p0 46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) 47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) 47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) 47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm1, mm2 ; mask filter values we don't care about 47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm1, [t4 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, [t3 GLOBAL] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm5, mm5 48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm2 ; 48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm5, mm2 ; 48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; 48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm5, 11 48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm5 49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; 49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; abcdefgh 49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm1 ; e0f0g0h0 49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 ; 0 50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm5 ; a0b0c0d0 50190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 11 ; sign extended shift right by 3 50390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; save results 50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 
50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsw mm5, [ones GLOBAL] 50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsw mm1, [ones GLOBAL] 50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm5, 1 ; partial shifted one more time for 2nd tap 51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 1 ; partial shifted one more time for 2nd tap 51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pandn mm4, mm5 ; high edge variance additive 51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm2 ; p0+= p0 add 51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; unoffset 51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6=p0 ; 52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rdx] ; p1 52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, [t80 GLOBAL] ; reoffset 52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 52390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm1, mm4 ; p1+= p1 add 52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, [t80 GLOBAL] ; unoffset 52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = p0 mm1 = p1 52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 52790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; q0-= q0 add 52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] ; unoffset 52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 53090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3 = q0 
53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm7, mm4 ; q1-= q1 add 53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, [t80 GLOBAL] ; unoffset 53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm7 = q1 53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; tranpose and write back 53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = 72 62 52 42 32 22 12 02 53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = 73 63 53 43 33 23 13 03 53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3 = 74 64 54 44 34 24 14 04 53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm7 = 75 65 55 45 35 25 15 05 54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 72 62 52 42 32 22 12 02 54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; 74 64 54 44 34 24 14 04 54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; 33 32 23 22 13 12 03 02 55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, 
mm1 ; 73 72 63 62 53 52 43 42 55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm2 = 15 14 13 12 05 04 03 02 56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = 35 34 33 32 25 24 23 22 56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm5 = 55 54 53 52 45 44 43 42 56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = 75 74 73 72 65 64 63 62 56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*4+2], mm2 56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm2, 32 56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax*4+2], mm2 57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*2+2], mm6 57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax+2],mm6 57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+2], mm1 57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm1, 32 57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+2], mm1 58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax+2],mm5 58390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm5, 32 58490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax*2+2], mm5 58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi+rax*8] 58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz next8_v 59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 64 59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 59590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 60190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_horizontal_edge_mmx 60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 60690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *flimit, 60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *limit, 60890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *thresh, 60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int count 61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber;) 61190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_mbloop_filter_horizontal_edge_mmx) 61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_horizontal_edge_mmx): 61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 32 ; reserve 32 bytes 62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 62490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 62590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_mbh: 63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ;limit 63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, rsi ; rdi points to row +1 for indirect addressing 63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, rax 63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate breakout conditions 63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdi+2*rax] ; q3 63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rsi+2*rax] ; q2 64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm1 ; q2 64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm2 ; q2-=q3 64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm6 ; q3-=q2 64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm2 ; abs(q3-q2) 64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm7 64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit 64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi+rax] ; q1 64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm4 ; q1 65090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm6 ; q1-=q2 65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm3 ; q2-=q1 65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm6 ; abs(q2-q1) 65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber psubusb mm4, mm7 65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm3=q1, mm7 = limit 65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi] ; q0 66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm4 ; q0 66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm3 ; q0-=q1 66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm0 ; q1-=q0 66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(q0-q1) 66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm4 ; save to t0 66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 67190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax ; negate pitch to deal with above border 67290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rsi+4*rax] ; p3 67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rdi+4*rax] ; p2 67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; p2 67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm2 ; p2-=p3 67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm5 ; p3-=p2 67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm2 ; abs(p3 - p2) 67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 
68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi+2*rax] ; p1 68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm4 ; p1 68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm5 ; p1-=p2 68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p2-=p1 68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm5 ; abs(p2 - p1) 68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; p1 69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi+rax] ; p0 69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; p0 69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm3 ; p0-=p1 69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm5 ; p1-=p0 70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(p1 - p0) 70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm4 ; save to t1 70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm4 70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) 
70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm5 = p0 70690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi] ; q1 70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; q1 70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm2 ; q1-=p1 70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm4 ; p1-=q1 71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm2, mm3 ; abs(p1-q1) 71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, [tfe GLOBAL] ; set lsb of each byte to zero 71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm2, 1 ; abs(p1-q1)/2 71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm5 ; p0 71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p0-=q0 71790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm6 ; q0-=p0 71890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm3 ; abs(p0 - q0) 71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm5 ; abs(p0-q0)*2 72090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(2) ;flimit ; get flimit 72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx] ; flimit mm2 72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm2, mm2 ; flimit*2 (less than 255) 72590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm7, mm2 ; flimit * 2 + limit (less than 255) 72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 
72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm5 72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm5, mm5 73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm1, mm5 ; mask mm1 73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0) 73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = p0, 73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate high edge variance 73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(4) ;thresh ; get thresh 73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] ; 73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t0 ; get abs (q1 - q0) 73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, t1 ; get abs (p1 - p0) 74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm7 74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm4, mm5 74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm5, mm5 74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm4, mm5 74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 75090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) 75290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = p0, mm4=hev 75390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber ; start work on filters 75490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rsi+2*rax] ; p1 75590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdi] ; q1 75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values 75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values 75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm2, mm7 ; p1 - q1 75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; offset to convert to signed values 76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, [t80 GLOBAL] ; offset to convert to signed values 76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm6 ; q0 - p0 76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) 76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 2 * (q0 - p0) 76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) 76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm1, mm2 ; mask filter values we don't care about 76890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; vp8_filter 77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, mm4; ; Filter2 = vp8_filter & hev 77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; 77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm5, [t3 GLOBAL]; 
77690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm5 ; e0f0g0h0 78190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm5 ; a0b0c0d0 78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; Filter2 78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, [t4 GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) 78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm2 ; e0f0g0h0 79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm2 ; a0b0c0d0 79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; qs0 =qs0 - 
filter1 80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; vp8_filter &= ~hev; 80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Filter2 = vp8_filter; 80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pandn mm4, mm1 ; vp8_filter&=~hev 80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3=qs0, mm4=filter2, mm6=ps0 80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs0 - u); 81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq0 = s^0x80; 81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps0 + u); 81490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op0 = s^0x80; 81590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 81690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm1, [s27 GLOBAL] 82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm2, [s27 GLOBAL] 82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, [s63 GLOBAL] 82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, [s63 GLOBAL] 
82590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 82690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax], mm6 83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi], mm3 83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 2/7th difference across boundary 83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs1 - u); 84090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq1 = s^0x80; 84190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps1 + u); 84290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op1 = s^0x80; 84390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm1, [s18 GLOBAL] 84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm2, [s18 GLOBAL] 84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, [s63 GLOBAL] 85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, 
[s63 GLOBAL] 85190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 85290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 85490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 85590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi] 85690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+rax*2] ; p1 85790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 85890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 85990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 86090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 86190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 86290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 86490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 86590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi], mm3 86790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*2], mm6 86890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 1/7th difference across boundary 87090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 87190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs2 - u); 87290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq2 = s^0x80; 87390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps2 + u); 87490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op2 = s^0x80; 87590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 87690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 
87790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 87890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 87990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm1, [s9 GLOBAL] 88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm2, [s9 GLOBAL] 88190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, [s63 GLOBAL] 88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, [s63 GLOBAL] 88390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 88490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 88590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 88690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 88790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 88890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdi+rax*4] 88990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 89090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi+rax ] 89190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 89290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 89390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 89490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 89590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 89690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 89790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 89890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 89990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 90090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax ], mm3 90190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 90290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*4], mm6 90390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 90490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber;EARLY_BREAK_OUT: 90590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 90690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi,8 90790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 90890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz next8_mbh 90990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 91090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 32 91190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 91290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 91390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 91490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 91590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 91690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 91890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 91990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 92190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_vertical_edge_mmx 92290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 92390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 92490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixel_step, 92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *flimit, 92690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *limit, 92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; const char *thresh, 92890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int count 92990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_mbloop_filter_vertical_edge_mmx) 93190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_vertical_edge_mmx): 93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
push rbp 93390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 93590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 93790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 93990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ALIGN_STACK 16, rax 94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 96 ; reserve 96 bytes 94290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 94390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 94590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
94890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*4 - 4] 95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 95190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 95290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_mbv: 95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 95490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 95590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ;transpose 95690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 95890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 95990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; 77 76 75 74 73 72 71 70 96090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 96190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 96290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 96390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 96490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 96690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; 47 46 45 44 43 42 41 40 96790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 96990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 97090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 57 47 56 46 55 45 
54 44 97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 97390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm4 ; 53 43 52 42 51 41 50 40 97690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 97790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 97990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 98190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 98390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 98490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm6 ; 27 26 25 24 23 22 21 20 98690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 98890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 98990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 99290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm7 ; 17 07 16 06 15 05 14 04 99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 
99590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm7 ; 37 27 17 07 36 26 16 06 99890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 100090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 100190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 100390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm6 ; 76 66 56 46 36 26 16 06 100490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+56], mm7 100690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q2-q3 100790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 100990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+48], mm6 101090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q3-q2 101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5; ; mm7=abs (q3-q2) 101390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; 35 25 15 05 34 24 14 04 101490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 101690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 101790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 101890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 101990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb 
mm3, mm6 ; q1-q2 102090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm5 ; q2-q1 102290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm6, mm3 ; mm6=abs(q2-q1) 102390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+40], mm5 ; save q1 102590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+32], mm0 ; save q0 102690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 102790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 102890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 102990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm3 ; 13 03 12 02 11 01 10 00 103190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 103490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 31 21 11 01 30 20 10 00 103590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 103790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 103990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx], mm0 ; save p3 104090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+8], mm1 ; save p2 104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 104390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb 
mm2, mm0 ; p2-p3 104490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm1 ; p3-p2 104690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm2 ; mm0=abs(p3-p2) 104790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; 33 23 13 03 32 22 12 02 104990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 105090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 105290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+24], mm3 ; save p0 105390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+16], mm2 ; save p1 105590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; mm5 = p1 105690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm2, mm1 ; p1-p2 105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm5 ; p2-p1 105990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm2 ; mm1=abs(p2-p1) 106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ;limit 106290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, [rdx] ; mm4 = limit 106490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 ; abs(q3-q2) > limit 106590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm0, mm4 ; abs(p3-p2) > limit 106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm4 ; abs(p2-p1) > limit 106890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
106990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm4 ; abs(q2-q1) > limit 107090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm6 ; or 107190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm1 ; 107390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 107490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm5 ; p1 107690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm3 ; mm3=mm7=p0 107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm5 ; p0 - p1 107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm3 ; p1 - p0 108190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm7 ; abs(p1-p0) 108290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm5 ; save abs(p1-p0) 108490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 108590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit 108790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm5 ; mm0=mask 108890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, [rdx+32] ; mm5=q0 109090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+40] ; mm7=q1 109190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm5 ; mm6=q0 109390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm7 ; q1 
109490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; q0-q1 109590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q1-q0 109790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm7, mm5 ; abs(q1-q0) 109890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm7 ; save abs(q1-q0) 110090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit 110190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 110290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; mask 110390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 110490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; q1 110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm1 ; q1-=p1 110690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm2 ; p1-=q1 110790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm1 ; abs(p1-q1) 110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm5, [tfe GLOBAL] ; set lsb of each byte to zero 110990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm5, 1 ; abs(p1-q1)/2 111090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(2) ;flimit ; 111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx] ;flimit mm2 111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm3 ; mm1=mm3=p0 111590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm6 ; mm7=mm6=q0 111790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm7 ; p0-q0 111890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb 
mm7, mm3 ; q0-p0 112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm7 ; abs(q0-p0) 112190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm1 ; abs(q0-p0)*2 112290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm2, mm2 ; flimit*2 (less than 255) 112590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm4, mm2 ; flimit * 2 + limit (less than 255) 112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm1, mm0; ; mask 112990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm1, mm0 113290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate high edge variance 113490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(4) ;thresh ; get thresh 113590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; 113790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t0 ; get abs (q1 - q0) 113890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm7 ; abs(q1 - q0) > thresh 113990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, t1 ; get abs (p1 - p0) 114190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm3, mm7 ; abs(p1 - p0)> thresh 114290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm4, mm3 ; abs(q1 - q0) > 
thresh || abs(p1 - p0) > thresh 114490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm4, mm0 114590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm0, mm0 114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm4, mm0 114890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 114990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 115090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 115290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 115390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdx, srct 115490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 115590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 115690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+16] ; p1 115790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx+40] ; q1 115890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, [t80 GLOBAL] ; p1 offset to convert to signed values 115990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, [t80 GLOBAL] ; q1 offset to convert to signed values 116090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm2, mm7 ; p1 - q1 116190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 116290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx+24] ; p0 116390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdx+32] ; q0 116490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; offset to convert to signed values 116590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, [t80 GLOBAL] ; offset to convert to signed values 116690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 116790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm0 ; q0 116890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm6 ; q0 
- p0 116990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) 117090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 2 * (q0 - p0) 117190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) 117290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm1, mm2 ; mask filter values we don't care about 117390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 117490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 117590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; vp8_filter 117690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm2, mm4; ; Filter2 = vp8_filter & hev 117790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 117890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; 117990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm5, [t3 GLOBAL]; 118090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 118190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 118290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 118390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 118490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm5 ; e0f0g0h0 118590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 118690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm5 ; a0b0c0d0 118790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 118890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 118990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 119090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; Filter2 119190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 119290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm2, [t4 
GLOBAL] ; vp8_signed_char_clamp(Filter2 + 4) 119390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 0 119490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 0 119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 119690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm2 ; e0f0g0h0 119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 11 ; sign extended shift right by 3 119890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm7, mm2 ; a0b0c0d0 119990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; sign extended shift right by 3 120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm0, mm7 ; Filter2 >>=3; 120190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 120290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; qs0 =qs0 - filter1 120490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 120590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 120790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; vp8_filter &= ~hev; 120890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Filter2 = vp8_filter; 120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pandn mm4, mm1 ; vp8_filter&=~hev 121090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 121190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3=qs0, mm4=filter2, mm6=ps0 121390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 121490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 121590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs0 - u); 
121690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq0 = s^0x80; 121790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps0 + u); 121890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op0 = s^0x80; 121990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 122090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 122190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 122290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 122390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 122490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 122590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm1, [s27 GLOBAL] 122690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm2, [s27 GLOBAL] 122790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, [s63 GLOBAL] 122890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, [s63 GLOBAL] 122990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 123090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 123190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 123290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 123390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 123490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 123590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 123690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 123790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 123890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+24], mm6 123990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx+32], mm3 124090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 124190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 2/7th difference across boundary 
124290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 124390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs1 - u); 124490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq1 = s^0x80; 124590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps1 + u); 124690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op1 = s^0x80; 124790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 124890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 124990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 125090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 125190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm1, [s18 GLOBAL] 125290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm2, [s18 GLOBAL] 125390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, [s63 GLOBAL] 125490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, [s63 GLOBAL] 125590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 125690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 125790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 125890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 125990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdx + 40] 126090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx + 16] ; p1 126190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 126290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 126490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 126590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 126790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber pxor mm6, [t80 GLOBAL] 126890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 126990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx + 40], mm3 127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdx + 16], mm6 127190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 127290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; roughly 1/7th difference across boundary 127390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 127490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(qs2 - u); 127590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *oq2 = s^0x80; 127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; s = vp8_signed_char_clamp(ps2 + u); 127790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; *op2 = s^0x80; 127890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 127990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm4 128190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm4 128290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm1, [s9 GLOBAL] 128390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmulhw mm2, [s9 GLOBAL] 128490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, [s63 GLOBAL] 128590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, [s63 GLOBAL] 128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, 7 128790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, 7 128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packsswb mm1, mm2 128990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 129090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx+ 8] 129190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdx+48] 129290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
129390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] 129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] 129590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 129690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm1 129790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm1 129890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 129990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; mm6 = 71 61 51 41 31 21 11 01 130090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] ; mm3 = 76 66 56 46 36 26 15 06 130190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; tranpose and write back 130390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 130490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 130790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 130890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 131090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 131190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 131390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 131490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 
131690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 131790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 131990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 132090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 132290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 132390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 132590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 132890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 132990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 133190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 133490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 133790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm6, mm2 
; mm6 = 17 16 15 14 13 12 11 10 133890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*4], mm0 ; write out 134090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*4], mm6 ; write out 134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 134290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 134390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 134490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 134590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 134690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*2], mm0 ; write out 134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 134890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*2], mm5 ; write out 134990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 135290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 135590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 135690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 135790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 135890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 135990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 136190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi], mm0 ; write out 136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi], mm1 ; write out 136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 136890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 136990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rsi+rax*2], mm3 137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq [rdi+rax*2], mm4 137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi+rax*8] 137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz next8_mbv 137990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 138090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 96 138190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 138290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 138390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 138490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 138590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 138690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 
; ---- last two instructions of the preceding function's epilog (unchanged) ----
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Simple loop filter across a horizontal edge, 8 pixels (one MMX quadword)
; per iteration, `count` iterations. Arguments are read through the arg(n)
; macro. `thresh` (arg 4) is never read by this routine.
;
; Rows touched, relative to src_ptr:
;    p1 = src_ptr - 2*step   (read)
;    p0 = src_ptr -   step   (read, written)
;    q0 = src_ptr            (read, written)
;    q1 = src_ptr +   step   (read)
;
; Register roles inside the loop:
;    rsi = q0 row, rdi = q1 row, rax = -src_pixel_step, rcx = iterations left
;    mm3 = flimit*2 + limit (loop-invariant; must not be written in the loop)
;    mm0-mm2, mm4-mm7 = scratch
;
; Does nothing when count <= 0.
; Constants t80 / tfe / t4 / t1s are defined elsewhere in this file; the
; per-byte values quoted in the comments below (0x80, 0xfe, 4, 1) follow the
; original annotations -- confirm against their definitions.
;-----------------------------------------------------------------------------
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr -> q0 row
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step
    movsxd      rcx, dword ptr arg(5)       ; count

    test        rcx, rcx                    ; nothing to do for count <= 0; without
    jle         nexts8_h_done               ; this the dec/jnz below would wrap

    ; Loop-invariant setup: filter threshold and row pointers.
    mov         rdx, arg(3)                 ; limit
    movq        mm7, [rdx]
    mov         rdx, arg(2)                 ; flimit
    movq        mm3, [rdx]
    paddb       mm3, mm3                    ; flimit * 2         (wrapping add; the original
    paddb       mm3, mm7                    ; flimit * 2 + limit  notes both stay below 255)

    lea         rdi, [rsi + rax]            ; rdi -> q1 row (one row below q0)
    neg         rax                         ; rax = -step, so [rsi+rax] = p0, [rsi+2*rax] = p1

nexts8_h:
    ; ---- filter mask: abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit ----
    movq        mm1, [rsi+2*rax]            ; p1
    movq        mm0, [rdi]                  ; q1
    movq        mm2, mm1                    ; keep p1 for the filter stage
    movq        mm7, mm0                    ; keep q1 for the filter stage
    movq        mm4, mm0                    ; q1 (scratch copy)
    psubusb     mm0, mm1                    ; q1 - p1, saturating at 0
    psubusb     mm1, mm4                    ; p1 - q1, saturating at 0
    por         mm1, mm0                    ; abs(p1 - q1)
    pand        mm1, [tfe GLOBAL]           ; clear each byte's lsb so the word shift
    psrlw       mm1, 1                      ; below yields per-byte abs(p1 - q1) / 2

    movq        mm5, [rsi+rax]              ; p0
    movq        mm4, [rsi]                  ; q0
    movq        mm0, mm4                    ; keep q0
    movq        mm6, mm5                    ; keep p0
    psubusb     mm5, mm4                    ; p0 - q0, saturating at 0
    psubusb     mm4, mm6                    ; q0 - p0, saturating at 0
    por         mm5, mm4                    ; abs(p0 - q0)
    paddusb     mm5, mm5                    ; abs(p0 - q0) * 2
    paddusb     mm5, mm1                    ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    psubusb     mm5, mm3                    ; becomes 0 where the sum <= threshold
    pxor        mm4, mm4                    ; mm4 is dead here; use it as the zero
    pcmpeqb     mm5, mm4                    ; mm5 = 0xff in bytes that get filtered

    ; ---- filter value: (p1 - q1) + 3 * (q0 - p0), signed saturating ----
    pxor        mm2, [t80 GLOBAL]           ; p1 -> signed (flip sign bit)
    pxor        mm7, [t80 GLOBAL]           ; q1 -> signed
    psubsb      mm2, mm7                    ; p1 - q1

    pxor        mm6, [t80 GLOBAL]           ; p0 -> signed
    pxor        mm0, [t80 GLOBAL]           ; q0 -> signed
    movq        mm4, mm0                    ; signed q0, updated and stored below
    psubsb      mm0, mm6                    ; q0 - p0
    paddsb      mm2, mm0                    ; p1 - q1 + 1 * (q0 - p0)
    paddsb      mm2, mm0                    ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm2, mm0                    ; p1 - q1 + 3 * (q0 - p0)
    pand        mm5, mm2                    ; zero the filter where the mask is clear

    ; ---- q0 side: q0 -= (filter + 4) >> 3 ----
    paddsb      mm5, [t4 GLOBAL]            ; filter + 4

    ; MMX has no per-byte shift, so the signed >> 3 is done on the low and
    ; high byte of each word separately and the halves are OR-ed together.
    movq        mm0, mm5
    psllw       mm0, 8                      ; low byte -> high byte of the word
    psraw       mm0, 3                      ; signed >> 3 of that byte
    psrlw       mm0, 8                      ; back into the low byte, high byte = 0
    movq        mm1, mm5
    psraw       mm1, 11                     ; high byte: signed >> 3, landing in low byte
    psllw       mm1, 8                      ; back into the high byte, low byte = 0
    por         mm0, mm1                    ; per-byte (filter + 4) >> 3

    psubsb      mm4, mm0                    ; q0 - adjustment
    pxor        mm4, [t80 GLOBAL]           ; back to unsigned
    movq        [rsi], mm4                  ; store q0

    ; ---- p0 side: p0 += (filter + 3) >> 3 ----
    psubsb      mm5, [t1s GLOBAL]           ; (filter + 4) - 1 = filter + 3

    movq        mm0, mm5
    psllw       mm0, 8                      ; same per-byte signed >> 3 as above
    psraw       mm0, 3
    psrlw       mm0, 8
    psraw       mm5, 11
    psllw       mm5, 8
    por         mm0, mm5                    ; per-byte (filter + 3) >> 3

    paddsb      mm6, mm0                    ; p0 + adjustment
    pxor        mm6, [t80 GLOBAL]           ; back to unsigned
    movq        [rsi+rax], mm6              ; store p0

    add         rsi, 8                      ; next 8 pixels along the edge
    add         rdi, 8
    dec         rcx
    jnz         nexts8_h

nexts8_h_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_simple_vertical_edge_mmx)
sym(vp8_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32
; reserve 32 bytes 153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;src_ptr 153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax*4- 2]; ; 154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(5) ;count 154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernexts8_v: 154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rsi + rax]; 154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm4 ; 53 43 52 42 51 41 50 40 155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm4, mm6 ; 
71 61 51 41 70 60 50 40 155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm0 ; 13 03 12 02 11 01 10 00 157290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 13 03 12 02 11 01 10 00 157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm2 ; 33 23 13 03 32 22 12 02 157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm1, mm4 ; 71 61 51 41 
31 21 11 01 = p0 158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; calculate mask 158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm0 ; p1 158890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm3 ; q1 158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm7, mm6 ; q1-=p1 159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm6, mm3 ; p1-=q1 159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm6, mm7 ; abs(p1-q1) 159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm6, [tfe GLOBAL] ; set lsb of each byte to zero 159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm6, 1 ; abs(p1-q1)/2 159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; p0 159690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm2 ; q0 159790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm2 ; p0-=q0 159990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm4, mm1 ; q0-=p0 160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm5, mm4 ; abs(p0 - q0) 160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm5 ; abs(p0-q0)*2 160390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 160590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(2) 
;flimit ; get flimit 160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, [rdx] 160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(3) ; get limit 160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, [rdx] 160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm7, mm7 ; flimit*2 (less than 255) 161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddb mm7, mm6 ; flimit * 2 + limit (less than 255) 161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pcmpeqb mm5, mm7 ; mm5 = mask 161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 161690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; start work on filters 161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t0, mm0 161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq t1, mm3 161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, [t80 GLOBAL] ; p1 offset to convert to signed values 162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] ; q1 offset to convert to signed values 162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm0, mm3 ; p1 - q1 162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm1 ; p0 162590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm2 ; q0 162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; offset to convert to signed values 162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, [t80 
GLOBAL] ; offset to convert to signed values 163090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm7 ; offseted ; q0 163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm7, mm6 ; q0 - p0 163390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) 163490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) 163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) 163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pand mm5, mm0 ; mask filter values we don't care about 163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm5, [t4 GLOBAL] ; 3* (q0 - p0) + (p1 - q1) + 4 164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm5 ; get a copy of filters 164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm0, 8 ; shift left 8 164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 3 ; arithmetic shift right 11 164590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm0, 8 164690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm7, mm5 ; get a copy of filters 164890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm7, 11 ; arithmetic shift right 11 164990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm7, 8 ; shift left 8 to put it back 165090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm7 ; put the two together to get result 165290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
165390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm3, mm0 ; q0-= q0sz add 165490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, [t80 GLOBAL] ; unoffset 165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; now do +3 side 165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsb mm5, [t1s GLOBAL] ; +3 instead of +4 165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm5 ; get a copy of filters 166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm0, 8 ; shift left 8 166190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm0, 3 ; arithmetic shift right 11 166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm0, 8 166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm5, 11 ; arithmetic shift right 11 166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psllw mm5, 8 ; shift left 8 to put it back 166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber por mm0, mm5 ; put the two together to get result 166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddsb mm6, mm0 ; p0+= p0 add 166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, [t80 GLOBAL] ; unoffset 167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, t0 167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, t1 167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm0 = 70 60 50 40 30 20 10 00 167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm6 = 71 61 51 41 31 21 11 01 167790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm3 = 72 62 
52 42 32 22 12 02 167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; mm4 = 73 63 53 43 33 23 13 03 167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; transpose back to write out 168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, mm0 ; 168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm3 ; 168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; 71 70 61 60 51 50 41 40 168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm0 ; 31 30 21 20 11 10 01 00 169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*4], mm0 ; write 03 02 01 00 169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 169890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 
170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi+rax*4], mm0 ; write 13 12 11 10 170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi+rax*2], mm6 ; write 23 22 21 20 170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 ; 33 32 31 30 170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi], mm1 ; write 43 42 41 40 170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi + rax], mm6 ; write 33 32 31 30 170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber neg rax 171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi + rax*2], mm5 ; write 63 62 61 60 171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm1, 32 ; 53 52 51 50 171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi], mm1 ; write out 53 52 51 50 171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm5, 32 ; 73 72 71 70 171690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi + rax*2], mm5 ; write 73 72 71 70 171890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 171990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi+rax*8] ; next 8 172090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 172190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec rcx 172290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz nexts8_v 172390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 172490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 32 172590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsp 172690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 
; Remainder of the epilogue: pop rdi and rsi, run the RESTORE_GOT and
; UNSHADOW_ARGS macros, pop rbp and return.
; NOTE(review): the matching prologue is outside this part of the file;
; presumably it set these up in the opposite order -- confirm there.
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
;                  int y_stride,
;                  loop_filter_info *lfi)
;{
;
;
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;}

; ---------------------------------------------------------------------------
; Read-only constant tables.
; Every table is 8 bytes long (the width of one MMX register) and is placed
; on a 16-byte boundary by the preceding "align 16".
; ---------------------------------------------------------------------------
SECTION_RODATA
align 16
tfe:                                ; eight bytes of 0xfe
    times 8 db 0xfe
align 16
t80:                                ; eight bytes of 0x80; the filter code XORs
                                    ; pixels with this to convert them to signed
                                    ; values and back again ("unoffset")
    times 8 db 0x80
align 16
t1s:                                ; eight bytes of 0x01; subtracted from the
                                    ; "+4" filter value to obtain the "+3" one
    times 8 db 0x01
align 16
t3:                                 ; eight bytes of 0x03
    times 8 db 0x03
align 16
t4:                                 ; eight bytes of 0x04; the "+ 4" term added
                                    ; to 3 * (q0 - p0) + (p1 - q1)
    times 8 db 0x04
align 16
ones:                               ; four 16-bit words, each 1
    times 4 dw 0x0001
align 16
s27:                                ; four words of 0x1b00 (27 << 8)
    times 4 dw 0x1b00
align 16
s18:                                ; four words of 0x1200 (18 << 8)
    times 4 dw 0x1200
align 16
s9:                                 ; four words of 0x0900 (9 << 8)
    times 4 dw 0x0900
align 16
s63:                                ; four words of 0x003f (63)
    times 4 dw 0x003f