loopfilter_mmx.asm revision f71323e297a928af368937089d3ed71239786f86
190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_horizontal_edge_mmx
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int src_pixel_step,
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  count
2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
;
; Filters across a horizontal edge, 8 pixel columns per loop iteration,
; for 'count' iterations.
;
; src_ptr addresses the first row below the edge (q0). Rows p3..p0 are read
; at src_ptr - 4*step .. src_ptr - 1*step and rows q1..q3 at
; src_ptr + 1*step .. src_ptr + 3*step. Rows p1, p0, q0 and q1 are rewritten
; in place; p3, p2, q2 and q3 are only read.
;
; flimit, limit and thresh each point at 8 bytes that are loaded with movq
; and used as per-byte values.
;
; Register roles inside the loop:
;   rsi   = pointer to the current 8 bytes of row q0 (advanced by 8 per pass)
;   rdi   = rsi + step (row q1)
;   rax   = step; negated while the p rows are addressed, restored before
;           the loop branch
;   rcx   = remaining iterations
;   rdx   = scratch pointer for limit / flimit / thresh
;   t0,t1 = stack temporaries holding abs(q1-q0) and abs(p1-p0)
;
; NOTE(review): count is only tested after the body (dec/jnz), so the body
; always runs at least once and count == 0 is not handled; callers
; presumably pass count >= 1.
; NOTE(review): arg(), sym(), SHADOW_ARGS_TO_STACK, GET_GOT, ALIGN_STACK,
; RESTORE_GOT, UNSHADOW_ARGS, GLOBAL and the constants tfe, t80, t3, t4 and
; ones are not defined in this part of the file; remarks about them below
; describe how they are used here, not their definitions.
2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_horizontal_edge_mmx)
2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_horizontal_edge_mmx):
2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6                      ; presumably makes the 6 arguments reachable through arg(n) - confirm in the ABI macros
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx                             ; presumably sets up rbx for the [... GLOBAL] constant references below - confirm
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax                         ; NOTE(review): presumably saves the old rsp on the stack (it is popped back into rsp at the end) - confirm
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub         rsp, 32                         ; reserve 32 bytes for t0 and t1
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];  holds abs(q1 - q0)
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];  holds abs(p1 - p0)
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; rax = sign-extended byte step from one row to the next
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, dword ptr arg(5) ;count              ; rcx = loop iterations, 8 columns each
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_h:
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(3) ;limit
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]            ; mm7 = 8 limit bytes (reloaded each pass; mm7 is reused later)
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi, rax
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate breakout conditions
        ; Each neighbouring-row difference is formed as an unsigned absolute
        ; value (two saturating subtracts OR-ed together), then limit is
        ; subtracted with saturation so a byte stays nonzero only where the
        ; difference exceeds limit. All results are OR-ed into mm1.
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdi+2*rax]      ; q3
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, [rsi+2*rax]      ; q2
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm1              ; q2
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm2              ; q2-=q3
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm6              ; q3-=q2
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1, mm2              ; abs(q3-q2)
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+rax]        ; q1
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm4              ; q1
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm6              ; q1-=q2
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6, mm3              ; q2-=q1
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm6              ; abs(q2-q1)
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7              ; nonzero where abs(q2-q1) > limit
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4               ; accumulate into mm1
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi]            ; q0
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm4              ; q0 (mm0 stays live until the filter stage)
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm3              ; q0-=q1
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm0              ; q1-=q0
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm3              ; abs(q0-q1)
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0, mm4               ; save to t0 (reused for the thresh test)
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7              ; nonzero where abs(q0-q1) > limit
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4               ; accumulate into mm1
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; negate pitch to deal with above border
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rsi+4*rax]      ; p3
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rdi+4*rax]      ; p2 (rdi is row +1, so this is row -3)
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm4              ; p2
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm2              ; p2-=p3
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm5              ; p3-=p2
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm2              ; abs(p3 - p2)
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7              ; nonzero where abs(p3-p2) > limit
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4               ; accumulate into mm1
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+2*rax]      ; p1
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm4              ; p1
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm5              ; p1-=p2
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; p2-=p1
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm5              ; abs(p2 - p1)
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7              ; nonzero where abs(p2-p1) > limit
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4               ; accumulate into mm1
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm3              ; p1 (kept for the abs(p1-q1) term below)
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+rax]        ; p0
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm4              ; p0
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm3              ; p0-=p1
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm5              ; p1-=p0
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm3              ; abs(p1 - p0)
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1, mm4               ; save to t1 (reused for the thresh test)
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7              ; nonzero where abs(p1-p0) > limit
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4               ; accumulate into mm1
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi]            ; q1
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, mm3              ; q1
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm2              ; q1-=p1
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm4              ; p1-=q1
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm2, mm3              ; abs(p1-q1)
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, [tfe GLOBAL]     ; set lsb of each byte to zero, so the word-wide shift below cannot carry a bit into the neighbouring byte
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2, 1                ; abs(p1-q1)/2
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm5              ; p0 (mm6 stays live until the filter stage)
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rsi]            ; q0
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; p0-=q0
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm6              ; q0-=p0
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5, mm3              ; abs(p0 - q0)
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm5              ; abs(p0-q0)*2 (saturating)
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2 (saturating)
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(2) ;flimit           ; get flimit
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdx]            ; flimit mm2
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm2, mm2              ; flimit*2 (paddb wraps; relies on the result staying below 256)
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm7, mm2              ; flimit * 2 + limit (paddb wraps; relies on the result staying below 256)
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,    mm5           ; mm1 is now nonzero in every byte where any test failed
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5,    mm5           ; mm5 = 0
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,    mm5           ; mask mm1: 0xff in bytes where every test passed, 0 elsewhere
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(4) ;thresh           ; get thresh
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]            ; mm7 = 8 thresh bytes
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, t0               ; get abs (q1 - q0)
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7              ; nonzero where abs(q1-q0) > thresh
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, t1               ; get abs (p1 - p0)
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm7              ; nonzero where abs(p1-p0) > thresh
        ; NOTE(review): the two excesses are combined with a wrapping byte add
        ; rather than por; the sum would read as zero if it came to exactly
        ; 256. Confirm that cannot occur for bytes that pass the mask in mm1.
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm5       ; mm5 is still 0: 0xff where neither difference exceeded thresh
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5,        mm5       ; mm5 = all ones
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm5       ; invert: mm4 = 0xff where high edge variance was detected (hvm)
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
        ; Live on entry: mm0 = q0, mm6 = p0 (both still unsigned),
        ; mm1 = filter mask, mm4 = hvm. rax is still negated.
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rsi+2*rax]      ; p1
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdi]            ; q1
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1 (signed saturating)
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; p0 offset to convert to signed values
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, [t80 GLOBAL]     ; q0 offset to convert to signed values
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0 (signed copy kept for the write-back)
16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm1, mm2                  ; mask filter values we don't care about
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; two copies of the masked filter value, one per rounding
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm1, [t4 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, [t3 GLOBAL]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; There is no per-byte arithmetic shift, so each signed byte is
        ; unpacked into the HIGH half of a word (low half zero) and the word
        ; is shifted right arithmetically by 8 + 3 = 11, giving the
        ; sign-extended byte >> 3; the words are then packed back to bytes.
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0             ; 0
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5, mm5             ; 0
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm2            ; low 4 bytes of mm2 into the high byte of each word
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5, mm2            ; high 4 bytes of mm2 into the high byte of each word
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11             ; sign extended shift right by 3
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5, 11             ; sign extended shift right by 3
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm5            ; back to 8 signed bytes
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm1              ; abcdefgh
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm1              ; e0f0g0h0
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1              ; 0
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1, mm5              ; a0b0c0d0
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 11               ; sign extended shift right by 3
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm0              ; save results (low four words, still word-wide)
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsw      mm5, [ones GLOBAL]    ; + ones on the low words (presumably 1 per word - confirm) before the extra shift
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsw      mm1, [ones GLOBAL]    ; same for the high words
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm5, mm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3) + ones) >> 1
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4, mm5              ; mm4 = ~hvm & mm5: outer-tap adjustment, zero where high edge variance was detected
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm2              ; p0 += (filter + 3) >> 3
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; unoffset
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax], mm6        ; write back p0 (rax is still negated)
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rsi+2*rax]      ; p1
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; reoffset
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm4              ; p1 += outer-tap adjustment
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; unoffset
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+2*rax], mm6      ; write back p1
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; q0 -= (filter + 4) >> 3
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]     ; unoffset
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi], mm3            ; write back q0
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm7, mm4              ; q1 -= outer-tap adjustment (mm7 still holds signed q1)
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, [t80 GLOBAL]     ; unoffset
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi], mm7            ; write back q1
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rsi,8                 ; advance to the next 8 columns
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; restore the positive step for the next pass
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jnz         next8_h
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 32                                 ; release t0/t1
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp                                     ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_vertical_edge_mmx
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_vertical_edge_mmx)
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_vertical_edge_mmx):
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 64      ; reserve 64 bytes
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi,        arg(0) ;src_ptr
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi + rax*4 - 4]
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx,        dword ptr arg(5) ;count
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_v:
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi,        rax
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;transpose
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 36 35 25 34 24
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q2-q3
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q3-q2
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm6                         ; q1-q2
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm5                         ; q2-q1
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm6,        mm3                         ; mm6=abs(q2-q1)
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+24],   mm5                         ; save q1
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+16],   mm0                         ; save q0
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm0                         ; p2-p3
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm1                         ; p3-p2
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm2                         ; mm0=abs(p3-p2)
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+8],    mm3                         ; save p0
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx],      mm2                         ; save p1
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; mm5 = p1
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm1                         ; p1-p2
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm5                         ; p2-p1
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm2                         ; mm1=abs(p2-p1)
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(3) ;limit
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rdx]                       ; mm4 = limit
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm4
36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm4
36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm4
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm6
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm1
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm5                         ; p1
37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm3                         ; mm3=mm7=p0
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm5                         ; p0 - p1
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm3                         ; p1 - p0
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm7                         ; abs(p1-p0)
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0,         mm5                         ; save abs(p1-p0)
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm4
38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm5                         ; mm0=mask
38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        [rdx+16]                    ; mm5=q0
38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx+24]                    ; mm7=q1
38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm5                         ; mm6=q0
39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm7                         ; q1
39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q0-q1
39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q1-q0
39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5                         ; abs(q1-q0)
39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1,         mm7                         ; save abs(q1-q0)
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4
39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ; mask
40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; q1
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm1                         ; q1-=p1
40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm2                         ; p1-=q1
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm1                         ; abs(p1-q1)
40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm5,        [tfe GLOBAL]                ; set lsb of each byte to zero
40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,        1                           ; abs(p1-q1)/2
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(2) ;flimit                      ;
41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        [rdx]                       ;flimit  mm2
41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm3                         ; mm1=mm3=p0
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; mm7=mm6=q0
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm7                         ; p0-q0
41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm3                         ; q0-p0
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm7                         ; abs(q0-p0)
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm0;                        ; mask
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,        mm0
43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(4) ;thresh            ; get thresh
43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx]
43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        t0              ; get abs (p1 - p0)
43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4,        mm7
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        t1              ; get abs (q1 - q0)
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm7
44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm0
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm0,        mm0
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm0
44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        [rdx]           ; p1
45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx+24]        ; q1
45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rdx+8]         ; p0
45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        [rdx+16]        ; q0
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2,        [t80 GLOBAL]    ; p1 offset to convert to signed values
45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        [t80 GLOBAL]    ; q1 offset to convert to signed values
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2,        mm7             ; p1 - q1
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6,        [t80 GLOBAL]    ; offset to convert to signed values
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        [t80 GLOBAL]    ; offset to convert to signed values
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm0             ; q0
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0,        mm6             ; q0 - p0
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand       mm1,        mm2              ; mask filter values we don't care about
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm1,        [t4 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 4
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        [t3 GLOBAL]       ; 3* (q0 - p0) + hvm(p1 - q1) + 3
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0          ;
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5,        mm5
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm2         ;
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5,        mm2         ;
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,        11              ;
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5,        11
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0,        mm5
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0           ; 0
49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1           ; abcdefgh
49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm1           ; e0f0g0h0
49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,        11                ; sign extended shift right by 3
49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1,        mm1           ; 0
50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,        mm5           ; a0b0c0d0
50190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1,        11                ; sign extended shift right by 3
50390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm0              ; save results
50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsw      mm5,        [ones GLOBAL]
50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsw      mm1,        [ones GLOBAL]
50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4,        mm5             ; high edge variance additive
51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6,        mm2             ; p0+= p0 add
51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6,        [t80 GLOBAL]    ; unoffset
51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6=p0                               ;
52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        [rdx]           ; p1
52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1,        [t80 GLOBAL]    ; reoffset
52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm1,        mm4                 ; p1+= p1 add
52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1,        [t80 GLOBAL]        ; unoffset
52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = p0 mm1 = p1
52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3,        mm0                 ; q0-= q0 add
52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3,        [t80 GLOBAL]        ; unoffset
52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3 = q0
53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm7,        mm4                 ; q1-= q1 add
53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        [t80 GLOBAL]        ; unoffset
53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm7 = q1
53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; transpose and write back
53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 =    72 62 52 42 32 22 12 02
53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 =    73 63 53 43 33 23 13 03
53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3 =    74 64 54 44 34 24 14 04
53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm7 =    75 65 55 45 35 25 15 05
54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm2 = 15 14 13 12 05 04 03 02
56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = 35 34 33 32 25 24 23 22
56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = 55 54 53 52 45 44 43 42
56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm5 = 75 74 73 72 65 64 63 62
56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*4+2], mm2
56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm2,        32
56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax*4+2], mm2
57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*2+2], mm6
57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm6,        32
57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax+2],mm6
57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+2],    mm1
57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm1,        32
57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+2],    mm1
58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax+2],mm5
58390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm5,        32
58490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax*2+2], mm5
58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi+rax*8]
58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jnz         next8_v
59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 64
59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
59590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_horizontal_edge_mmx
60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
60690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
60890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
61190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_mbloop_filter_horizontal_edge_mmx)
61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_horizontal_edge_mmx):
61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 32      ; reserve 32 bytes
62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
62490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
62590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, dword ptr arg(5) ;count
63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_mbh:
63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(3) ;limit
63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]
63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi, rax
63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate breakout conditions
63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdi+2*rax]      ; q3
63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, [rsi+2*rax]      ; q2
64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm1              ; q2
64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm2              ; q2-=q3
64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm6              ; q3-=q2
64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1, mm2              ; abs(q3-q2)
64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm7
64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+rax]        ; q1
64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm4              ; q1
65090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm6              ; q1-=q2
65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6, mm3              ; q2-=q1
65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm6              ; abs(q2-q1)
65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask,      mm3=q1, mm7 = limit
65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi]            ; q0
66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm4              ; q0
66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm3              ; q0-=q1
66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm0              ; q1-=q0
66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm3              ; abs(q0-q1)
66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0, mm4               ; save to t0
66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
67190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; negate pitch to deal with above border
67290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rsi+4*rax]      ; p3
67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rdi+4*rax]      ; p2
67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm4              ; p2
67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm2              ; p2-=p3
67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm5              ; p3-=p2
67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm2              ; abs(p3 - p2)
67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+2*rax]      ; p1
68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm4              ; p1
68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm5              ; p1-=p2
68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; p2-=p1
68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm5              ; abs(p2 - p1)
68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm3              ; p1
69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+rax]        ; p0
69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm4              ; p0
69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm3              ; p0-=p1
69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm5              ; p1-=p0
70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm3              ; abs(p1 - p0)
70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1, mm4               ; save to t1
70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm5 = p0
70690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi]            ; q1
70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, mm3              ; q1
70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm2              ; q1-=p1
70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm4              ; p1-=q1
71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm2, mm3              ; abs(p1-q1)
71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, [tfe GLOBAL]     ; set lsb of each byte to zero
71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2, 1                ; abs(p1-q1)/2
71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm5              ; p0
71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0
71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; p0-=q0
71790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm6              ; q0-=p0
71890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5, mm3              ; abs(p0 - q0)
71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm5              ; abs(p0-q0)*2
72090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(2) ;flimit           ; get flimit
72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdx]            ; flimit mm2
72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm2, mm2              ; flimit*2 (less than 255)
72590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm7, mm2              ; flimit * 2 + limit (less than 255)
72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,    mm5
72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5,    mm5
73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,    mm5           ; mask mm1
73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = flimit * 2 + limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = p0,
73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(4) ;thresh           ; get thresh
73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]            ;
73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, t0               ; get abs (q1 - q0)
73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, t1               ; get abs (p1 - p0)
74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm7
74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm5
74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5,        mm5
74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm5
74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
75090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
75290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = p0, mm4=hev
75390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
75490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rsi+2*rax]      ; p1
75590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdi]            ; q1
75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1
75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0
76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0
76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm1, mm2              ; mask filter values we don't care about
76890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; vp8_filter
77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2       ;
77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm5,        [t3 GLOBAL];
77690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm5              ; e0f0g0h0
78190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm5              ; a0b0c0d0
78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm0              ; Filter2
78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, [t4 GLOBAL]      ; vp8_signed_char_clamp(Filter2 + 4)
78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm2              ; e0f0g0h0
79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm2              ; a0b0c0d0
79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm0 = Filter1 = (Filter2 + t4) >> 3, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2 = (Filter2 + t3) >> 3, mm4 = hev, mm6 = ps0
79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm5              ; ps0 =ps0 + Filter2
80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; vp8_filter &= ~hev;
80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Filter2 = vp8_filter;
80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4, mm1              ; vp8_filter&=~hev
80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3=qs0, mm4=filter2, mm6=ps0
80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs0 - u);
81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq0 = s^0x80;
81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps0 + u);
81490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op0 = s^0x80;
81590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0
81690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm1, [s27 GLOBAL]
82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm2, [s27 GLOBAL]
82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1, [s63 GLOBAL]
82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2, [s63 GLOBAL]
82590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
82690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax], mm6
83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi],     mm3
83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 2/7th difference across boundary
83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs1 - u);
84090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq1 = s^0x80;
84190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps1 + u);
84290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op1 = s^0x80;
84390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm1, [s18 GLOBAL]
84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm2, [s18 GLOBAL]
84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1, [s63 GLOBAL]
85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2, [s63 GLOBAL]
85190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
85290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
85490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
85590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi]
85690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rsi+rax*2]       ; p1
85790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
85890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
85990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
86090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
86290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
86590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi], mm3
86790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*2], mm6
86890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 1/7th difference across boundary
87090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
87190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs2 - u);
87290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq2 = s^0x80;
87390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps2 + u);
87490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op2 = s^0x80;
87590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
87690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
87790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
87890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
87990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm1, [s9 GLOBAL]
88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm2, [s9 GLOBAL]
88190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1, [s63 GLOBAL]
88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2, [s63 GLOBAL]
88390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
88490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
88590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
88690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdi+rax*4]
88990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
89090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi+rax  ]
89190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
89390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
89490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
89690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
89790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
89990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
90090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax  ], mm3
90190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
90290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*4], mm6
90390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
90490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;EARLY_BREAK_OUT:
90590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
90690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rsi,8
90790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
90890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jnz         next8_mbh
90990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 32
91190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
91290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
91390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
91490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
91590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
91690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
91890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
91990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
92190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_vertical_edge_mmx
92290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
92390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
92490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
92690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
92890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
92990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_mbloop_filter_vertical_edge_mmx)
93190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_vertical_edge_mmx):
93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
93390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
93590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
93790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
93990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 96      ; reserve 96 bytes
94290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
94390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];
94590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi,        arg(0) ;src_ptr
94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
94890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi + rax*4 - 4]
95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx,        dword ptr arg(5) ;count
95290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernext8_mbv:
95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
95490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;transpose
95690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
95890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
96090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64
96190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
96390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50
96490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
96690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
96790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
96990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40
97090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
97390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
97690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
97990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
98190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
98390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
98490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
98690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24
98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20
98990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
99290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
99590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
99890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
100090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
100190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
100390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
100490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+56],   mm7
100690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q2-q3
100790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+48],   mm6
101090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q3-q2
101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
101390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
101490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
101690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
101790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
101990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm6                         ; q1-q2
102090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm5                         ; q2-q1
102290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm6,        mm3                         ; mm6=abs(q2-q1)
102390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+40],   mm5                         ; save q1
102590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+32],   mm0                         ; save q0
102690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
102890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
102990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
103190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
103490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
103590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
103790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx],      mm0                         ; save p3
104090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+8],    mm1                         ; save p2
104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
104390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm0                         ; p2-p3
104490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm1                         ; p3-p2
104690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm2                         ; mm0=abs(p3-p2)
104790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
104990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
105090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
105290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+24],   mm3                         ; save p0
105390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+16],   mm2                         ; save p1
105590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; mm5 = p1
105690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm1                         ; p1-p2
105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm5                         ; p2-p1
105990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm2                         ; mm1=abs(p2-p1)
106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(3) ;limit
106290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rdx]                       ; mm4 = limit
106490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit
106590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit
106890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
107090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm6                         ; or
107190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm1                         ;
107390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
107490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm5                         ; p1
107690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm3                         ; mm3=mm7=p0
107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm5                         ; p0 - p1
107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm3                         ; p1 - p0
108190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm7                         ; abs(p1-p0)
108290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0,         mm5                         ; save abs(p1-p0)
108490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
108590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
108790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm5                         ; mm0=mask
108890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        [rdx+32]                    ; mm5=q0
109090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx+40]                    ; mm7=q1
109190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm5                         ; mm6=q0
109390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm7                         ; q1
109490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q0-q1
109590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q1-q0
109790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5                         ; abs(q1-q0)
109890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1,         mm7                         ; save abs(q1-q0)
110090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit
110190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
110290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ; mask
110390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
110490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; q1
110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm1                         ; q1-=p1
110690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm2                         ; p1-=q1
110790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm1                         ; abs(p1-q1)
110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm5,        [tfe GLOBAL]                ; set lsb of each byte to zero
110990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,        1                           ; abs(p1-q1)/2
111090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
111190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(2) ;flimit                      ;
111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
111390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        [rdx]                       ;flimit  mm2
111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm3                         ; mm1=mm3=p0
111590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
111690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; mm7=mm6=q0
111790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm7                         ; p0-q0
111890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
111990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm3                         ; q0-p0
112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm7                         ; abs(q0-p0)
112190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
112290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm2,        mm2                         ; flimit*2 (less than 255)
112590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm4,        mm2                         ; flimit * 2 + limit (less than 255)
112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm0;                        ; mask
112990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0
113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,        mm0
113290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
113490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(4) ;thresh            ; get thresh
113590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx]
113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;
113790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        t0              ; get abs (q1 - q0)
113890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4,        mm7             ; abs(q1 - q0) > thresh
113990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        t1              ; get abs (p1 - p0)
114190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm7             ; abs(p1 - p0)> thresh
114290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
114490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm0
114590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm0,        mm0
114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm0
114890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
115390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
115490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
115690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdx+16]         ; p1
115790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx+40]         ; q1
115890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
115990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
116090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1
116190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdx+24]         ; p0
116390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, [rdx+32]         ; q0
116490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; offset to convert to signed values
116590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, [t80 GLOBAL]     ; offset to convert to signed values
116690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0
116890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0
116990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
117090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 2 * (q0 - p0)
117190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
117290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand       mm1, mm2           ; mask filter values we don't care about
117390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
117590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; vp8_filter
117690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
117790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2       ;
117990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm5,        [t3 GLOBAL];
118090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
118290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
118390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm5              ; e0f0g0h0
118590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
118690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm5              ; a0b0c0d0
118790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
118890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
118990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm0              ; Filter2
119190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, [t4 GLOBAL]      ; vp8_signed_char_clamp(Filter2 + 4)
119390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
119490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm2              ; e0f0g0h0
119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
119890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm2              ; a0b0c0d0
119990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
120190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm0=Filter1, mm1=vp8_filter, mm3=qs0, mm4=hev, mm5=Filter2, mm6=ps0
120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
120490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm5              ; ps0 =ps0 + Fitler2
120590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
120790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; vp8_filter &= ~hev;
120890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Filter2 = vp8_filter;
120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4, mm1              ; vp8_filter&=~hev
121090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3=qs0, mm4=filter2, mm6=ps0
121390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
121590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs0 - u);
121690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq0 = s^0x80;
121790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps0 + u);
121890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op0 = s^0x80;
121990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0
122090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
122190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
122290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
122390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
122490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
122590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm1, [s27 GLOBAL]
122690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm2, [s27 GLOBAL]
122790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1, [s63 GLOBAL]
122890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2, [s63 GLOBAL]
122990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
123090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
123190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
123290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
123390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
123490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
123590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
123690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
123790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
123890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+24], mm6
123990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+32], mm3
124090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
124190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 2/7th difference across boundary
124290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
124390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs1 - u);
124490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq1 = s^0x80;
124590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps1 + u);
124690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op1 = s^0x80;
124790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
124890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
124990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
125090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
125190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm1, [s18 GLOBAL]
125290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm2, [s18 GLOBAL]
125390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1, [s63 GLOBAL]
125490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2, [s63 GLOBAL]
125590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
125690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
125790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
125890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
125990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdx + 40]
126090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdx + 16]       ; p1
126190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
126290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
126490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
126590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
126790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
126890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
126990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx + 40], mm3
127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx + 16], mm6
127190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
127290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 1/7th difference across boundary
127390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
127490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs2 - u);
127590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq2 = s^0x80;
127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps2 + u);
127790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op2 = s^0x80;
127890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
127990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
128190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
128290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm1, [s9 GLOBAL]
128390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmulhw      mm2, [s9 GLOBAL]
128490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1, [s63 GLOBAL]
128590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2, [s63 GLOBAL]
128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
128790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
128990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdx+ 8]
129190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdx+48]
129290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]
129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]
129590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
129790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
129890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]           ; mm6 = 71 61 51 41 31 21 11 01
130090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]           ; mm3 = 76 66 56 46 36 26 15 06
130190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; transpose and write back
130390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
130490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    mm0                 ; mm0 = 70 60 50 40 30 20 10 00
130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
130790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,    mm6                 ; mm3 = 71 70 61 60 51 50 41 40
130890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
131090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm2                 ; mm3 = 72 62 52 42 32 22 12 02
131190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
131390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm6,    [rdx+24]            ; mm3 = 73 72 63 62 53 52 43 42
131490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
131690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00
131790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
131990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40
132090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
132290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60
132390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
132590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04
132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 15 06
132890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06
132990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
133190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04
133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
133490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00
133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
133790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10
133890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*4], mm0            ; write out
134090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*4], mm6            ; write out
134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
134390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 20 20
134490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
134690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*2], mm0            ; write out
134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*2], mm5            ; write out
134990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 54 54 45 44
135290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46
135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 54 54 45 44
135590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44
135690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
135890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40
135990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm4                 ; mm4 = 73 72 71 70 63 62 61 60
136190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40
136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi],  mm0                 ; write out
136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi],  mm1                 ; write out
136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
136890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 60
137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*2], mm3
137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*2], mm4
137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi+rax*8]
137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jnz         next8_mbv
137990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
138090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 96
138190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
138290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
138390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
138490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
138590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
138690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
138790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
138890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
138990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_horizontal_edge_mmx
139290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
139390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
139490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
139590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
139690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
139790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
139890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
139990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
;
; Simple loop filter across a horizontal edge, 8 pixels per loop iteration.
; With pitch = src_pixel_step, each iteration reads four rows of 8 bytes:
;   p1 = [src - 2*pitch], p0 = [src - pitch], q0 = [src], q1 = [src + pitch]
; builds a per-byte mask that is set where
;   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit
; and, for the bytes selected by the mask, applies
;   filter = p1 - q1 + 3 * (q0 - p0)        (signed saturating byte arithmetic)
;   q0    -= (filter + 4) >> 3
;   p0    += ((filter + 4) - 1) >> 3
; Only the p0 and q0 rows are written back; src then advances by 8 bytes.
;
; Register roles inside the loop:
;   rsi = current q0 row pointer, rdi = rsi + pitch (q1 row),
;   rax = pitch (negated while the p rows are addressed), rcx = iterations left.
;
; Notes:
;   - flimit and limit are each read as 8 bytes; thresh (arg 4) is never read here.
;   - The loop tests rcx only at the bottom (dec/jnz), so count must be >= 1.
;   - tfe, t80, t4 and t1s are constants defined outside this block; the remarks
;     below about their values follow the original comments (TODO confirm
;     against their definitions).
140090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_simple_horizontal_edge_mmx)
140190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_horizontal_edge_mmx):
140290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
140390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
140490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
140590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
140690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
140790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
140890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
140990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
141090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
141190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; sign-extended; used as the row pitch
141290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
141390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, dword ptr arg(5) ;count              ; number of 8-byte iterations
141490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernexts8_h:
141590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(3) ;limit
141690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]            ; mm7 = 8 bytes read from limit
141790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(2) ;flimit           ; get flimit
141890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdx]            ; mm3 = 8 bytes read from flimit
141990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm3, mm3              ; flimit*2 (paddb wraps; assumes the result stays below 255)
142090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm3, mm7              ; flimit * 2 + limit (same assumption: must fit in a byte)
142190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
142290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
142390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi, rax              ; rdi = rsi + pitch (q1 row)
142490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; rax = -pitch, to address the rows above the edge
142590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
142690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate mask
142790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, [rsi+2*rax]      ; p1
142890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, [rdi]            ; q1
142990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; mm2 = copy of p1, kept for the filter stage
143090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, mm0              ; mm7 = copy of q1, kept for the filter stage (limit no longer needed)
143190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, mm0              ; mm4 = scratch copy of q1
143290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0, mm1              ; q1-=p1 (unsigned saturating: 0 where p1 >= q1)
143390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm4              ; p1-=q1 (unsigned saturating: 0 where q1 >= p1)
143490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1, mm0              ; abs(p1-q1)
143590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm1, [tfe GLOBAL]     ; set lsb of each byte to zero so the word shift below cannot leak into the next byte
143690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1, 1                ; abs(p1-q1)/2
143790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
143890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, [rsi+rax]        ; p0
143990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi]            ; q0
144090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm4              ; q0 (copy kept for the filter stage)
144190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm5              ; p0 (copy kept for the filter stage)
144290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm4              ; p0-=q0 (unsigned saturating)
144390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm6              ; q0-=p0 (unsigned saturating)
144490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5, mm4              ; abs(p0 - q0)
144590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm5              ; abs(p0-q0)*2 (saturates at 255)
144690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2 (saturates at 255)
144790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
144890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; nonzero only where abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
144990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, mm3              ; mm3 = 0
145090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5, mm3              ; mm5 = mask: 0xff where the sum is within the limit, 0x00 elsewhere
145190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
145290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
145390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, [t80 GLOBAL]     ; p1 offset to convert to signed values
145490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, [t80 GLOBAL]     ; q1 offset to convert to signed values
145590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1 (signed saturating)
145690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
145790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; p0 offset to convert to signed values
145890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, [t80 GLOBAL]     ; q0 offset to convert to signed values
145990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0 (signed form, kept for the write-back)
146090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0 (signed saturating)
146190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
146290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
146390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
146490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm5, mm2              ; mask filter values we don't care about (mm5 = filter, 0 where mask is clear)
146590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; do + 4 side
146790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm5, [t4 GLOBAL]      ; 3* (q0 - p0) + (p1 - q1) + 4
146890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; MMX has no per-byte arithmetic shift, so the signed >> 3 is done on the
        ; low and high byte of every 16-bit word separately and then recombined.
146990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm5              ; get a copy of filters
147090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0, 8                ; shift left 8: low byte of each word moves to the high byte
147190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 3                ; arithmetic shift right 3, sign taken from that byte
147290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0, 8                ; move it back down: low byte = (low byte >> 3), high byte = 0
147390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, mm5              ; get a copy of filters
147490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 11               ; arithmetic shift right 11: (high byte >> 3) lands in the low byte
147590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm1, 8                ; shift left 8 to put it back
147690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0, mm1              ; put the two together to get result: per-byte (filter + 4) >> 3
147890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; q0-= q0 add
148090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, [t80 GLOBAL]     ; unoffset
148190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi], mm3            ; write back q0 row
148290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; now do +3 side
148590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm5, [t1s GLOBAL]      ; +3 instead of +4 (mm5 still holds filter + 4; presumably t1s is 1 per byte)
148690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm5              ; get a copy of filters
148890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0, 8                ; shift left 8: low byte of each word moves to the high byte
148990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 3                ; arithmetic shift right 3, sign taken from that byte
149090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0, 8                ; move it back down: low byte = (low byte >> 3), high byte = 0
149190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5, 11               ; arithmetic shift right 11: (high byte >> 3) lands in the low byte
149290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm5, 8                ; shift left 8 to put it back
149390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0, mm5              ; put the two together to get result: per-byte ((filter + 4) - 1) >> 3
149490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm0              ; p0+= p0 add
149790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]     ; unoffset
149890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax], mm6        ; write back p0 row (rax is still -pitch here)
149990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
150090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rsi,8                 ; advance to the next 8 pixels along the edge
150190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; restore rax = +pitch for the next iteration
150290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx                   ; one fewer iteration remaining
150390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jnz         nexts8_h
150490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
150590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog (undoes the prolog in reverse order)
150690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
150790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
150890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
150990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
151090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
151190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
151290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_vertical_edge_mmx
151590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
151690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
151790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
151890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
151990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
152090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
152190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
152290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
152390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_simple_vertical_edge_mmx)
152490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_vertical_edge_mmx):
152590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
152690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
152790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
152890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
152990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
153090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
153190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
153290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
153490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 32      ; reserve 32 bytes
153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi, [rsi + rax*4- 2];  ;
154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, dword ptr arg(5) ;count
154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubernexts8_v:
154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax];
154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
157290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate mask
158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm0                             ; p1
158890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm3                             ; q1
158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                             ; q1-=p1
159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm3                             ; p1-=q1
159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm6,        mm7                             ; abs(p1-q1)
159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm6,        [tfe GLOBAL]                    ; set lsb of each byte to zero
159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm6,        1                               ; abs(p1-q1)/2
159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1                             ; p0
159690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        mm2                             ; q0
159790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm2                             ; p0-=q0
159990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4,        mm1                             ; q0-=p0
160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm4                             ; abs(p0 - q0)
160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
160390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
160590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(2) ;flimit                          ; get flimit
160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx]
160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(3)                          ; get limit
160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rdx]
160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm7,        mm7                             ; flimit*2 (less than 255)
161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm7,        mm6                             ; flimit * 2 + limit (less than 255)
161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        mm7
161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5,        mm7                             ; mm5 = mask
161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0,         mm0
161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1,         mm3
161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        [t80 GLOBAL]                    ; p1 offset to convert to signed values
162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3,        [t80 GLOBAL]                    ; q1 offset to convert to signed values
162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0,        mm3                             ; p1 - q1
162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm1                             ; p0
162590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm2                             ; q0
162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6,        [t80 GLOBAL]                    ; offset to convert to signed values
162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        [t80 GLOBAL]                    ; offset to convert to signed values
163090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm7                             ; offseted ; q0
163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm7,        mm6                             ; q0 - p0
163390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
163490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm5,        mm0                             ; mask filter values we don't care about
163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm5,        [t4 GLOBAL]                     ;  3* (q0 - p0) + (p1 - q1) + 4
164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm5                             ; get a copy of filters
164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0,        8                               ; shift left 8
164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,        3                               ; arithmetic shift right 11
164590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,        8
164690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm5                             ; get a copy of filters
164890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7,        11                              ; arithmetic shift right 11
164990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm7,        8                               ; shift left 8 to put it back
165090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                             ; put the two together to get result
165290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3,        mm0                             ; q0-= q0sz add
165490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3,        [t80 GLOBAL]                    ; unoffset
165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; now do +3 side
165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm5, [t1s GLOBAL]                           ; +3 instead of +4
165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm5                                    ; get a copy of filters
166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0, 8                                      ; shift left 8
166190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 3                                      ; arithmetic shift right 11
166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0, 8
166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5, 11                                     ; arithmetic shift right 11
166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm5, 8                                      ; shift left 8 to put it back
166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0, mm5                                    ; put the two together to get result
166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm0                                    ; p0+= p0 add
166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, [t80 GLOBAL]                           ; unoffset
167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        t0
167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        t1
167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm0 = 70 60 50 40 30 20 10 00
167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = 71 61 51 41 31 21 11 01
167790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3 = 72 62 52 42 32 22 12 02
167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm4 = 73 63 53 43 33 23 13 03
167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; transpose back to write out
168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                         ;
168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm3                         ;
168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
169890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm6,        32                          ; 33 32 31 30
170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi],      mm1                         ; write 43 42 41 40
170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi + rax], mm6                        ; write 33 32 31 30
170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm1,        32                          ; 53 52 51 50
171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi],      mm1                         ; write out 53 52 51 50
171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm5,        32                          ; 73 72 71 70
171690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
171890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi+rax*8]                 ; next 8
172090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
172190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
172290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jnz         nexts8_v
172390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
172490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 32
172590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
172690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
172790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
172890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
172990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
173090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
173190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
173290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
173390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
173790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;                  int y_stride,
173890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;                  loop_filter_info *lfi)
173990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;{
174090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
174190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
174290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
174390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
174490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
174590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;}
174690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
174790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas HuberSECTION_RODATA                                          ; constant table referenced as [name GLOBAL] by the filter code in this file
174890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16                                                ; every constant below starts on a 16-byte boundary
174990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubertfe:                                                    ; 0xfe in each of 8 bytes (one 64-bit MMX operand)
175090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0xfe                                     ; NOTE(review): presumably a per-byte mask clearing bit 0 - confirm at use sites
175190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
175290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert80:                                                    ; 0x80 in each of 8 bytes
175390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x80                                     ; pxor'ed into the filtered q0/p0 bytes to "unoffset" them before write-out
175490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
175590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert1s:                                                    ; 0x01 in each of 8 bytes
175690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x01                                     ; psubsb'ed from the "+4" filter value to form the "+3" variant
175790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
175890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert3:                                                     ; 0x03 in each of 8 bytes
175990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x03                                     ; NOTE(review): use sites are outside this part of the file - presumably a "+3" rounding term
176090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
176190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert4:                                                     ; 0x04 in each of 8 bytes
176290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x04                                     ; paddsb'ed to the masked filter value as the "+4" term
176390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
176490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberones:                                                   ; 0x0001 in each of 4 words (16-bit lanes, not bytes)
176590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x0001
176690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
176790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers27:                                                    ; 0x1b00 in each of 4 words, i.e. 27 << 8 (27 in the high byte)
176890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x1b00                                   ; NOTE(review): presumably a word multiplier for a 27/128-style filter tap - confirm at use sites
176990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers18:                                                    ; 0x1200 in each of 4 words, i.e. 18 << 8 (18 in the high byte)
177190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x1200
177290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers9:                                                     ; 0x0900 in each of 4 words, i.e. 9 << 8 (9 in the high byte)
177490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x0900
177590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers63:                                                    ; 0x003f in each of 4 words, i.e. 63 in the low byte (no << 8, unlike s27/s18/s9)
177790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x003f                                   ; NOTE(review): presumably a rounding addend - confirm at use sites
1778