;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
;
; Filters across a horizontal edge, 8 pixels (one MMX register) per loop
; iteration, `count` iterations, advancing src_ptr by 8 bytes each time.
;
; Row layout relative to src_ptr (pitch = src_pixel_step, in bytes):
;     src_ptr - 4*pitch : p3        src_ptr           : q0
;     src_ptr - 3*pitch : p2        src_ptr + 1*pitch : q1
;     src_ptr - 2*pitch : p1        src_ptr + 2*pitch : q2
;     src_ptr - 1*pitch : p0        src_ptr + 3*pitch : q3
; Rows p1, p0, q0 and q1 are written back; p3, p2, q2, q3 are only read.
;
; blimit, limit and thresh are each read as 8 bytes (one value per lane).
;
; Register roles inside the loop:
;     rsi = address of the q0 row for the current 8 pixels
;     rdi = rsi + pitch (q1 row)
;     rax = pitch; negated mid-iteration to reach the p rows and negated
;           back before the next iteration
;     rcx = iterations remaining
;     rdx = scratch pointer used to load limit / blimit / thresh
;     t0  = abs(q1 - q0), t1 = abs(p1 - p0) (stack temporaries)
;
; Notes:
;   * The loop is bottom-tested (dec rcx / jnz), so count must be >= 1.
;   * No emms is executed here; the MMX state is left as-is on return.
;   * tfe, t80, t3, t4 and ones are constants reached through GLOBAL()
;     and defined outside this block; the comments below assume they are
;     0xfe / 0x80 / 3 / 4 per byte and 1 per word, as their names and
;     uses suggest -- TODO confirm against their definitions.
;   * SHADOW_ARGS_TO_STACK / GET_GOT / ALIGN_STACK / arg() / RESTORE_GOT /
;     UNSHADOW_ARGS come from vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0) ;src_ptr
        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; row pitch in bytes

        movsxd      rcx, dword ptr arg(5) ;count
.next8_h:
        mov         rdx, arg(3) ;limit
        movq        mm7, [rdx]            ; mm7 = limit
        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions
        ; mm1 accumulates (by OR) the saturated amount by which each
        ; neighbouring-row difference exceeds limit; it stays zero only
        ; in lanes where every test passes.
        movq        mm2, [rdi+2*rax]      ; q3
        movq        mm1, [rsi+2*rax]      ; q2
        movq        mm6, mm1              ; q2
        psubusb     mm1, mm2              ; q2-=q3
        psubusb     mm2, mm6              ; q3-=q2
        por         mm1, mm2              ; abs(q3-q2)
        psubusb     mm1, mm7              ; nonzero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]        ; q1
        movq        mm3, mm4              ; q1
        psubusb     mm4, mm6              ; q1-=q2
        psubusb     mm6, mm3              ; q2-=q1
        por         mm4, mm6              ; abs(q2-q1)

        psubusb     mm4, mm7              ; nonzero where abs(q2-q1) > limit
        por         mm1, mm4

        movq        mm4, [rsi]            ; q0
        movq        mm0, mm4              ; q0 (kept in mm0 for the filter stage)
        psubusb     mm4, mm3              ; q0-=q1
        psubusb     mm3, mm0              ; q1-=q0
        por         mm4, mm3              ; abs(q0-q1)
        movq        t0, mm4               ; save to t0
        psubusb     mm4, mm7              ; nonzero where abs(q0-q1) > limit
        por         mm1, mm4


        neg         rax                   ; negate pitch to deal with above border

        movq        mm2, [rsi+4*rax]      ; p3
        movq        mm4, [rdi+4*rax]      ; p2
        movq        mm5, mm4              ; p2
        psubusb     mm4, mm2              ; p2-=p3
        psubusb     mm2, mm5              ; p3-=p2
        por         mm4, mm2              ; abs(p3 - p2)
        psubusb     mm4, mm7              ; nonzero where abs(p3-p2) > limit
        por         mm1, mm4


        movq        mm4, [rsi+2*rax]      ; p1
        movq        mm3, mm4              ; p1
        psubusb     mm4, mm5              ; p1-=p2
        psubusb     mm5, mm3              ; p2-=p1
        por         mm4, mm5              ; abs(p2 - p1)
        psubusb     mm4, mm7              ; nonzero where abs(p2-p1) > limit
        por         mm1, mm4

        movq        mm2, mm3              ; p1

        movq        mm4, [rsi+rax]        ; p0
        movq        mm5, mm4              ; p0
        psubusb     mm4, mm3              ; p0-=p1
        psubusb     mm3, mm5              ; p1-=p0
        por         mm4, mm3              ; abs(p1 - p0)
        movq        t1, mm4               ; save to t1
        psubusb     mm4, mm7              ; nonzero where abs(p1-p0) > limit
        por         mm1, mm4

        movq        mm3, [rdi]            ; q1
        movq        mm4, mm3              ; q1
        psubusb     mm3, mm2              ; q1-=p1
        psubusb     mm2, mm4              ; p1-=q1
        por         mm2, mm3              ; abs(p1-q1)
        ; psrlw shifts whole words, so clear each byte's lsb first to keep
        ; the high byte's low bit from sliding into the low byte's msb.
        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
        psrlw       mm2, 1                ; abs(p1-q1)/2

        movq        mm6, mm5              ; p0 (kept in mm6 for the filter stage)
        movq        mm3, [rsi]            ; q0
        psubusb     mm5, mm3              ; p0-=q0
        psubusb     mm3, mm6              ; q0-=p0
        por         mm5, mm3              ; abs(p0 - q0)
        paddusb     mm5, mm5              ; abs(p0-q0)*2
        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2) ;blimit           ; get blimit
        movq        mm7, [rdx]            ; blimit

        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1,    mm5
        pxor        mm5,    mm5
        pcmpeqb     mm1,    mm5           ; mask mm1: 0xff where no limit was exceeded

        ; calculate high edge variance
        mov         rdx, arg(4) ;thresh           ; get thresh
        movq        mm7, [rdx]            ; mm7 = thresh
        movq        mm4, t0               ; get abs (q1 - q0)
        psubusb     mm4, mm7              ; nonzero where abs(q1 - q0) > thresh
        movq        mm3, t1               ; get abs (p1 - p0)
        psubusb     mm3, mm7              ; nonzero where abs(p1 - p0) > thresh
        ; Combine with OR, not a wrapping byte add: two saturated
        ; differences that sum to exactly 256 would wrap to 0 and wrongly
        ; report "no high edge variance" for that lane.
        por         mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     mm4,        mm5       ; 0xff where neither exceeds thresh

        pcmpeqb     mm5,        mm5       ; mm5 = all ones
        pxor        mm4,        mm5       ; invert: mm4 = hev mask (0xff where hev)


        ; start work on filters
        movq        mm2, [rsi+2*rax]      ; p1
        movq        mm7, [rdi]            ; q1
        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
        psubsb      mm2, mm7              ; p1 - q1
        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]    ; p0 offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]    ; q0 offset to convert to signed values
        movq        mm3, mm0              ; q0
        psubsb      mm0, mm6              ; q0 - p0
        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                  ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; MMX has no per-byte arithmetic shift: unpack each byte into the
        ; high half of a word, shift by 8 + 3 with sign, then repack.
        pxor        mm0, mm0             ;
        pxor        mm5, mm5
        punpcklbw   mm0, mm2            ; low 4 bytes  -> high bytes of words
        punpckhbw   mm5, mm2            ; high 4 bytes -> high bytes of words
        psraw       mm0, 11             ; (8 to undo the unpack) + 3
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0              ; 0
        movq        mm5, mm1              ; abcdefgh
        punpcklbw   mm0, mm1              ; e0f0g0h0
        psraw       mm0, 11               ; sign extended shift right by 3
        pxor        mm1, mm1              ; 0
        punpckhbw   mm1, mm5              ; a0b0c0d0
        psraw       mm1, 11               ; sign extended shift right by 3
        movq        mm5, mm0              ; save results

        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [GLOBAL(ones)]
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1              ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >>3) + 1) >>1
        pandn       mm4, mm5              ; outer-tap adjustment, kept only where hev is NOT set

        paddsb      mm6, mm2              ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+rax], mm6        ; write back

        movq        mm6, [rsi+2*rax]      ; p1
        pxor        mm6, [GLOBAL(t80)]    ; reoffset
        paddsb      mm6, mm4              ; p1+= p1 add
        pxor        mm6, [GLOBAL(t80)]    ; unoffset
        movq        [rsi+2*rax], mm6      ; write back

        psubsb      mm3, mm0              ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]    ; unoffset
        movq        [rsi], mm3            ; write back

        psubsb      mm7, mm4              ; q1-= q1 add
        pxor        mm7, [GLOBAL(t80)]    ; unoffset
        movq        [rdi], mm7            ; write back

        add         rsi,8                 ; next 8 pixels along the edge
        neg         rax                   ; restore positive pitch for the next pass
        dec         rcx
        jnz         .next8_h

    add rsp, 32                           ; release t0/t1
    pop rsp                               ; presumably the rsp saved by ALIGN_STACK -- confirm in x86_abi_support.asm
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_vertical_edge_mmx
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
2311b362b15af34006e6a11974088a46d42b903418eJohann;    const char *blimit,
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
2361b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_vertical_edge_mmx):
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 64      ; reserve 64 bytes
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi,        arg(0) ;src_ptr
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi + rax*4 - 4]
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx,        dword ptr arg(5) ;count
2581b362b15af34006e6a11974088a46d42b903418eJohann.next8_v:
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi,        rax
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;transpose
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 36 35 25 34 24
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q2-q3
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q3-q2
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm6                         ; q1-q2
31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm5                         ; q2-q1
31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm6,        mm3                         ; mm6=abs(q2-q1)
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+24],   mm5                         ; save q1
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+16],   mm0                         ; save q0
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm0                         ; p2-p3
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm1                         ; p3-p2
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm2                         ; mm0=abs(p3-p2)
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+8],    mm3                         ; save p0
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx],      mm2                         ; save p1
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; mm5 = p1
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm1                         ; p1-p2
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm5                         ; p2-p1
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm2                         ; mm1=abs(p2-p1)
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(3) ;limit
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rdx]                       ; mm4 = limit
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4
36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm4
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm4
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm4
36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm6
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm1
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm5                         ; p1
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm3                         ; mm3=mm7=p0
37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm5                         ; p0 - p1
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm3                         ; p1 - p0
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm7                         ; abs(p1-p0)
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0,         mm5                         ; save abs(p1-p0)
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm4
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm5                         ; mm0=mask
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        [rdx+16]                    ; mm5=q0
38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx+24]                    ; mm7=q1
38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm5                         ; mm6=q0
38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm7                         ; q1
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q0-q1
39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q1-q0
39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5                         ; abs(q1-q0)
39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1,         mm7                         ; save abs(q1-q0)
39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4
39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ; mask
39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; q1
40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm1                         ; q1-=p1
40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm2                         ; p1-=q1
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm1                         ; abs(p1-q1)
404538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,        1                           ; abs(p1-q1)/2
40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4071b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdx,        arg(2) ;blimit                      ;
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4091b362b15af34006e6a11974088a46d42b903418eJohann        movq        mm4,        [rdx]                       ;blimit
41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm3                         ; mm1=mm3=p0
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; mm7=mm6=q0
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm7                         ; p0-q0
41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm3                         ; q0-p0
41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm7                         ; abs(q0-p0)
41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4201b362b15af34006e6a11974088a46d42b903418eJohann        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm0;                        ; mask
42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0
42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,        mm0
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(4) ;thresh            ; get thresh
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx]
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;
43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        t0              ; get abs (q1 - q0)
43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4,        mm7
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        t1              ; get abs (p1 - p0)
43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm7
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm0
43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm0,        mm0
44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm0
44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        [rdx]           ; p1
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx+24]        ; q1
44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rdx+8]         ; p0
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        [rdx+16]        ; q0
45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
453538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
454538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2,        mm7             ; p1 - q1
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
459538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
460538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm0             ; q0
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0,        mm6             ; q0 - p0
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand       mm1,        mm2              ; mask filter values we don't care about
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1
472538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
474538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0          ;
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5,        mm5
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm2         ;
47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5,        mm2         ;
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,        11              ;
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5,        11
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0,        mm5
48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0           ; 0
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1           ; abcdefgh
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm1           ; e0f0g0h0
49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,        11                ; sign extended shift right by 3
49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1,        mm1           ; 0
49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,        mm5           ; a0b0c0d0
49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1,        11                ; sign extended shift right by 3
49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm0              ; save results
49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
501538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsw      mm5,        [GLOBAL(ones)]
50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
503538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsw      mm1,        [GLOBAL(ones)]
50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4,        mm5             ; high edge variance additive
51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6,        mm2             ; p0+= p0 add
512538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6=p0                               ;
51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        [rdx]           ; p1
516538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm1,        mm4                 ; p1+= p1 add
519538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = p0 mm1 = p1
52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3,        mm0                 ; q0-= q0 add
523538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3 = q0
52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm7,        mm4                 ; q1-= q1 add
527538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm7 = q1
52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
530b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian        ; transpose and write back
53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 =    72 62 52 42 32 22 12 02
53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 =    73 63 53 43 33 23 13 03
53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3 =    74 64 54 44 34 24 14 04
53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm7 =    75 65 55 45 35 25 15 05
53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
55390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm2 = 15 14 13 12 05 04 03 02
55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = 35 34 33 32 25 24 23 22
55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = 55 54 53 52 45 44 43 42
55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm5 = 75 74 73 72 65 64 63 62
55990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*4+2], mm2
56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm2,        32
56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax*4+2], mm2
56690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*2+2], mm6
56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm6,        32
56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax+2],mm6
57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+2],    mm1
57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm1,        32
57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+2],    mm1
57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax+2],mm5
57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm5,        32
57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax*2+2], mm5
58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi+rax*8]
58390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
5841b362b15af34006e6a11974088a46d42b903418eJohann        jnz         .next8_v
58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 64
58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
59590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_horizontal_edge_mmx
59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
6011b362b15af34006e6a11974088a46d42b903418eJohann;    const char *blimit,
60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
6061b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_horizontal_edge_mmx):
60890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
61190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 32      ; reserve 32 bytes
61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, dword ptr arg(5) ;count
6251b362b15af34006e6a11974088a46d42b903418eJohann.next8_mbh:
62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(3) ;limit
62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]
62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi, rax
63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate breakout conditions
63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdi+2*rax]      ; q3
63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, [rsi+2*rax]      ; q2
63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm1              ; q2
63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm2              ; q2-=q3
63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm6              ; q3-=q2
63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1, mm2              ; abs(q3-q2)
63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm7
64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+rax]        ; q1
64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm4              ; q1
64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm6              ; q1-=q2
64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6, mm3              ; q2-=q1
64790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm6              ; abs(q2-q1)
64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
65090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask,      mm3=q1, mm7 = limit
65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi]            ; q0
65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm4              ; q0
65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm3              ; q0-=q1
65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm0              ; q1-=q0
65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm3              ; abs(q0-q1)
65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0, mm4               ; save to t0
66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; negate pitch to deal with above border
66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rsi+4*rax]      ; p3
66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rdi+4*rax]      ; p2
67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm4              ; p2
67190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm2              ; p2-=p3
67290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm5              ; p3-=p2
67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm2              ; abs(p3 - p2)
67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+2*rax]      ; p1
67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm4              ; p1
68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm5              ; p1-=p2
68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; p2-=p1
68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm5              ; abs(p2 - p1)
68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm3              ; p1
68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi+rax]        ; p0
69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm4              ; p0
69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm3              ; p0-=p1
69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm5              ; p1-=p0
69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4, mm3              ; abs(p1 - p0)
69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1, mm4               ; save to t1
69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por        mm1, mm4
69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm5 = p0
70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi]            ; q1
70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, mm3              ; q1
70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm2              ; q1-=p1
70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2, mm4              ; p1-=q1
70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm2, mm3              ; abs(p1-q1)
706538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2, 1                ; abs(p1-q1)/2
70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm5              ; p0
71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0
71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm3              ; p0-=q0
71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm6              ; q0-=p0
71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5, mm3              ; abs(p0 - q0)
71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm5              ; abs(p0-q0)*2
71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7171b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdx, arg(2) ;blimit           ; get blimit
7181b362b15af34006e6a11974088a46d42b903418eJohann        movq        mm7, [rdx]            ; blimit
71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7201b362b15af34006e6a11974088a46d42b903418eJohann        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,    mm5
72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5,    mm5
72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,    mm5           ; mask mm1
72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7251b362b15af34006e6a11974088a46d42b903418eJohann        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = p0,
72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(4) ;thresh           ; get thresh
73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx]            ;
73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, t0               ; get abs (q1 - q0)
73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm7
73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, t1               ; get abs (p1 - p0)
73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3, mm7
73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm5
73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5,        mm5
74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm5
74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = p0, mm4=hev
74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rsi+2*rax]      ; p1
74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdi]            ; q1
749538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
750538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1
75290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
753538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
754538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
75590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0
75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0
75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm1, mm2              ; mask filter values we don't care about
76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; vp8_filter
76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2       ;
768538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm5,        [GLOBAL(t3)];
76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm5              ; e0f0g0h0
77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm5              ; a0b0c0d0
77690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
77790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm0              ; Filter2
78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
781538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm2              ; e0f0g0h0
78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm2              ; a0b0c0d0
78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm0=Filter1, mm1=vp8_filter, mm3=qs0, mm4=hev, mm5=Filter2, mm6=ps0
79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; qs0 =qs0 - filter1
79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm5              ; ps0 =ps0 + Filter2
79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; vp8_filter &= ~hev;
79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Filter2 = vp8_filter;
79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4, mm1              ; vp8_filter&=~hev
79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3=qs0, mm4=filter2, mm6=ps0
80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs0 - u);
80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq0 = s^0x80;
80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps0 + u);
80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op0 = s^0x80;
80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0
80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
814538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm1, [GLOBAL(s27)]
815538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm2, [GLOBAL(s27)]
816538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm1, [GLOBAL(s63)]
817538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm2, [GLOBAL(s63)]
81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
825538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
826538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax], mm6
82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi],     mm3
82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 2/7th difference across boundary
83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs1 - u);
83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq1 = s^0x80;
83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps1 + u);
83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op1 = s^0x80;
83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
840538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm1, [GLOBAL(s18)]
841538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm2, [GLOBAL(s18)]
842538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm1, [GLOBAL(s63)]
843538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm2, [GLOBAL(s63)]
84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi]
84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rsi+rax*2]       ; p1
85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
851538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
852538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
85490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
85590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
85690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
857538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
858538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
85990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi], mm3
86090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*2], mm6
86190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 1/7th difference across boundary
86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
86490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs2 - u);
86590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq2 = s^0x80;
86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps2 + u);
86790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op2 = s^0x80;
86890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
87090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
87190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
872538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm1, [GLOBAL(s9)]
873538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm2, [GLOBAL(s9)]
874538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm1, [GLOBAL(s63)]
875538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm2, [GLOBAL(s63)]
87690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
87790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
87890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
87990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdi+rax*4]
88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
88390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdi+rax  ]
88490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
885538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
886538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
88790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
88990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
89090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
891538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
892538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
89390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax  ], mm3
89490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
89590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*4], mm6
89690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;EARLY_BREAK_OUT:
89890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
89990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rsi,8
90090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
9011b362b15af34006e6a11974088a46d42b903418eJohann        jnz         .next8_mbh
90290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
90390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 32
90490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
90590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
90690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
90790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
90890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
90990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
91090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
91190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
91290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_mbloop_filter_vertical_edge_mmx
91590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
91690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
9181b362b15af34006e6a11974088a46d42b903418eJohann;    const char *blimit,
91990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
92190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
92290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
9231b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
92490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_mbloop_filter_vertical_edge_mmx):
92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
92690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
92890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
92990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
93190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
93390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 96      ; reserve 96 bytes
93590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
93790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];
93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
93990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi,        arg(0) ;src_ptr
94090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi + rax*4 - 4]
94390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx,        dword ptr arg(5) ;count
9451b362b15af34006e6a11974088a46d42b903418eJohann.next8_mbv:
94690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;transpose
94990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
95190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64
95490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
95690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50
95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
95990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
96090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
96290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40
96390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
96690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
96990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
97190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
97490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
97690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
97790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
97990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24
98090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20
98290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
98490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
98590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
98890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
98990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
99090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
99390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
99690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
99890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+56],   mm7
99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q2-q3
100090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+48],   mm6
100390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q3-q2
100490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
100690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
100790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
100990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
101090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm6                         ; q1-q2
101390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm5                         ; q2-q1
101590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm6,        mm3                         ; mm6=abs(q2-q1)
101690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+40],   mm5                         ; save q1
101890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+32],   mm0                         ; save q0
101990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
102190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
102290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
102490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
102590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
102790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
102890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
103090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
103190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx],      mm0                         ; save p3
103390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+8],    mm1                         ; save p2
103490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
103690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm0                         ; p2-p3
103790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm1                         ; p3-p2
103990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm2                         ; mm0=abs(p3-p2)
104090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
104290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
104390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+24],   mm3                         ; save p0
104690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+16],   mm2                         ; save p1
104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; mm5 = p1
104990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm2,        mm1                         ; p1-p2
105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm5                         ; p2-p1
105290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm2                         ; mm1=abs(p2-p1)
105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(3) ;limit
105590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        [rdx]                       ; mm4 = limit
105790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit
105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
105990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
106090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit
106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm6                         ; or
106490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm1                         ;
106690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
106890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm5                         ; p1
106990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm3                         ; mm3=mm7=p0
107190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm5                         ; p0 - p1
107290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm3                         ; p1 - p0
107490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm7                         ; abs(p1-p0)
107590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0,         mm5                         ; save abs(p1-p0)
107790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
108090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm5                         ; mm0=mask
108190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        [rdx+32]                    ; mm5=q0
108390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx+40]                    ; mm7=q1
108490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm5                         ; mm6=q0
108690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm7                         ; q1
108790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm7                         ; q0-q1
108890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                         ; q1-q0
109090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm7,        mm5                         ; abs(q1-q0)
109190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1,         mm7                         ; save abs(q1-q0)
109390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit
109490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                         ; mask
109690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2                         ; q1
109890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm1                         ; q1-=p1
109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm2                         ; p1-=q1
110090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm1                         ; abs(p1-q1)
1101538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
110290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,        1                           ; abs(p1-q1)/2
110390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11041b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdx,        arg(2) ;blimit                      ;
110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11061b362b15af34006e6a11974088a46d42b903418eJohann        movq        mm4,        [rdx]                       ;blimit
110790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm3                         ; mm1=mm3=p0
110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
110990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm6                         ; mm7=mm6=q0
111090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1,        mm7                         ; p0-q0
111190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm3                         ; q0-p0
111390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm7                         ; abs(q0-p0)
111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
111590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
111690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11171b362b15af34006e6a11974088a46d42b903418eJohann        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
111890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,        mm0;                        ; mask
111990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0,        mm0
112190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm1,        mm0
112290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate high edge variance
112490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(4) ;thresh            ; get thresh
112590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx]
112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ;
112790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        t0              ; get abs (p1 - p0)
112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4,        mm7             ; abs(p1 - p0) > thresh
112990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        t1              ; get abs (q1 - q0)
113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm3,        mm7             ; abs(q1 - q0) > thresh
113290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
113490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm4,        mm0
113590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm0,        mm0
113790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm4,        mm0
113890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
114390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        srct
114490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
114690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, [rdx+16]         ; p1
114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, [rdx+40]         ; q1
1148538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
1149538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
115090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1
115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdx+24]         ; p0
115390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, [rdx+32]         ; q0
1154538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
1155538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
115690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; q0
115890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0
115990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
116090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
116190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
116290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand       mm1, mm2           ; mask filter values we don't care about
116390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
116590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; vp8_filter
116690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
116790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm2       ;
1169538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm5,        [GLOBAL(t3)];
117090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
117290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
117390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm5              ; e0f0g0h0
117590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
117690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm5              ; a0b0c0d0
117790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
117890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter2 >>=3;
117990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, mm0              ; Filter2
118190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1182538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
118390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0              ; 0
118490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7              ; 0
118590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm2              ; e0f0g0h0
118790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 11               ; sign extended shift right by 3
118890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm7, mm2              ; a0b0c0d0
118990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7, 11               ; sign extended shift right by 3
119090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm0, mm7              ; Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3
119190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm0=Filter1, mm1=vp8_filter, mm3=qs0, mm4=hev, mm5=Filter2, mm6=ps0
119390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; qs0 = qs0 - Filter1
119490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm5              ; ps0 = ps0 + Filter2
119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; vp8_filter &= ~hev;
119890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Filter2 = vp8_filter;
119990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pandn       mm4, mm1              ; vp8_filter&=~hev
120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3=qs0, mm4=filter2, mm6=ps0
120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
120590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs0 - u);
120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq0 = s^0x80;
120790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps0 + u);
120890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op0 = s^0x80;
120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm0, mm0
121090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
121390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
121490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
1215538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm1, [GLOBAL(s27)]
1216538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm2, [GLOBAL(s27)]
1217538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm1, [GLOBAL(s63)]
1218538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm2, [GLOBAL(s63)]
121990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
122090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
122190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
122290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
122390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
122490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
122590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1226538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
1227538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
122890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+24], mm6
122990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx+32], mm3
123090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
123190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 2/7th difference across boundary
123290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
123390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs1 - u);
123490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq1 = s^0x80;
123590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps1 + u);
123690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op1 = s^0x80;
123790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
123890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
123990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
124090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
1241538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm1, [GLOBAL(s18)]
1242538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm2, [GLOBAL(s18)]
1243538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm1, [GLOBAL(s63)]
1244538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm2, [GLOBAL(s63)]
124590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
124690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
124790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
124890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
124990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdx + 40]
125090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdx + 16]       ; p1
1251538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
1252538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
125390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
125490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
125590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
125690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1257538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
1258538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
125990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx + 40], mm3
126090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdx + 16], mm6
126190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
126290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; roughly 1/7th difference across boundary
126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
126490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(qs2 - u);
126590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *oq2 = s^0x80;
126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; s = vp8_signed_char_clamp(ps2 + u);
126790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; *op2 = s^0x80;
126890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm1, mm1
126990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm2, mm2
127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm4
127190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2, mm4
1272538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm1, [GLOBAL(s9)]
1273538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      mm2, [GLOBAL(s9)]
1274538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm1, [GLOBAL(s63)]
1275538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       mm2, [GLOBAL(s63)]
127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 7
127790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2, 7
127890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packsswb    mm1, mm2
127990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, [rdx+ 8]
128190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdx+48]
128290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1283538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]
1284538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]
128590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm1
128790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm1
128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1289538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
1290538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 16 06
129190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1292b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian        ; transpose and write back
129390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    mm0                 ; mm1 = 70 60 50 40 30 20 10 00
129590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
129790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,    mm6                 ; mm1 = 71 70 61 60 51 50 41 40
129890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
130090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm2                 ; mm6 = 72 62 52 42 32 22 12 02
130190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
130390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm6,    [rdx+24]            ; mm6 = 73 72 63 62 53 52 43 42
130490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00
130790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
130990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40
131090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
131290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60
131390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
131590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04
131690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 16 06
131890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06
131990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
132190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04
132290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
132490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00
132590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
132790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10
132890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*4], mm0            ; write out
133090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*4], mm6            ; write out
133190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
133390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 21 20
133490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
133690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*2], mm0            ; write out
133790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*2], mm5            ; write out
133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
134090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 54 54 45 44
134290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46
134390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 54 54 45 44
134590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44
134690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
134890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    mm1                 ; mm0=  53 52 51 50 43 42 41 40
134990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm4                 ; mm4 = 73 72 71 70 63 62 61 60
135190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40
135290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
135490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi],  mm0                 ; write out
135590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi],  mm1                 ; write out
135790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
135890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
136090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 60
136190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax*2], mm3
136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rdi+rax*2], mm4
136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi+rax*8]
136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13681b362b15af34006e6a11974088a46d42b903418eJohann        jnz         .next8_mbv
136990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 96
137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
137990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
138090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
138190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_horizontal_edge_mmx
138290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
138390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
138490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
13851b362b15af34006e6a11974088a46d42b903418eJohann;    const char *blimit
138690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
;
; Applies the "simple" loop filter to the horizontal edge that lies between
; the row at src_ptr - src_pixel_step (p0) and the row at src_ptr (q0).
; The loop runs twice and handles 8 bytes per pass, so 16 consecutive pixels
; along the edge are processed.
;
;   src_ptr        - address of the first q0 pixel
;   src_pixel_step - sign-extended and used as the byte offset between rows
;   blimit         - pointer to 8 bytes of edge limit (one per byte lane,
;                    loaded with a single movq)
;
; Per byte lane (adds/subtracts saturate as noted on each instruction):
;   mask   = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0
;   filter = ((p1 - q1) + 3 * (q0 - p0)) & mask
;   q0     = q0 - ((filter + 4) >> 3)
;   p0     = p0 + ((filter + 3) >> 3)
; Only the p0 and q0 rows are written back; p1 and q1 are read only.
;
; Writes rax, rcx, rdx and mm0-mm7; rsi and rdi are pushed in the prolog and
; popped in the epilog.
; NOTE(review): no emms is executed in this routine; presumably the caller
; is responsible for clearing MMX state - confirm.
; NOTE(review): tfe/t80/t4/t1s are constants defined outside this block; the
; lane math above assumes 0xfe/0x80/0x04/0x01 in every byte - confirm.
13871b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
138890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_horizontal_edge_mmx):
138990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
139090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
13911b362b15af34006e6a11974088a46d42b903418eJohann    SHADOW_ARGS_TO_STACK 3
139290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
139390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
139490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
139590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
139690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr -> rsi (q0 row)
139890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step, sign-extended; byte offset between rows
139990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14001b362b15af34006e6a11974088a46d42b903418eJohann        mov         rcx, 2                ; loop count: 2 passes of 8 bytes each
14011b362b15af34006e6a11974088a46d42b903418eJohann.nexts8_h:
14021b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdx, arg(2) ;blimit           ; rdx = blimit pointer
140390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, [rdx]            ; mm3 = 8 bytes of blimit (reloaded every pass: mm3 is reused below)
140490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
140690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi, rax              ; rdi = rsi + step (q1 row)
140790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; rax = -step, to address the rows before rsi
140890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate mask
141090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, [rsi+2*rax]      ; p1
141190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, [rdi]            ; q1
141290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2, mm1              ; mm2 = p1, kept for the filter
141390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7, mm0              ; mm7 = q1, kept for the filter
141490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, mm0              ; mm4 = q1, scratch copy
141590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm0, mm1              ; q1 - p1, unsigned saturating (0 where p1 >= q1)
141690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm1, mm4              ; p1 - q1, unsigned saturating (0 where q1 >= p1)
141790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1, mm0              ; abs(p1-q1): one of the two differences is always 0
1418538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero so the word shift cannot leak a bit into the next byte
141990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1, 1                ; abs(p1-q1)/2
142090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
142190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5, [rsi+rax]        ; p0
142290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4, [rsi]            ; q0
142390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm4              ; mm0 = q0, kept for the filter
142490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6, mm5              ; mm6 = p0, kept for the filter
142590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5, mm4              ; p0 - q0, unsigned saturating
142690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4, mm6              ; q0 - p0, unsigned saturating
142790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5, mm4              ; abs(p0 - q0)
142890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm5              ; abs(p0-q0)*2, unsigned saturating
142990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2, unsigned saturating
143090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14311b362b15af34006e6a11974088a46d42b903418eJohann        psubusb     mm5, mm3              ; nonzero only where abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
143290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm3, mm3              ; mm3 = 0
143390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5, mm3              ; mm5 = mask: 0xff where the sum is <= blimit, else 0
143490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
143590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
1436538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
1437538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
143890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm2, mm7              ; p1 - q1, signed saturating
143990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1440538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]    ; p0 offset to convert to signed values
1441538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm0, [GLOBAL(t80)]    ; q0 offset to convert to signed values
144290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3, mm0              ; mm3 = signed q0, kept for the write back
144390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0, mm6              ; q0 - p0, signed saturating
144490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
144590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
144690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0), saturating at every step
144790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm5, mm2              ; zero the filter value in lanes that failed the mask
144890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
144990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; do + 4 side: q0 adjustment = (filter + 4) >> 3, computed per signed byte
1450538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
145190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
145290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm5              ; get a copy of filters (for the low byte of each word)
145390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0, 8                ; shift left 8: low byte moves into the high byte
145490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 3                ; arithmetic shift right 3, sign taken from the original low byte
145590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0, 8                ; move the result back to the low byte; high byte = 0
145690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1, mm5              ; get a copy of filters (for the high byte of each word)
145790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm1, 11               ; arithmetic shift right 11 (= 8 + 3): high byte >> 3 lands in the low byte
145890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm1, 8                ; shift left 8 to put it back; low byte = 0
145990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0, mm1              ; put the two together: mm0 = (filter + 4) >> 3 in every byte
146190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3, mm0              ; q0 -= (filter + 4) >> 3, signed saturating
1463538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3, [GLOBAL(t80)]    ; unoffset
146490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi], mm3            ; write back the q0 row
146590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; now do +3 side: p0 adjustment = (filter + 3) >> 3
1468538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4 (mm5 still holds filter + 4; presumably t1s is 1 per byte - confirm)
146990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm5              ; get a copy of filters (for the low byte of each word)
147190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0, 8                ; shift left 8: low byte moves into the high byte
147290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 3                ; arithmetic shift right 3, sign taken from the original low byte
147390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0, 8                ; move the result back to the low byte; high byte = 0
147490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5, 11               ; arithmetic shift right 11 (= 8 + 3): high byte >> 3 lands in the low byte
147590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm5, 8                ; shift left 8 to put it back; low byte = 0
147690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0, mm5              ; put the two together: mm0 = (filter + 3) >> 3 in every byte
147790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm0              ; p0 += (filter + 3) >> 3, signed saturating
1480538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]    ; unoffset
148190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [rsi+rax], mm6        ; write back the p0 row (rax = -step here)
148290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rsi,8                 ; advance 8 bytes along the edge
148490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                   ; rax = +step again, as the top of the loop expects
148590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx                   ; one pass done
14861b362b15af34006e6a11974088a46d42b903418eJohann        jnz         .nexts8_h             ; repeat for the second 8 bytes
148790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog (undoes the prolog in reverse order)
148990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
149090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
149190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
149290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
149390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
149490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
149590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_vertical_edge_mmx
149890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
149990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
150090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
15011b362b15af34006e6a11974088a46d42b903418eJohann;    const char *blimit
150290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
15031b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
150490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_vertical_edge_mmx):
150590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
150690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
15071b362b15af34006e6a11974088a46d42b903418eJohann    SHADOW_ARGS_TO_STACK 3
150890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
150990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
151090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
151190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
151290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
151490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub          rsp, 32      ; reserve 32 bytes
151590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
151690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
151790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
151990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
152090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi, [rsi + rax*4- 2];  ;
15221b362b15af34006e6a11974088a46d42b903418eJohann        mov         rcx, 2                                      ; count
15231b362b15af34006e6a11974088a46d42b903418eJohann.nexts8_v:
152490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax];
152690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
152790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
152990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
153090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
153290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
153390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate mask
156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm0                             ; p1
156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm3                             ; q1
156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm7,        mm6                             ; q1-=p1
157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm6,        mm3                             ; p1-=q1
157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm6,        mm7                             ; abs(p1-q1)
1572538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm6,        1                               ; abs(p1-q1)/2
157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1                             ; p0
157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        mm2                             ; q0
157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm5,        mm2                             ; p0-=q0
157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     mm4,        mm1                             ; q0-=p0
158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm5,        mm4                             ; abs(p0 - q0)
158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15851b362b15af34006e6a11974088a46d42b903418eJohann        mov         rdx,        arg(2) ;blimit                          ; get blimit
158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        [rdx]
158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15881b362b15af34006e6a11974088a46d42b903418eJohann        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        mm7
159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     mm5,        mm7                             ; mm5 = mask
159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t0,         mm0
159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        t1,         mm3
159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1596538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
1597538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm0,        mm3                             ; p1 - q1
160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm1                             ; p0
160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm2                             ; q0
1603538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1605538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,        mm7                             ; offseted ; q0
160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm7,        mm6                             ; q0 - p0
160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm5,        mm0                             ; mask filter values we don't care about
161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1616538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        mm5                             ; get a copy of filters
161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0,        8                               ; shift left 8
162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,        3                               ; arithmetic shift right 11
162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,        8
162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,        mm5                             ; get a copy of filters
162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm7,        11                              ; arithmetic shift right 11
162590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm7,        8                               ; shift left 8 to put it back
162690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0,        mm7                             ; put the two together to get result
162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      mm3,        mm0                             ; q0-= q0sz add
1630538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; now do +3 side
1633538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
163490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0, mm5                                    ; get a copy of filters
163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0, 8                                      ; shift left 8
163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0, 3                                      ; arithmetic shift right 11
163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0, 8
163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm5, 11                                     ; arithmetic shift right 11
164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm5, 8                                      ; shift left 8 to put it back
164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm0, mm5                                    ; put the two together to get result
164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      mm6, mm0                                    ; p0+= p0 add
1645538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
164690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        t0
164990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        t1
165090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm0 = 70 60 50 40 30 20 10 00
165290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm6 = 71 61 51 41 31 21 11 01
165390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm3 = 72 62 52 42 32 22 12 02
165490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; mm4 = 73 63 53 43 33 23 13 03
165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; transpose back to write out
165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                         ;
165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
166190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,        mm3                         ;
166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
166490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
167790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm6,        32                          ; 33 32 31 30
168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi],      mm1                         ; write 43 42 41 40
168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi + rax], mm6                        ; write 33 32 31 30
168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm1,        32                          ; 53 52 51 50
168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi],      mm1                         ; write out 53 52 51 50
169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm5,        32                          ; 73 72 71 70
169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi+rax*8]                 ; next 8
169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        dec         rcx
16981b362b15af34006e6a11974088a46d42b903418eJohann        jnz         .nexts8_v
169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 32
170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;                  int y_stride,
171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;                  loop_filter_info *lfi)
171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;{
171690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
171790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
171890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
171990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
172090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
172190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;}
172290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Read-only constant tables. Every table below is exactly 8 bytes (one 64-bit
; quadword) and is placed on a 16-byte boundary by the preceding `align 16`,
; so 8 bytes of padding separate consecutive tables.
; NOTE(review): only t80 is referenced in the code visible next to this
; section; the consumers of the other tables are elsewhere in the file --
; confirm their purpose there before changing any value.
172390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas HuberSECTION_RODATA                                          ; macro, not defined in this file; presumably from vpx_ports/x86_abi_support.asm -- confirm
172490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
172590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubertfe:                                                    ; byte mask 0xfe (all bits set except bit 0)
172690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0xfe                                     ; 8 x 0xfe
172790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
172890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert80:                                                    ; byte mask 0x80 (bit 7 only); XORed into pixels by the "unoffset" step via [GLOBAL(t80)]
172990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x80                                     ; 8 x 0x80
173090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
173190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert1s:                                                    ; the value 1 in every byte
173290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x01                                     ; 8 x 0x01
173390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
173490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert3:                                                     ; the value 3 in every byte
173590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x03                                     ; 8 x 0x03
173690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
173790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert4:                                                     ; the value 4 in every byte
173890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 db 0x04                                     ; 8 x 0x04
173990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
174090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberones:                                                   ; the value 1 in every 16-bit word (word-sized, unlike t1s)
174190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x0001                                   ; 4 x 0x0001
174290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
174390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers27:                                                    ; 27 in the high byte of every 16-bit word (0x1b00 = 27 << 8)
174490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x1b00                                   ; 4 x 0x1b00
174590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
174690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers18:                                                    ; 18 in the high byte of every 16-bit word (0x1200 = 18 << 8)
174790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x1200                                   ; 4 x 0x1200
174890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
174990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers9:                                                     ; 9 in the high byte of every 16-bit word (0x0900 = 9 << 8)
175090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x0900                                   ; 4 x 0x0900
175190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
175290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers63:                                                    ; 63 in the low byte of every 16-bit word (0x003f = 63)
175390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 0x003f                                   ; 4 x 0x003f
1754