;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; ABI helper macros used throughout this file (sym, arg, GET_GOT, GLOBAL,
; SHADOW_ARGS_TO_STACK, ...) come from this include.
%include "vpx_ports/x86_abi_support.asm"

; Bilinear tap table, defined in another translation unit.  The code in this
; file indexes it with a stride of 32 bytes per sub-pixel offset.
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
; Right-shift applied (via psraw) after the rounding constant has been added,
; i.e. a divide by 128.
%define VP8_FILTER_SHIFT  7
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter producing four outputs per row.
;
; Per row the routine reads 8 bytes at src_ptr-2 and 4 bytes at src_ptr+3,
; multiplies the widened pixels by six tap vectors, accumulates with signed
; saturation, adds the rounding constant `rd`, shifts right by
; VP8_FILTER_SHIFT, clamps to 0..255 and stores the four results as 16-bit
; words (8 bytes) at output_ptr.
;
;   vp8_filter   : six tap vectors at byte offsets 0,16,32,48,64,80; only the
;                  first four words of each are read here.
;   output_width : added to output_ptr after every row, i.e. it is used as a
;                  byte stride.
;   output_height: row count; the loop is dec/jnz, so it must be non-zero.
;   pixel_step   : not read by this routine.
;
; `rd` is defined outside this view (presumably the 0.5 rounding term for the
; 7-bit shift -- confirm at its definition).  No emms is issued here.
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        ; Taps 1..4 live in registers for the whole loop; taps 0 and 5 are
        ; used straight from memory ([rdx] and [rdx+80]).
        movq        mm1,    [rdx + 16]       ; mm1 = tap 1
        movq        mm2,    [rdx + 32]       ; mm2 = tap 2
        movq        mm6,    [rdx + 48]       ; mm6 = tap 3
        movq        mm7,    [rdx + 64]       ; mm7 = tap 4

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height
        movsxd      rax,    dword ptr arg(5) ;output_width (output stride in bytes)
        pxor        mm0,    mm0              ; mm0 = 0, used to widen bytes to words

%if ABI_IS_32BIT
        ; No spare register on 32-bit: the source stride is added from memory
        ; inside the loop.
%else
        ; Loop-invariant: load the source stride once instead of once per row.
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
%endif

.nextrow:
        ; The order of the saturating adds below is kept as-is: paddsw is not
        ; associative once saturation kicks in.
        movq        mm3,    [rsi-2]          ; mm3 = bytes p-2..p5
        movq        mm4,    mm3              ; mm4 = bytes p-2..p5
        psrlq       mm3,    8                ; drop p-2
        punpcklbw   mm3,    mm0              ; mm3 = words p-1..p2
        pmullw      mm3,    mm1              ; mm3 *= tap 1

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5
        punpckhbw   mm4,    mm0              ; mm4 = words p2..p5
        pmullw      mm4,    mm7              ; mm4 *= tap 4
        paddsw      mm3,    mm4              ; mm3 += mm4

        movq        mm4,    mm5              ; mm4 = bytes p-2..p5
        psrlq       mm5,    16               ; drop p-2, p-1
        punpcklbw   mm5,    mm0              ; mm5 = words p0..p3
        pmullw      mm5,    mm2              ; mm5 *= tap 2
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = bytes p-2..p5
        psrlq       mm4,    24               ; drop p-2..p0
        punpcklbw   mm4,    mm0              ; mm4 = words p1..p4
        pmullw      mm4,    mm6              ; mm4 *= tap 3
        paddsw      mm3,    mm4              ; mm3 += mm4

        ; outer taps (0 and 5)
        movd        mm4,    [rsi+3]          ; mm4 = bytes p3..p6
        punpcklbw   mm4,    mm0              ; mm4 = words p3..p6
        pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
        paddsw      mm3,    mm4              ; mm3 += mm4

        punpcklbw   mm5,    mm0              ; mm5 = words p-2..p1
        pmullw      mm5,    [rdx]            ; mm5 *= tap 0
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]     ; mm3 += round value
        psraw       mm3,    VP8_FILTER_SHIFT ; mm3 /= 128
        packuswb    mm3,    mm0              ; pack to bytes to clamp to 0..255 ...
        punpcklbw   mm3,    mm0              ; ... then widen back to words

        movq        [rdi],  mm3              ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
%else
        add         rsi,    r8               ; next line
%endif
        add         rdi,    rax              ; next output row

        dec         rcx                      ; decrement count
        jnz         .nextrow                 ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter: each iteration combines six rows of four 16-bit
; samples and writes four clamped 8-bit pixels to output_ptr.
;
;   pixels_per_line : added directly to the byte address of src_ptr, so it
;                     acts as the byte distance between source rows.
;   output_pitch    : byte distance between destination rows.
;   output_height   : row count for the dec/jnz loop; must be non-zero.
;   vp8_filter      : six tap vectors at byte offsets 0,16,32,48,64,80.
;   pixel_step and output_width are not read.
;
; Register roles inside the loop:
;   rsi = address of row -2 relative to the current output row
;   rdx = source row stride      rdi = destination      rax = output_pitch
;   rbx = vp8_filter             rcx = rows remaining
;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = rounding constant
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; GET_GOT was handed rbx, so GLOBAL() may rely on it: fetch the
        ; rounding constant before rbx is repurposed as the filter pointer.
        movq        mm5, [GLOBAL(rd)]
        push        rbx

        pxor        mm0, mm0                  ; zero, second operand for packuswb

        mov         rsi, arg(0) ;src_ptr
        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
        mov         rdi, arg(1) ;output_ptr
        movsxd      rax, DWORD PTR arg(2) ;output_pitch
        movsxd      rcx, DWORD PTR arg(5) ;output_height

        sub         rsi, rdx                  ; back up two rows so that
        sub         rsi, rdx                  ; [rsi] is row -2

        mov         rbx, arg(7) ;vp8_filter
        movq        mm1, [rbx + 16]           ; tap 1
        movq        mm2, [rbx + 32]           ; tap 2
        movq        mm6, [rbx + 48]           ; tap 3
        movq        mm7, [rbx + 64]           ; tap 4


.next_output_row:
        ; Accumulation order (1, 4, 2, 0, 3, 5, round) is significant because
        ; every add saturates.
        movq        mm3, [rsi + rdx]          ; row -1
        pmullw      mm3, mm1                  ;   * tap 1

        movq        mm4, [rsi + 4*rdx]        ; row 2
        pmullw      mm4, mm7                  ;   * tap 4
        paddsw      mm3, mm4

        movq        mm4, [rsi + 2*rdx]        ; row 0
        pmullw      mm4, mm2                  ;   * tap 2
        paddsw      mm3, mm4

        movq        mm4, [rsi]                ; row -2
        pmullw      mm4, [rbx]                ;   * tap 0
        paddsw      mm3, mm4

        ; Stepping rsi by one row here doubles as the per-iteration advance
        ; and lets rows 1 and 3 be addressed with scales 2 and 4 (there is no
        ; 3*rdx addressing mode).
        add         rsi, rdx

        movq        mm4, [rsi + 2*rdx]        ; row 1
        pmullw      mm4, mm6                  ;   * tap 3
        paddsw      mm3, mm4

        movq        mm4, [rsi + 4*rdx]        ; row 3
        pmullw      mm4, [rbx + 80]           ;   * tap 5
        paddsw      mm3, mm4

        paddsw      mm3, mm5                  ; add rounding constant
        psraw       mm3, VP8_FILTER_SHIFT     ; divide by 128
        packuswb    mm3, mm0                  ; clamp to 0..255, four bytes in low dword

        movd        [rdi], mm3                ; write four pixels

        ; Five of the six rows are fetched again on the next pass; the
        ; reloads are not cached in registers.
        add         rdi, rax                  ; next destination row
        dec         rcx
        jnz         .next_output_row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;void bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction writing eight rows of eight bytes.
;
; Horizontal pass: every source row is filtered with the two taps at
; HFilter[0] / HFilter[16 bytes] applied to [rsi] and [rsi+1], rounded with
; `rd`, shifted by VP8_FILTER_SHIFT and packed to bytes.
; Vertical pass: the previous packed row (kept in mm7) and the current row are
; combined with the taps at VFilter[0] / VFilter[16 bytes], rounded, shifted,
; packed and stored.
;
; Nine source rows are read (one before the loop, eight inside it), nine bytes
; wide each.  The loop stops when rdi reaches dst_ptr + 8*dst_pitch.
; Filter rows are selected as table + offset*32 bytes.
; `rd` is defined outside this view.  No emms is issued here.
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        shl         rax,        5 ; offset * 32
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = horizontal tap 0

        movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0 for byte->word unpacking

        shl         rax,        5 ; offset*32
        add         rax,        rcx ; VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end)
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT
        ; No spare register on 32-bit: dst_pitch is added from memory inside
        ; the loop.
%else
        ; Loop-invariant: load dst_pitch once instead of once per row.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; horizontal pass for the first source row; result is kept packed in mm7
        movq        mm3,        [rsi]               ; bytes 0..7 of the row
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; words 0..3
        punpckhbw   mm4,        mm0                 ; words 4..7

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; bytes 1..8 of the row
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; words 1..4
        punpckhbw   mm6,        mm0                 ; words 5..8

        pmullw      mm5,        mm2                 ; * horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,        VP8_FILTER_SHIFT    ; mm4 /= 128

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = first-pass row, 8 bytes

        add         rsi,        rdx                 ; next line
.next_row_8x8:
        ; horizontal pass on the current source row
        movq        mm3,        [rsi]               ; bytes 0..7 of the row
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; words 0..3
        punpckhbw   mm4,        mm0                 ; words 4..7

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; bytes 1..8 of the row
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; words 1..4
        punpckhbw   mm6,        mm0                 ; words 5..8

        pmullw      mm5,        mm2                 ; * horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous first-pass row (mm7) * vertical tap 0
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,        VP8_FILTER_SHIFT    ; mm4 /= 128

        ; remember this row's first-pass result for the next iteration
        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;


        ; current first-pass row * vertical tap 1
        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 /= 128

        paddw       mm4,        [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,        VP8_FILTER_SHIFT    ; mm4 /= 128

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store the results in the destination

        add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ;dst_pitch
%endif
        cmp         rdi,        rcx                 ; all eight rows written?
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a block 8 pixels wide and 4 rows high.
; Pass 1 (horizontal) blends each source pixel with its right neighbour using
; the two taps selected by xoffset; pass 2 (vertical) blends each filtered row
; with the filtered row below it using the two taps selected by yoffset.
; Both passes add rd and shift right by VP8_FILTER_SHIFT.
;
; Reads 5 source rows of 9 bytes each ([rsi] .. [rsi+8]); writes 4 rows of
; 8 bytes to dst_ptr. The loop exits when rdi == dst_ptr + 4*dst_pitch.
;
; Register roles after setup:
;   rsi = current source row          rdi = current destination row
;   rdx = src_pixels_per_line         rcx = dst_ptr + 4*dst_pitch (end marker)
;   rax = VFilter (tap 0 at [rax], tap 1 at [rax+16])
;   r8  = dst_pitch (64-bit builds only)
;   mm0 = 0, mm1 = HFilter tap 0, mm2 = HFilter tap 1
;   mm7 = previous row after pass 1, packed to bytes
;
; NOTE(review): filter entries are addressed with a 32-byte stride and taps
; 16 bytes apart; this presumably matches the layout of
; vp8_bilinear_filters_x86_8, which is defined outside this file - confirm.
; NOTE(review): no emms is issued here; presumably the caller clears the MMX
; state - confirm.
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5                   ; xoffset * 32 bytes per filter entry

        mov         rsi,        arg(0) ;src_ptr
        add         rax,        rcx                 ; rax = HFilter

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)

        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
        shl         rax,        5                   ; yoffset * 32 bytes per filter entry

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4*dst_pitch (loop end)

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant: load and sign-extend it once instead of
        ; on every row. 32-bit builds keep using the stack slot directly.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; pass 1 on the first source row
        movq        mm3,        [rsi]               ; source pixels 0..7
        movq        mm4,        mm3                 ; copy so both halves can be widened

        punpcklbw   mm3,        mm0                 ; pixels 0..3 as words
        punpckhbw   mm4,        mm0                 ; pixels 4..7 as words

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1

        movq        mm5,        [rsi+1]             ; source pixels 1..8 (right neighbours)
        movq        mm6,        mm5

        punpcklbw   mm5,        mm0                 ; pixels 1..4 as words
        punpckhbw   mm6,        mm0                 ; pixels 5..8 as words

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2

        paddw       mm3,        mm5                 ; sum of both taps, low half
        paddw       mm4,        mm6                 ; sum of both taps, high half

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,        VP8_FILTER_SHIFT    ; mm4 >>= VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm4                 ; mm7 = first filtered row, 8 bytes

        add         rsi,        rdx                 ; next source row
.next_row_8x4:
        ; pass 1 on the current source row
        movq        mm3,        [rsi]               ; source pixels 0..7
        movq        mm4,        mm3

        punpcklbw   mm3,        mm0                 ; pixels 0..3 as words
        punpckhbw   mm4,        mm0                 ; pixels 4..7 as words

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        pmullw      mm4,        mm1

        movq        mm5,        [rsi+1]             ; source pixels 1..8
        movq        mm6,        mm5

        punpcklbw   mm5,        mm0
        punpckhbw   mm6,        mm0

        pmullw      mm5,        mm2                 ; * HFilter tap 1
        pmullw      mm6,        mm2

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        ; pass 2, first tap: previous filtered row * VFilter tap 0
        movq        mm5,        mm7
        movq        mm6,        mm7

        punpcklbw   mm5,        mm0
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]
        pmullw      mm6,        [rax]

        ; finish pass 1 for the current row
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,        VP8_FILTER_SHIFT    ; mm4 >>= VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm4                 ; keep this filtered row for the next iteration

        ; pass 2, second tap: current filtered row * VFilter tap 1
        pmullw      mm3,        [rax+16]
        pmullw      mm4,        [rax+16]

        paddw       mm3,        mm5
        paddw       mm4,        mm6

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,        VP8_FILTER_SHIFT    ; mm4 >>= VP8_FILTER_SHIFT

        packuswb    mm3,        mm4                 ; saturate to 8 unsigned bytes

        movq        [rdi],      mm3                 ; store the results in the destination

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif
        cmp         rdi,        rcx                 ; all 4 destination rows written?
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a 4x4 block.
; Pass 1 (horizontal) blends each source pixel with its right neighbour using
; the two taps selected by xoffset; pass 2 (vertical) blends each filtered row
; with the filtered row below it using the two taps selected by yoffset.
; Both passes add rd and shift right by VP8_FILTER_SHIFT.
;
; Reads 5 source rows of 5 bytes each ([rsi] .. [rsi+4]); writes 4 rows of
; 4 bytes to dst_ptr. The loop exits when rdi == dst_ptr + 4*dst_pitch.
;
; Register roles after setup:
;   rsi = current source row          rdi = current destination row
;   rdx = src_pixels_per_line         rcx = dst_ptr + 4*dst_pitch (end marker)
;   rax = VFilter (tap 0 at [rax], tap 1 at [rax+16])
;   r8  = dst_pitch (64-bit builds only)
;   mm0 = 0, mm1 = HFilter tap 0, mm2 = HFilter tap 1
;   mm7 = previous row after pass 1, packed to bytes
;
; NOTE(review): filter entries are addressed with a 32-byte stride and taps
; 16 bytes apart; this presumably matches the layout of
; vp8_bilinear_filters_x86_8, which is defined outside this file - confirm.
; NOTE(review): no emms is issued here; presumably the caller clears the MMX
; state - confirm.
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5                   ; xoffset * 32 bytes per filter entry

        add         rax,        rcx                 ; rax = HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (4 words)

        movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (4 words)
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0, used to widen bytes to words
        shl         rax,        5                   ; yoffset * 32 bytes per filter entry

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4*dst_pitch (loop end)

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant: load and sign-extend it once instead of
        ; on every row. 32-bit builds keep using the stack slot directly.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; pass 1 on the first source row
        movd        mm3,        [rsi]               ; source pixels 0..3
        punpcklbw   mm3,        mm0                 ; widen to 4 words

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        movd        mm5,        [rsi+1]             ; source pixels 1..4 (right neighbours)

        punpcklbw   mm5,        mm0
        pmullw      mm5,        mm2                 ; * HFilter tap 1

        paddw       mm3,        mm5                 ; sum of both taps
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        movq        mm7,        mm3
        packuswb    mm7,        mm0                 ; mm7 = first filtered row, 4 bytes

        add         rsi,        rdx                 ; next source row
.next_row_4x4:
        ; pass 1 on the current source row
        movd        mm3,        [rsi]               ; source pixels 0..3
        punpcklbw   mm3,        mm0

        pmullw      mm3,        mm1                 ; * HFilter tap 0
        movd        mm5,        [rsi+1]             ; source pixels 1..4

        punpcklbw   mm5,        mm0
        pmullw      mm5,        mm2                 ; * HFilter tap 1

        paddw       mm3,        mm5

        ; pass 2, first tap: previous filtered row * VFilter tap 0
        movq        mm5,        mm7
        punpcklbw   mm5,        mm0

        pmullw      mm5,        [rax]

        ; finish pass 1 for the current row
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT
        movq        mm7,        mm3

        packuswb    mm7,        mm0                 ; keep this filtered row for the next iteration

        ; pass 2, second tap: current filtered row * VFilter tap 1
        pmullw      mm3,        [rax+16]
        paddw       mm3,        mm5

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        packuswb    mm3,        mm0                 ; saturate to unsigned bytes
        movd        [rdi],      mm3                 ; store the results in the destination

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif

        cmp         rdi,        rcx                 ; all 4 destination rows written?
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
SECTION_RODATA
align 16
; Rounding constant for the filter passes: four words of 0x40 (64), i.e. half
; of the filter weight 128. It is added to each word sum immediately before
; the arithmetic shift right by VP8_FILTER_SHIFT (7).
rd:
    times 4 dw 0x40
64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
align 16
; Six-tap filter coefficient table.
; Layout: 8 filters, each made of 6 taps; every tap is stored as 8 identical
; signed words (16 bytes), so one filter occupies 6 * 16 = 96 bytes.
; The six taps of every filter sum to 128 (vp8_filter_weight).
; NOTE(review): the code that indexes this table is outside this part of the
; file; the mapping from filter index to sub-pixel offset is presumed - confirm.
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    ; filter 0: 0, 0, 128, 0, 0, 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1: 0, -6, 123, 12, -1, 0
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2: 2, -11, 108, 36, -8, 1
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3: 0, -9, 93, 50, -6, 0
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4: 3, -16, 77, 77, -16, 3
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5: 0, -6, 50, 93, -9, 0
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6: 1, -8, 36, 108, -11, 2
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7: 0, -1, 12, 123, -6, 0
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0
70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
703