;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
1079f15823c34ae1e423108295e416213200bb280fAndreas Huber
1179f15823c34ae1e423108295e416213200bb280fAndreas Huber
1279f15823c34ae1e423108295e416213200bb280fAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1379f15823c34ae1e423108295e416213200bb280fAndreas Huber
; void vp8_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; Processes 16 pixels per loop iteration. For every pixel i (all word math
; is done in 16-bit lanes):
;
;   modifier        = frame1[i] - frame2[i]
;   modifier        = (3 * modifier * modifier + rounding) >> strength
;   modifier        = (16 - modifier, saturated at 0) * filter_weight
;   count[i]       += modifier
;   accumulator[i] += low 16 bits of (modifier * frame2[i])
;
; where rounding = 0x8000 >> (16 - strength).
;
; frame1 is stepped by `stride` per row; frame2, accumulator and count are
; walked contiguously. The loop ends when the frame2 cursor has advanced by
; 8*8 bytes (block_size == 8) or 16*16 bytes (any other block_size).
; frame2, accumulator, count and (on the 16-wide path) frame1 rows are
; accessed with movdqa, i.e. they must be 16-byte aligned.
;
; Register roles inside the loop:
;   rsi  = frame1 cursor            rdx  = frame2 cursor
;   rcx  = end of frame2 block      rbp  = stride (caller's rbp is parked
;   rdi  = accumulator cursor              on the stack at rbp_backup)
;   rax  = count cursor             xmm7 = zero while unpacking bytes/words
global sym(vp8_temporal_filter_apply_sse2) PRIVATE
sym(vp8_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; 16-byte-aligned local slots, addressed relative to rsp
    %define block_size    0
    %define strength      16
    %define filter_weight 32
    %define rounding_bit  48
    %define rbp_backup    64
    %define stack_size    80
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp
    ; end prolog

        mov         rdx,            arg(3)
        mov         [rsp + block_size], rdx ; only the low dword is compared later
        movd        xmm6,            arg(4)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is a 32-bit int: read exactly 32 bits so that whatever sits
        ; in the upper half of its argument slot cannot leak into the shift
        ; count (a count above 15 would silently zero the rounding term).
        mov         edx,            16
        sub         edx,            arg(4) ; 16 - strength
        movd        xmm4,           edx    ; psrlw needs the count in an xmm register
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(6) ; accumulator
        mov         rax,            arg(7) ; count

        ; dup the filter weight and store for later
        movd        xmm0,           arg(5) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; rbp is repurposed as the stride from here on, so this must be the
        ; last arg() access (NOTE(review): arg() is assumed to be rbp-relative,
        ; as defined by x86_abi_support.asm - confirm).
        mov         rbp,            arg(1) ; stride
        pxor        xmm7,           xmm7   ; zero for extraction

        ; rcx = one past the last predictor byte of the block
        lea         rcx,            [rdx + 16*16*1]
        cmp         dword ptr [rsp + block_size], 8
        jne         .temporal_filter_apply_load_16
        lea         rcx,            [rdx + 8*8*1]

.temporal_filter_apply_load_8:
        ; two 8-pixel source rows make up one 16-pixel batch
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        ; one 16-pixel source row is one batch
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        pmullw      xmm0,           [GLOBAL(_const_3w)]
        pmullw      xmm1,           [GLOBAL(_const_3w)]

        ; modifier += 0x8000 >> (16 - strength)
        paddw       xmm0,           [rsp + rounding_bit]
        paddw       xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (xmm7 is still zero here: it has only been used as a source operand
        ; since it was last cleared)
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 stops being the zero register from here)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        cmp         rdx,            rcx    ; whole predictor block consumed?
        je          .temporal_filter_apply_epilog
        pxor        xmm7,           xmm7   ; zero for extraction
        ; same test as the entry dispatch, so the load path and the end
        ; pointer in rcx always agree
        cmp         dword ptr [rsp + block_size], 8
        jne         .temporal_filter_apply_load_16
        jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup]
    add         rsp,            stack_size
    pop         rsp                        ; undo ALIGN_STACK
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
19779f15823c34ae1e423108295e416213200bb280fAndreas Huber
; 16-byte-aligned word constants used by vp8_temporal_filter_apply_sse2.
SECTION_RODATA
align 16
_const_3w:                      ; eight words of 3: scales the squared difference
    times 8 dw 3
align 16
_const_top_bit:                 ; eight words of 0x8000: shifted right to form the rounding term
    times 8 dw 1<<15
align 16
_const_16w:                     ; eight words of 16: minuend of the saturating "16 - modifier"
    times 8 dw 16
208