;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp9_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; Syntax: NASM/YASM, Intel operand order. ABI differences are hidden
; behind the prolog/epilog macros and arg() (presumably supplied by
; x86_abi_support.asm).
;
; Per pixel k, 16 pixels per loop iteration:
;   d  = frame1[k] - frame2[k]
;   m  = min(3*d*d, 0xffff)                          (16-bit, saturating)
;   m  = (m + (0x8000 >> (16 - strength))) >> strength
;   m  = (16 - min(m, 16)) * filter_weight           (low 16 bits)
;   count[k]       += m                              (16-bit add)
;   accumulator[k] += low16(m * frame2[k])           (32-bit add)
;
; block_size == 8  : two 8-byte rows of frame1 per iteration, 8*8 bytes
;                    of frame2 consumed in total.
; block_size == 16 : one 16-byte row of frame1 per iteration, 16*16
;                    bytes of frame2 consumed in total.
; Other block sizes are not handled consistently (the entry test checks
; for 8, the loop test checks for 16).
;
; frame2, accumulator and count are walked contiguously (no stride).
;
; Alignment: frame2, accumulator, count and, on the 16-wide path, every
; frame1 row are accessed with movdqa and therefore must be 16-byte
; aligned.
;
; Register roles inside the loop:
;   rsi = frame1 row pointer      rbp = stride (frame pointer is parked
;   rdx = frame2 pointer                in [rsp + rbp_backup])
;   rcx = end of frame2 block     rdi = accumulator pointer
;   rax = count pointer           xmm7 = zero (for byte/word unpacking)
;-----------------------------------------------------------------------
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; offsets into the 16-byte aligned scratch area at rsp
    %define block_size    0
    %define strength      16
    %define filter_weight 32
    %define rounding_bit  48
    %define rbp_backup    64
    %define stack_size    80
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp    ; rbp is reused for stride below
    ; end prolog

        ; block_size is a 32-bit value and is only ever compared as a dword
        mov         edx,            dword ptr arg(3)
        mov         dword ptr [rsp + block_size], edx
        movd        xmm6,            arg(4)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is a C int: read it as a dword so that the upper half of
        ; its 8-byte argument slot cannot leak into the shift count.
        mov         edx,            16
        sub         edx,            dword ptr arg(4) ; 16 - strength
        movd        xmm4,           edx    ; psrlw needs the count in an xmm reg
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(6) ; accumulator
        mov         rax,            arg(7) ; count

        ; dup the filter weight and store for later
        movd        xmm0,           arg(5) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; stride is an unsigned int: a dword load zero-extends it on 64-bit
        ; targets instead of trusting the upper half of the argument slot.
        ; NOTE(review): arg() is assumed to be rbp-relative, so this has to
        ; remain the last arg() access.
        mov         ebp,            dword ptr arg(1) ; stride
        pxor        xmm7,           xmm7   ; zero for extraction

        ; rcx = one past the last predictor byte to process
        lea         rcx,            [rdx + 16*16*1]
        cmp         dword ptr [rsp + block_size], 8
        jne         .temporal_filter_apply_load_16
        lea         rcx,            [rdx + 8*8*1]

.temporal_filter_apply_load_8:
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        ; |modifier| <= 255, so the square (<= 65025) fits an unsigned word
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3, saturating at 0xffff
        ; A plain 16-bit multiply wraps once |src - pred| >= 148
        ; (3 * 148^2 = 65712), which would hand a non-zero weight to pixels
        ; that differ the most. xmm2/xmm3 are free again at this point.
        movdqa      xmm2,           xmm0
        movdqa      xmm3,           xmm1
        paddusw     xmm0,           xmm0   ; 2 * modifier^2
        paddusw     xmm1,           xmm1
        paddusw     xmm0,           xmm2   ; 3 * modifier^2
        paddusw     xmm1,           xmm3

        ; modifier += 0x8000 >> (16 - strength)
        ; saturating, so an already clamped value cannot wrap back to ~0
        paddusw     xmm0,           [rsp + rounding_bit]
        paddusw     xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (xmm7 is still zero here: it is only overwritten by the
        ;  accumulator load further down)
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48] ; clobbers the zero register
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; done once the whole predictor block has been consumed
        cmp         rdx,            rcx
        je          .temporal_filter_apply_epilog
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword ptr [rsp + block_size], 16
        je          .temporal_filter_apply_load_16
        jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup] ; frame pointer back from scratch
    add         rsp,            stack_size
    pop         rsp                        ; presumably the rsp saved by ALIGN_STACK
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants: each label is 16-byte aligned and holds eight
; identical 16-bit words, so it can be used as an aligned SSE2 operand.
SECTION_RODATA
align 16
_const_3w:
    times 8 dw 3
align 16
_const_top_bit:
    times 8 dw 1<<15
align 16
_const_16w:
    times 8 dw 16
208