1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------------
; void vp9_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_width,      |  3
;   unsigned int    block_height,     |  4
;   int             strength,         |  5
;   int             filter_weight,    |  6
;   unsigned int   *accumulator,      |  7
;   unsigned short *count)            |  8
;
; Processes 16 pixels per loop iteration, all in 16-bit lanes:
;   modifier        = (3 * (src - pred)^2 + rounding) >> strength
;   modifier        = saturating(16 - modifier) * filter_weight
;   count[i]       += modifier
;   accumulator[i] += modifier * pred          (widened to 32 bits)
; where rounding = 0x8000 >> (16 - strength).
;
; Layout as the code reads it:
;   * frame1 is walked row by row using 'stride'. With block_width == 8 two
;     8-byte rows are gathered per iteration; otherwise one 16-byte row is
;     loaded with movdqa (so it must be 16-byte aligned).
;   * frame2, count and accumulator are walked contiguously and accessed with
;     movdqa, so they must be 16-byte aligned.
;   * Only block_width 8 and 16 are meaningful; any other value takes the
;     16-wide path.
;   * The loop ends once the frame2 pointer reaches
;     frame2 + block_width * block_height; a zero-sized block does nothing.
;
; Register roles inside the loop:
;   rsi = frame1 row pointer      rbp = stride
;   rdx = frame2 pointer          rcx = one past the end of frame2
;   rdi = accumulator pointer     rax = count pointer
;   xmm7 = zero (for byte/word unpacking) until the accumulator update
;-----------------------------------------------------------------------------
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; 16-byte slots in the local frame
    %define block_width    0
    %define block_height  16
    %define strength      32
    %define filter_weight 48
    %define rounding_bit  64
    %define rbp_backup    80
    %define stack_size    96
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp    ; rbp is reused for stride below
    ; end prolog

        ; 32-bit loads: the arguments are (unsigned) int, and writing edx
        ; zero-extends, so the stored copies have clean upper halves
        mov         edx,            arg(3)
        mov         [rsp + block_width], rdx
        mov         edx,            arg(4)
        mov         [rsp + block_height], rdx
        movd        xmm6,           arg(5)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is an int: use 32-bit arithmetic so whatever sits in the
        ; upper half of the argument slot cannot leak into the shift count
        mov         edx,            16
        sub         edx,            arg(5) ; 16 - strength
        movd        xmm4,           edx    ; shift count must come from an xmm reg
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(7) ; accumulator
        mov         rax,            arg(8) ; count

        ; dup the filter weight into all 8 words and store for later
        movd        xmm0,           arg(6) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; stride is an unsigned int: load 32 bits, zero-extended.
        ; arg() is rbp-relative, so no argument may be read after this point.
        mov         ebp,            arg(1) ; stride

        ; rcx = frame2 + block_width * block_height (loop end marker)
        mov         rcx,            [rsp + block_width]
        imul        rcx,            [rsp + block_height]
        test        rcx,            rcx
        jz          .temporal_filter_apply_epilog ; empty block: nothing to do
        add         rcx,            rdx

.temporal_filter_apply_next:
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword [rsp + block_width], 8
        jne         .temporal_filter_apply_load_16

.temporal_filter_apply_load_8:
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        pmullw      xmm0,           [GLOBAL(_const_3w)]
        pmullw      xmm1,           [GLOBAL(_const_3w)]

        ; modifier += 0x8000 >> (16 - strength)
        paddw       xmm0,           [rsp + rounding_bit]
        paddw       xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (xmm7 is still zero here; it is only overwritten further down)
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 stops being the zero register from here on)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; keep going while the predictor pointer is below the end marker;
        ; the unsigned '<' test also terminates if the block size is not a
        ; multiple of 16, where an equality test would never fire
        cmp         rdx,            rcx
        jb          .temporal_filter_apply_next

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup]
    add         rsp,            stack_size
    pop         rsp                        ; counterpart of ALIGN_STACK in the prolog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
202
; Read-only 8 x 16-bit constants. Each one is 16-byte aligned because the
; code uses them as 128-bit memory operands.
SECTION_RODATA
align 16
_const_3w:                  ; eight words of 3
    times 8 dw 3
align 16
_const_top_bit:             ; eight words of 0x8000
    times 8 dw 1<<15
align 16
_const_16w:                 ; eight words of 16
    times 8 dw 16
213