1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; void vp8_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; For every pixel k of the block (16 pixels per loop iteration):
;   diff            = frame1 pixel - frame2 pixel
;   modifier        = (3 * diff * diff + rounding) >> strength
;   modifier        = (16 - modifier), clamped at 0
;   modifier       *= filter_weight
;   count[k]       += modifier
;   accumulator[k] += modifier * frame2 pixel
; where rounding = 0x8000 >> (16 - strength).
;
; frame1 is walked with `stride`; frame2, accumulator and count are walked
; contiguously.  block_size 8 consumes two 8-pixel rows of frame1 per
; iteration, block_size 16 consumes one 16-pixel row.
; NOTE(review): only block_size 8 and 16 are handled coherently by the
; branches below - confirm callers never pass anything else.
;
; Alignment: frame2, accumulator and count are accessed with movdqa and so
; must be 16-byte aligned; for block_size 16 every row of frame1 must be too.
;
; Register roles inside the loop:
;   rsi = frame1 cursor        rdx = frame2 cursor     rcx = frame2 end
;   rdi = accumulator cursor   rax = count cursor      rbp = stride
;   xmm7 = zero while bytes are being unpacked (reused as data afterwards)
global sym(vp8_temporal_filter_apply_sse2) PRIVATE
sym(vp8_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; 16-byte aligned locals, addressed relative to rsp
    %define block_size    0
    %define strength      16
    %define filter_weight 32
    %define rounding_bit  48
    %define rbp_backup    64
    %define stack_size    80
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp    ; rbp is reused for stride below
    ; end prolog

        ; block_size is a 32-bit argument: load only the low dword so that
        ; stale upper bits of the argument slot can never be picked up.
        mov         edx,            dword ptr arg(3)
        mov         [rsp + block_size], rdx
        movd        xmm6,            arg(4)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is a 32-bit int, so do the arithmetic in 32 bits; movd
        ; zero-extends the shift count to the full xmm register.
        mov         edx,            16
        sub         edx,            dword ptr arg(4) ; 16 - strength
        movd        xmm4,           edx    ; psrlw takes its count from an xmm register
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(6) ; accumulator
        mov         rax,            arg(7) ; count

        ; dup the filter weight into all 8 words and store for later
        movd        xmm0,           arg(5) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; stride is loaded last because it overwrites rbp; arg() is
        ; presumably rbp-relative (see x86_abi_support.asm), so no arg()
        ; may be used past this point.  The dword load zero-extends the
        ; unsigned 32-bit stride.
        mov         ebp,            dword ptr arg(1) ; stride
        pxor        xmm7,           xmm7   ; zero for extraction

        ; rcx = end of the predictor: block_size * block_size bytes
        lea         rcx,            [rdx + 16*16*1]
        cmp         dword ptr [rsp + block_size], 8
        jne         .temporal_filter_apply_load_16
        lea         rcx,            [rdx + 8*8*1]

.temporal_filter_apply_load_8:
        ; two 8-pixel source rows form one 16-pixel batch
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        ; one 16-pixel source row forms the batch
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        ; |diff| <= 255, so the square (<= 65025) still fits an unsigned word
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        ; 3 * diff^2 exceeds 16 bits once |diff| >= 148.  A wrapping
        ; multiply would turn such a large difference into a small
        ; modifier, so build the product from unsigned saturating adds:
        ; a saturated value still ends up > 16 after the shift (for
        ; strength <= 11) and is clamped to weight 0 below.
        movdqa      xmm2,           xmm0   ; pred words are no longer needed
        movdqa      xmm3,           xmm1
        paddusw     xmm0,           xmm2   ; 2 * diff^2 (saturating)
        paddusw     xmm1,           xmm3
        paddusw     xmm0,           xmm2   ; 3 * diff^2 (saturating)
        paddusw     xmm1,           xmm3

        ; modifier += 0x8000 >> (16 - strength)
        paddusw     xmm0,           [rsp + rounding_bit]
        paddusw     xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (xmm7 is still zero here: nothing has written it since it was
        ; last cleared)
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 stops being the zero register from here on)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; done once the predictor cursor reaches the end of the block
        cmp         rdx,            rcx
        je          .temporal_filter_apply_epilog
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword ptr [rsp + block_size], 16
        je          .temporal_filter_apply_load_16
        jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup]
    add         rsp,            stack_size
    pop         rsp                        ; undo ALIGN_STACK
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
197
; Read-only, 16-byte aligned word constants (8 x 16-bit each) so that they
; can be used directly as aligned 128-bit memory operands.
SECTION_RODATA
align 16
_const_3w:                      ; 3 in every word
    times 8 dw 3
align 16
_const_top_bit:                 ; 0x8000 in every word
    times 8 dw 1<<15
align 16
_const_16w:                     ; 16 in every word
    times 8 dw 16
208