highbd_variance_impl_sse2.asm revision 7ce0a1d1337c01056ba24006efab21f00e179e04
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    uint16_t        *  src_ptr,
;    int             source_stride,
;    uint16_t        *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;  src_ptr and ref_ptr point to 16-bit samples; both strides are given in
;  samples and are converted to byte strides below.
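;
; *SSE receives the block's sum of squared differences and *Sum its signed
; sum of differences.  As a hedged sketch only (the surrounding wrapper code
; is an assumption, not part of this file), a caller would typically fold
; the two outputs into a variance by subtracting sum*sum/256 for a 16x16
; block:
;
;     uint32_t sse;
;     int sum;
;     vpx_highbd_calc16x16var_sse2(src, src_stride, ref, ref_stride,
;                                  &sse, &sum);
;     uint32_t var = sse - (uint32_t)(((int64_t)sum * sum) >> 8);  // /256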
global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
        add         rax,            rax ; source stride in bytes
        add         rdx,            rdx ; recon stride in bytes

        ; Prefetch data
        prefetcht0      [rsi]
        prefetcht0      [rsi+16]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax+16]
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax+16]

        prefetcht0      [rdi]
        prefetcht0      [rdi+16]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx+16]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx+16]

        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs

        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
        mov         rcx,            16

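        ; Each trip through .var16loop handles two 16-pixel rows (four
        ; 8-word loads from each of src and ref): xmm5 collects this
        ; iteration's pixel differences, xmm6 the running sum of squares.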
.var16loop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax+16]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+16]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx+16]

        pxor        xmm5,           xmm5     ; clear xmm5 for this iteration's diff sum

        psubw       xmm1,           xmm2     ; diffs, row 0 words 0-7
        movdqu      xmm3,           XMMWORD PTR [rsi+16]
        paddw       xmm5,           xmm1     ; accumulate diffs
        pmaddwd     xmm1,           xmm1     ; square diffs and pair-sum to dwords
        movdqu      xmm2,           XMMWORD PTR [rdi+16]
        paddd       xmm6,           xmm1     ; accumulate sse

        psubw       xmm3,           xmm2     ; diffs, row 0 words 8-15
        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        paddd       xmm6,           xmm3

        psubw       xmm1,           xmm2     ; diffs, row 1 words 0-7
        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
        paddd       xmm6,           xmm1

        psubw       xmm3,           xmm2     ; diffs, row 1 words 8-15
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        paddd       xmm6,           xmm3

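        ; xmm5 now holds eight signed 16-bit diff sums.  Build an all-ones
        ; mask for the words that are negative, then interleave it in as the
        ; high halves so the sums are sign-extended to 32 bits before being
        ; added to the Sum accumulator in xmm7.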
        movdqa      xmm1,           xmm5
        movdqa      xmm2,           xmm5
        pcmpgtw     xmm1,           xmm0
        pcmpeqw     xmm2,           xmm0
        por         xmm1,           xmm2
        pcmpeqw     xmm1,           xmm0
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1
        punpckhwd   xmm2,           xmm1
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2

        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]
        sub         rcx,            2
        jnz         .var16loop

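        ; Reduce the accumulators: add the four 32-bit lanes of xmm6 (sse)
        ; and xmm7 (sum) so each total lands in the low dword.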
        movdqa      xmm4,           xmm6
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm4,           xmm0
        movdqa      xmm5,           xmm7

        paddd       xmm6,           xmm4
        punpckldq   xmm7,           xmm0

        punpckhdq   xmm5,           xmm0
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7

        psrldq      xmm4,           8
        psrldq      xmm5,           8

        paddd       xmm6,           xmm4
        paddd       xmm7,           xmm5

        mov         rdi,            arg(4)   ; [SSE]
        mov         rax,            arg(5)   ; [Sum]

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7

    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    uint16_t        *  src_ptr,
;    int             source_stride,
;    uint16_t        *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;  As above, src_ptr and ref_ptr point to 16-bit samples and the strides
;  are given in samples.
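;
; Like the 16x16 version, this writes per-block partial results rather than
; a finished variance.  As a rough, hypothetical sketch (the helper below is
; illustrative only, not part of this file), a caller can cover a wider
; region by summing the per-8x8 outputs:
;
;     static void sums_16x8(const uint16_t *src, int src_stride,
;                           const uint16_t *ref, int ref_stride,
;                           uint32_t *sse, int *sum) {
;       uint32_t sse0, sse1;
;       int sum0, sum1;
;       vpx_highbd_calc8x8var_sse2(src, src_stride, ref, ref_stride,
;                                  &sse0, &sum0);
;       vpx_highbd_calc8x8var_sse2(src + 8, src_stride, ref + 8, ref_stride,
;                                  &sse1, &sum1);
;       *sse = sse0 + sse1;  // SSE and Sum add across sub-blocks
;       *sum = sum0 + sum1;
;     }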
global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
        add         rax,            rax ; source stride in bytes
        add         rdx,            rdx ; recon stride in bytes

        ; Prefetch data
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        lea             rbx,    [rsi+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]

        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        lea             rbx,    [rdi+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]

        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs

        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
        mov         rcx,            8

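        ; Each trip through .var8loop handles four 8-pixel rows (two row
        ; pairs), which is why the counter steps down by 4.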
.var8loop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        lea             rbx,    [rsi+rax*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        lea             rbx,    [rbx+rax*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        lea             rbx,    [rdi+rdx*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        lea             rbx,    [rbx+rdx*2]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]

        pxor        xmm5,           xmm5     ; clear xmm5 for this iteration's diff sum

        psubw       xmm1,           xmm2     ; diffs, row 0
        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
        paddw       xmm5,           xmm1     ; accumulate diffs
        pmaddwd     xmm1,           xmm1     ; square diffs and pair-sum to dwords
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        paddd       xmm6,           xmm1     ; accumulate sse

        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]

        psubw       xmm3,           xmm2     ; diffs, row 1
        movdqu      xmm1,           XMMWORD PTR [rsi]
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        movdqu      xmm2,           XMMWORD PTR [rdi]
        paddd       xmm6,           xmm3

        psubw       xmm1,           xmm2     ; diffs, row 2
        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
        paddw       xmm5,           xmm1
        pmaddwd     xmm1,           xmm1
        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
        paddd       xmm6,           xmm1

        psubw       xmm3,           xmm2     ; diffs, row 3
        paddw       xmm5,           xmm3
        pmaddwd     xmm3,           xmm3
        paddd       xmm6,           xmm3

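        ; As in the 16x16 loop, sign-extend the 16-bit diff sums in xmm5 to
        ; 32 bits (via a negative-word mask) and accumulate them into xmm7.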
        movdqa      xmm1,           xmm5
        movdqa      xmm2,           xmm5
        pcmpgtw     xmm1,           xmm0
        pcmpeqw     xmm2,           xmm0
        por         xmm1,           xmm2
        pcmpeqw     xmm1,           xmm0
        movdqa      xmm2,           xmm5
        punpcklwd   xmm5,           xmm1
        punpckhwd   xmm2,           xmm1
        paddd       xmm7,           xmm5
        paddd       xmm7,           xmm2

        lea         rsi,            [rsi + 2*rax]
        lea         rdi,            [rdi + 2*rdx]
        sub         rcx,            4
        jnz         .var8loop

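        ; Reduce the accumulators as in the 16x16 version: add the four
        ; 32-bit lanes of xmm6 (sse) and xmm7 (sum) into the low dword.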
        movdqa      xmm4,           xmm6
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm4,           xmm0
        movdqa      xmm5,           xmm7

        paddd       xmm6,           xmm4
        punpckldq   xmm7,           xmm0

        punpckhdq   xmm5,           xmm0
        paddd       xmm7,           xmm5

        movdqa      xmm4,           xmm6
        movdqa      xmm5,           xmm7

        psrldq      xmm4,           8
        psrldq      xmm5,           8

        paddd       xmm6,           xmm4
        paddd       xmm7,           xmm5

        mov         rdi,            arg(4)   ; [SSE]
        mov         rax,            arg(5)   ; [Sum]

        movd DWORD PTR [rdi],       xmm6
        movd DWORD PTR [rax],       xmm7

    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
