sad_sse3.asm revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; STACK_FRAME_CREATE_X3
;
; Common prologue for the SAD x3 kernels. Binds the symbolic operand names
; used by those kernels to concrete registers or stack slots for the ABI
; being assembled and emits whatever entry code that ABI needs. Each use
; must be matched by STACK_FRAME_DESTROY_X3.
;
;   src_ptr / src_stride   source pointer and row pitch      (args 0 and 1)
;   ref_ptr / ref_stride   reference pointer and row pitch   (args 2 and 3)
;   result_ptr             operand that yields the 5th argument
;   height                 dword view of that same 5th-argument location
;   end_ptr / ret_var      additional register names set up by the frame
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  ; 32-bit: every argument is on the stack and is read through arg(n),
  ; so the rbp frame has to exist before the first arg() access.
  %define     result_ptr    arg(4)
  %define     height        dword ptr arg(4)
  %define     ret_var       rbx
  %define     end_ptr       rcx
  %define     ref_stride    rdx
  %define     ref_ptr       rdi
  %define     src_stride    rax
  %define     src_ptr       rsi
    push        rbp
    mov         rbp,        rsp
    push        rsi                             ; popped again in reverse
    push        rdi                             ; order by the matching
    push        rbx                             ; DESTROY macro

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%elif LIBVPX_YASM_WIN64
  ; Win64: args 0-3 arrive in rcx, rdx, r8, r9. The 5th argument sits on the
  ; stack above the return address (8) and the four shadow slots (4*8).
  ; SAVE_XMM and xmm_stack_space come from the included ABI support file;
  ; presumably xmm_stack_space is the room SAVE_XMM takes for xmm6/xmm7.
    SAVE_XMM 7, u
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
    %define     ret_var     r11
    %define     end_ptr     r10
    %define     ref_stride  r9
    %define     ref_ptr     r8
    %define     src_stride  rdx
    %define     src_ptr     rcx
%else
  ; Remaining 64-bit targets: args 0-4 arrive in rdi, rsi, rdx, rcx, r8,
  ; so no entry code is needed.
    %define     result_ptr  r8
    %define     height      r8
    %define     ret_var     r10
    %define     end_ptr     r9
    %define     ref_stride  rcx
    %define     ref_ptr     rdx
    %define     src_stride  rsi
    %define     src_ptr     rdi
%endif

%endmacro
58
; STACK_FRAME_DESTROY_X3
;
; Common epilogue for the SAD x3 kernels; the counterpart of
; STACK_FRAME_CREATE_X3. Blanks the symbolic operand names, undoes the
; ABI-specific entry code and returns to the caller.
%macro STACK_FRAME_DESTROY_X3 0
  ; Redefine every alias as empty so a kernel-specific binding cannot leak
  ; into code that follows this epilogue.
  %define     height
  %define     result_ptr
  %define     ret_var
  %define     end_ptr
  %define     ref_stride
  %define     ref_ptr
  %define     src_stride
  %define     src_ptr

%if ABI_IS_32BIT
    ; Reverse of the pushes done by the prologue.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    ; Counterpart of the SAVE_XMM issued by the prologue.
    RESTORE_XMM
%endif
    ret
%endmacro
81
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
;
; Handles two rows of a 16-byte-wide block. For each row the source bytes
; are compared (psadbw) with the reference row at byte offsets +0, +1 and
; +2; the three running sums live in xmm5, xmm6 and xmm7 respectively.
;
;   mode 0     first row pair: xmm5-xmm7 are overwritten, so they need no
;              clearing beforehand; src and ref then advance by two rows
;   mode 1     add to xmm5-xmm7; src and ref then advance by two rows
;   otherwise  add to xmm5-xmm7 and leave src and ref untouched (the
;              kernels pass 2 for their final row pair)
;
; Source rows are fetched with movdqa and must therefore be 16-byte
; aligned; reference rows are fetched with lddqu and may be unaligned.
; Scratch registers: xmm0-xmm3.
; psadbw leaves one 16-bit sum in each 64-bit half, at most 8*255 per row,
; so word adds are sufficient for the 16 rows the largest kernel feeds in.
%macro PROCESS_16X2X3 5
        ; ---- first row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2]        ; source row
%if %1==0
        ; Opening pair: the SADs go straight into the accumulators.
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; Later pairs: compute into scratch, then fold into the accumulators.
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif

        ; ---- second row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2+%4]     ; source row
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if (%1==0) || (%1==1)
        ; Step both pointers past the two rows just loaded.
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
124
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
;
; MMX counterpart of the 16-wide row-pair macro: handles two rows of an
; 8-byte-wide block. For each row the source bytes are compared (psadbw)
; with the reference row at byte offsets +0, +1 and +2; the three running
; sums live in mm5, mm6 and mm7 respectively.
;
;   mode 0     first row pair: mm5-mm7 are overwritten, so they need no
;              clearing beforehand; src and ref then advance by two rows
;   mode 1     add to mm5-mm7; src and ref then advance by two rows
;   otherwise  add to mm5-mm7 and leave src and ref untouched (the kernels
;              pass 2 for their final row pair)
;
; Scratch registers: mm0-mm3. The macro itself never issues emms.
%macro PROCESS_8X2X3 5
        ; ---- first row of the pair ----
        movq            mm0,       QWORD PTR [%2]           ; source row
%if %1==0
        ; Opening pair: the SADs go straight into the accumulators.
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        ; Later pairs: compute into scratch, then fold into the accumulators.
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif

        ; ---- second row of the pair ----
        movq            mm0,       QWORD PTR [%2+%4]        ; source row
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if (%1==0) || (%1==1)
        ; Step both pointers past the two rows just loaded.
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro
167
;void vpx_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 16x16 source block against three
; reference candidates located at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values in results[0], results[1]
; and results[2]; nothing is returned in a register.
; Source rows are read with movdqa (see PROCESS_16X2X3), so every source
; row has to be 16-byte aligned.
global sym(vpx_sad16x16x3_sse3) PRIVATE
sym(vpx_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 16 rows = 8 row pairs: one opening pair that seeds xmm5-xmm7,
        ; six accumulating pairs, and a closing pair that skips the
        ; pointer advance.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx doubles as one of the input operands on the 64-bit ABIs, so
        ; the results pointer is only fetched once the rows are done.
        mov             rcx,        result_ptr

        ; Each accumulator carries one partial sum per 64-bit half:
        ; copy the low half, shift the high half down, add, store 32 bits.

        ; results[0] <- candidate at offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1] <- candidate at offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2] <- candidate at offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
209
;void vpx_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 16-wide, 8-row source block against
; three reference candidates located at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values in results[0], results[1]
; and results[2]; nothing is returned in a register.
; Source rows are read with movdqa (see PROCESS_16X2X3), so every source
; row has to be 16-byte aligned.
global sym(vpx_sad16x8x3_sse3) PRIVATE
sym(vpx_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 rows = 4 row pairs: one opening pair that seeds xmm5-xmm7,
        ; two accumulating pairs, and a closing pair that skips the
        ; pointer advance.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx doubles as one of the input operands on the 64-bit ABIs, so
        ; the results pointer is only fetched once the rows are done.
        mov             rcx,        result_ptr

        ; Each accumulator carries one partial sum per 64-bit half:
        ; copy the low half, shift the high half down, add, store 32 bits.

        ; results[0] <- candidate at offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1] <- candidate at offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2] <- candidate at offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
247
;void vpx_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 8-wide, 16-row source block against
; three reference candidates located at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values in results[0], results[1]
; and results[2]; nothing is returned in a register.
; NOTE(review): the kernel works in MMX registers and no emms appears in
; this routine; presumably the caller resets the x87/MMX state - confirm.
global sym(vpx_sad8x16x3_sse3) PRIVATE
sym(vpx_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 16 rows = 8 row pairs: one opening pair that seeds mm5-mm7,
        ; six accumulating pairs, and a closing pair that skips the
        ; pointer advance.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx doubles as one of the input operands on the 64-bit ABIs, so
        ; the results pointer is only fetched once the rows are done.
        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums into one quadword: low dword from mm5,
        ; high dword from the low dword of mm6.
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
276
;void vpx_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 8x8 source block against three
; reference candidates located at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values in results[0], results[1]
; and results[2]; nothing is returned in a register.
; NOTE(review): the kernel works in MMX registers and no emms appears in
; this routine; presumably the caller resets the x87/MMX state - confirm.
global sym(vpx_sad8x8x3_sse3) PRIVATE
sym(vpx_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 rows = 4 row pairs: one opening pair that seeds mm5-mm7,
        ; two accumulating pairs, and a closing pair that skips the
        ; pointer advance.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx doubles as one of the input operands on the 64-bit ABIs, so
        ; the results pointer is only fetched once the rows are done.
        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums into one quadword: low dword from mm5,
        ; high dword from the low dword of mm6.
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
301
;void vpx_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of one 4x4 source block against three
; reference candidates located at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values in results[0], results[1]
; and results[2]; nothing is returned in a register.
;
; Two 4-byte rows are byte-interleaved into a single MMX register so that
; one psadbw covers both rows; source and reference rows are interleaved
; the same way, which keeps matching bytes lined up.
; NOTE(review): the kernel works in MMX registers and no emms appears in
; this routine; presumably the caller resets the x87/MMX state - confirm.
global sym(vpx_sad4x4x3_sse3) PRIVATE
sym(vpx_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ------------------------- rows 0 and 1 -------------------------
        ; mm0 = source rows 0/1, interleaved
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm2

        ; mm1 = partial SAD for the candidate at offset +0
        movd            mm1,        DWORD PTR [ref_ptr]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw       mm1,        mm3
        psadbw          mm1,        mm0

        ; mm4 = partial SAD for the candidate at offset +1
        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw       mm4,        mm2
        psadbw          mm4,        mm0

        ; mm5 = partial SAD for the candidate at offset +2
        movd            mm5,        DWORD PTR [ref_ptr+2]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw       mm5,        mm3
        psadbw          mm5,        mm0

        ; Step both pointers down to row 2.
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; ------------------------- rows 2 and 3 -------------------------
        ; mm0 = source rows 2/3, interleaved
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm3

        ; offset +0: finish the total in mm1
        movd            mm2,        DWORD PTR [ref_ptr]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw       mm2,        mm6
        psadbw          mm2,        mm0
        paddw           mm1,        mm2

        ; offset +1: total ends up in mm3
        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw       mm3,        mm2
        psadbw          mm3,        mm0
        paddw           mm3,        mm4

        ; offset +2: total ends up in mm7
        movd            mm7,        DWORD PTR [ref_ptr+2]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw       mm7,        mm6
        psadbw          mm7,        mm0
        paddw           mm7,        mm5

        ; --------------------------- store ------------------------------
        ; rcx doubles as one of the input operands on the 64-bit ABIs, so
        ; the results pointer is only fetched once the rows are done.
        mov             rcx,        result_ptr

        ; Pack the +0 and +1 totals into one quadword.
        punpckldq       mm1,        mm3

        movq            [rcx],      mm1         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
375