1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; STACK_FRAME_CREATE_X3
;
; Prologue used by every x3 SAD routine in this file.  It binds the symbolic
; operand names used by the function bodies (src_ptr, src_stride, ref_ptr,
; ref_stride, end_ptr, ret_var, result_ptr, max_err, height) to the registers
; or memory slots of the ABI being assembled for:
;
;   32-bit ABI   builds an rbp frame, saves rsi/rdi/rbx and loads the first
;                four arguments through arg(n)
;   Win64        emits only SAVE_XMM 7, u; the first four arguments are used
;                in place in rcx/rdx/r8/r9, the fifth is read from the stack
;   other 64-bit emits no instructions; the arguments are used in place in
;                rdi/rsi/rdx/rcx/r8
;
; result_ptr, max_err and height are three names for the same fifth-argument
; slot.  arg() and SAVE_XMM come from x86_abi_support.asm and are not visible
; in this file.
;
; Always pair with STACK_FRAME_DESTROY_X3, which undoes the pushes and emits
; the ret.
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
    ; ---- 32-bit: every argument lives on the stack -------------------------
    %define src_ptr     rsi
    %define ref_ptr     rdi
    %define src_stride  rax
    %define ref_stride  rdx
    %define end_ptr     rcx
    %define ret_var     rbx
    %define result_ptr  arg(4)
    %define max_err     arg(4)
    %define height      dword ptr arg(4)

    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%elif LIBVPX_YASM_WIN64
    ; ---- Win64 -------------------------------------------------------------
    ; The kernels accumulate in xmm5-xmm7.
    ; NOTE(review): SAVE_XMM 7, u presumably spills xmm6/xmm7 - confirm.
    SAVE_XMM 7, u

    %define src_ptr     rcx
    %define src_stride  rdx
    %define ref_ptr     r8
    %define ref_stride  r9
    %define end_ptr     r10
    %define ret_var     r11
    ; Fifth argument.  NOTE(review): the offset looks like XMM save area +
    ; return address (8) + four 8-byte home slots - confirm against SAVE_XMM.
    %define result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define max_err     [rsp+xmm_stack_space+8+4*8]
    %define height      dword ptr [rsp+xmm_stack_space+8+4*8]
%else
    ; ---- remaining 64-bit targets: everything is already in registers ------
    %define src_ptr     rdi
    %define src_stride  rsi
    %define ref_ptr     rdx
    %define ref_stride  rcx
    %define end_ptr     r9
    %define ret_var     r10
    %define result_ptr  r8
    %define max_err     r8
    %define height      r8
%endif
%endmacro
61
; STACK_FRAME_DESTROY_X3
;
; Epilogue matching STACK_FRAME_CREATE_X3.  It redefines every operand alias
; as empty, undoes whatever the prologue emitted for the current ABI, and
; returns to the caller.  No emms is issued here.
%macro STACK_FRAME_DESTROY_X3 0
    ; Blank the aliases so they cannot leak into code that follows.
    %define src_ptr
    %define src_stride
    %define ref_ptr
    %define ref_stride
    %define end_ptr
    %define ret_var
    %define result_ptr
    %define max_err
    %define height

%if ABI_IS_32BIT
    ; Reverse order of the pushes in the prologue.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    ; Counterpart of the SAVE_XMM in the prologue.
    RESTORE_XMM
%endif
    ret
%endmacro
85
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
;
; Handles two 16-byte rows: computes the sum of absolute differences of the
; source rows against the reference rows at byte offsets +0, +1 and +2.
;
;   param 1  mode   0     first call: row 0 initialises the accumulators,
;                         then both pointers advance by two rows
;                   1     adds to the accumulators, then both pointers
;                         advance by two rows
;                   other adds to the accumulators and leaves the pointers
;                         untouched (callers in this file pass 2 last)
;   param 2  source pointer register; rows are read with movdqa, so each
;            source row must be 16-byte aligned
;   param 3  reference pointer register; rows are read with lddqu
;   param 4  source stride register
;   param 5  reference stride register
;
; Accumulators: xmm5 (ref+0), xmm6 (ref+1), xmm7 (ref+2).  Each keeps two
; partial sums, one per 64-bit half, exactly as psadbw produces them.
; Scratch: xmm0-xmm3.
%macro PROCESS_16X2X3 5
        ; ---- row 0 ----
        movdqa          xmm0,       XMMWORD PTR [%2]
%if %1 != 0
        ; Accumulate: compute into scratch, then add to the running sums.
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%else
        ; Initialise: compute straight into the accumulators.
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%endif

        ; ---- row 1 ----
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if (%1==0) || (%1==1)
        ; All loads of this pair are issued; step both pointers two rows on.
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
128
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
;
; MMX counterpart of the 16-wide row macro: handles two 8-byte rows and
; computes the sum of absolute differences of the source rows against the
; reference rows at byte offsets +0, +1 and +2.
;
;   param 1  mode   0     first call: row 0 initialises the accumulators,
;                         then both pointers advance by two rows
;                   1     adds to the accumulators, then both pointers
;                         advance by two rows
;                   other adds to the accumulators and leaves the pointers
;                         untouched (callers in this file pass 2 last)
;   param 2  source pointer register
;   param 3  reference pointer register
;   param 4  source stride register
;   param 5  reference stride register
;
; Accumulators: mm5 (ref+0), mm6 (ref+1), mm7 (ref+2).
; Scratch: mm0-mm3.
%macro PROCESS_8X2X3 5
        ; ---- row 0 ----
        movq            mm0,       QWORD PTR [%2]
%if %1 != 0
        ; Accumulate: compute into scratch, then add to the running sums.
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%else
        ; Initialise: compute straight into the accumulators.
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%endif

        ; ---- row 1 ----
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if (%1==0) || (%1==1)
        ; All loads of this pair are issued; step both pointers two rows on.
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro
171
;void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of a 16x16 source block against three reference blocks starting at
; ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are stored as 32-bit
; values in results[0], results[1] and results[2].
; Source rows are loaded with movdqa and must therefore be 16-byte aligned.
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 16 rows, two per invocation: one initialising pass, six
        ; accumulating passes, and a final pass that leaves the pointers alone.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; Row processing is finished, so rcx can take the output pointer.
        mov             rcx,        result_ptr

        ; Each accumulator carries one partial sum per 64-bit half; fold the
        ; high half onto the low half and store the low dword.

        ; results[0]: reference offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]: reference offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]: reference offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
213
;void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of a 16-wide, 8-row source block against three reference blocks
; starting at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are stored as
; 32-bit values in results[0], results[1] and results[2].
; Source rows are loaded with movdqa and must therefore be 16-byte aligned.
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 rows, two per invocation: one initialising pass, two
        ; accumulating passes, and a final pass that leaves the pointers alone.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; Row processing is finished, so rcx can take the output pointer.
        mov             rcx,        result_ptr

        ; Each accumulator carries one partial sum per 64-bit half; fold the
        ; high half onto the low half and store the low dword.

        ; results[0]: reference offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]: reference offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]: reference offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
251
;void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of an 8-wide, 16-row source block against three reference blocks
; starting at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are stored as
; 32-bit values in results[0], results[1] and results[2].
; Works in MMX registers and returns without issuing emms.
; NOTE(review): callers presumably clear the MMX state themselves - confirm.
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 16 rows, two per invocation: one initialising pass, six
        ; accumulating passes, and a final pass that leaves the pointers alone.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; Row processing is finished, so rcx can take the output pointer.
        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword so they go out in one store.
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5                 ; results[0], results[1]
        movd            [rcx+8],    mm7                 ; results[2]

    STACK_FRAME_DESTROY_X3
280
;void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of an 8x8 source block against three reference blocks starting at
; ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are stored as 32-bit
; values in results[0], results[1] and results[2].
; Works in MMX registers and returns without issuing emms.
; NOTE(review): callers presumably clear the MMX state themselves - confirm.
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 rows, two per invocation: one initialising pass, two
        ; accumulating passes, and a final pass that leaves the pointers alone.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; Row processing is finished, so rcx can take the output pointer.
        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword so they go out in one store.
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5                 ; results[0], results[1]
        movd            [rcx+8],    mm7                 ; results[2]

    STACK_FRAME_DESTROY_X3
305
;void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of a 4x4 source block against three reference blocks starting at
; ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are stored as 32-bit
; values in results[0], results[1] and results[2].
;
; Two 4-byte rows are byte-interleaved into one MMX register so that a single
; psadbw covers both rows.  Source and reference are interleaved the same
; way, so the byte pairing (and hence the sum) is unaffected.
; Works in MMX registers and returns without issuing emms.
; NOTE(review): callers presumably clear the MMX state themselves - confirm.
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ================= rows 0 and 1 =================
        movd            mm0,        DWORD PTR [src_ptr]                 ; src row 0
        movd            mm1,        DWORD PTR [ref_ptr]                 ; ref row 0, +0

        movd            mm2,        DWORD PTR [src_ptr+src_stride]      ; src row 1
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]      ; ref row 1, +0

        punpcklbw       mm0,        mm2                                 ; mm0 = src rows 0/1
        punpcklbw       mm1,        mm3                                 ; mm1 = ref rows 0/1, +0

        movd            mm4,        DWORD PTR [ref_ptr+1]               ; ref row 0, +1
        movd            mm5,        DWORD PTR [ref_ptr+2]               ; ref row 0, +2

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]    ; ref row 1, +1
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]    ; ref row 1, +2

        psadbw          mm1,        mm0                                 ; mm1 = SAD(+0), rows 0-1

        punpcklbw       mm4,        mm2                                 ; mm4 = ref rows 0/1, +1
        punpcklbw       mm5,        mm3                                 ; mm5 = ref rows 0/1, +2

        psadbw          mm4,        mm0                                 ; mm4 = SAD(+1), rows 0-1
        psadbw          mm5,        mm0                                 ; mm5 = SAD(+2), rows 0-1

        ; Step both pointers down to row 2.
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; ================= rows 2 and 3 =================
        movd            mm0,        DWORD PTR [src_ptr]                 ; src row 2
        movd            mm2,        DWORD PTR [ref_ptr]                 ; ref row 2, +0

        movd            mm3,        DWORD PTR [src_ptr+src_stride]      ; src row 3
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]      ; ref row 3, +0

        punpcklbw       mm0,        mm3                                 ; mm0 = src rows 2/3
        punpcklbw       mm2,        mm6                                 ; mm2 = ref rows 2/3, +0

        movd            mm3,        DWORD PTR [ref_ptr+1]               ; ref row 2, +1
        movd            mm7,        DWORD PTR [ref_ptr+2]               ; ref row 2, +2

        psadbw          mm2,        mm0                                 ; SAD(+0), rows 2-3

        paddw           mm1,        mm2                                 ; mm1 = SAD(+0), all rows

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]    ; ref row 3, +1
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]    ; ref row 3, +2

        punpcklbw       mm3,        mm2                                 ; mm3 = ref rows 2/3, +1
        punpcklbw       mm7,        mm6                                 ; mm7 = ref rows 2/3, +2

        psadbw          mm3,        mm0                                 ; SAD(+1), rows 2-3
        psadbw          mm7,        mm0                                 ; SAD(+2), rows 2-3

        paddw           mm3,        mm4                                 ; mm3 = SAD(+1), all rows
        paddw           mm7,        mm5                                 ; mm7 = SAD(+2), all rows

        ; ================= store =================
        ; Row processing is finished, so rcx can take the output pointer.
        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword so they go out in one store.
        punpckldq       mm1,        mm3

        movq            [rcx],      mm1                                 ; results[0], results[1]
        movd            [rcx+8],    mm7                                 ; results[2]

    STACK_FRAME_DESTROY_X3
379