1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; SAD16_LOAD_ROW acc, src_addr, ref_addr
;   Fetches one 16-pixel row.
;     xmm0 = 16 bytes at [src_addr]  (movdqa: address must be 16-byte aligned)
;     acc  = bytes  0..15 at [ref_addr]
;     xmm3 = bytes  8..23 at [ref_addr]
;   Clobbers xmm2 (used to carry bytes 16..23).
;-----------------------------------------------------------------------
%macro SAD16_LOAD_ROW 3
        movdqa          xmm0,       XMMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        movq            xmm2,       MMWORD PTR [%3+16]
        punpcklqdq      %1,         xmm3
        punpcklqdq      xmm3,       xmm2
%endmacro

;-----------------------------------------------------------------------
; SAD16_SUM_ROW acc
;   Consumes the registers prepared by SAD16_LOAD_ROW and leaves in acc
;   eight 16-bit sums: word i is the SAD of the 16 source bytes against
;   the 16 reference bytes starting at byte offset i (i = 0..7).
;
;   mpsadbw imm8: bits 1:0 pick the 4-byte group of the second operand,
;   bit 2 picks the start offset (0 or 4) inside the first operand.
;     0x0 -> source bytes 0..3 against window starting at +0
;     0x5 -> source bytes 4..7 against window starting at +4
;   Source columns 0..7 are matched against acc, columns 8..15 (after the
;   8-byte shift of xmm0) against xmm3.
;   Clobbers xmm0, xmm2, xmm3, xmm4.
;-----------------------------------------------------------------------
%macro SAD16_SUM_ROW 1
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0      ; columns  0..3
        mpsadbw         xmm2,       xmm0,  0x5      ; columns  4..7

        psrldq          xmm0,       8               ; source bytes 8..15 -> 0..7

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0      ; columns  8..11
        mpsadbw         xmm4,       xmm0,  0x5      ; columns 12..15

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;   Processes two consecutive rows of a 16-wide block and accumulates the
;   eight per-offset SAD words in xmm1.
;
;   first != 0 : xmm1 is initialised from the first row (no prior value
;                is read).
;   first == 0 : both rows are added to the existing contents of xmm1.
;
;   In:   rsi = source row pointer,    rax = source stride
;         rdi = reference row pointer, rdx = reference stride
;   Out:  xmm1 updated; rsi += 2*rax; rdi += 2*rdx
;   Clobbers: xmm0, xmm2, xmm3, xmm4, xmm5
;   Each row reads 16 bytes at the source pointer and 24 bytes at the
;   reference pointer.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        ; first row of the block: result goes straight into the accumulator
        SAD16_LOAD_ROW  xmm1,       rsi,        rdi
        SAD16_SUM_ROW   xmm1
%else
        SAD16_LOAD_ROW  xmm5,       rsi,        rdi
        SAD16_SUM_ROW   xmm5

        paddw           xmm1,       xmm5
%endif
        ; second row of the pair
        SAD16_LOAD_ROW  xmm5,       rsi+rax,    rdi+rdx

        ; step both pointers down two rows; the loads above are already issued
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD16_SUM_ROW   xmm5

        paddw           xmm1,       xmm5
%endmacro
85
;-----------------------------------------------------------------------
; SAD8_LOAD_ROW acc, src_addr, ref_addr
;   Fetches one 8-pixel row.
;     xmm0 = 8 bytes at [src_addr] (upper half zeroed by movq)
;     acc  = bytes 0..15 at [ref_addr]
;   Clobbers xmm3 (carries reference bytes 8..15).
;-----------------------------------------------------------------------
%macro SAD8_LOAD_ROW 3
        movq            xmm0,       MMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3
%endmacro

;-----------------------------------------------------------------------
; SAD8_SUM_ROW acc
;   Leaves in acc eight 16-bit sums: word i is the SAD of the 8 source
;   bytes in xmm0 against the 8 reference bytes starting at offset i.
;     mpsadbw ..., 0x0 -> source bytes 0..3, window starting at +0
;     mpsadbw ..., 0x5 -> source bytes 4..7, window starting at +4
;   Clobbers xmm2.
;-----------------------------------------------------------------------
%macro SAD8_SUM_ROW 1
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0      ; columns 0..3
        mpsadbw         xmm2,       xmm0,  0x5      ; columns 4..7
        paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;   Processes two consecutive rows of an 8-wide block and accumulates the
;   eight per-offset SAD words in xmm1.
;
;   first != 0 : xmm1 is initialised from the first row.
;   first == 0 : both rows are added to the existing contents of xmm1.
;
;   In:   rsi = source row pointer,    rax = source stride
;         rdi = reference row pointer, rdx = reference stride
;   Out:  xmm1 updated; rsi += 2*rax; rdi += 2*rdx
;   Clobbers: xmm0, xmm2, xmm3, xmm5
;   Each row reads 8 bytes at the source pointer and 16 bytes at the
;   reference pointer.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        ; first row of the block: result goes straight into the accumulator
        SAD8_LOAD_ROW   xmm1,       rsi,        rdi

        SAD8_SUM_ROW    xmm1
%else
        SAD8_LOAD_ROW   xmm5,       rsi,        rdi

        SAD8_SUM_ROW    xmm5

        paddw           xmm1,       xmm5
%endif
        ; second row of the pair
        SAD8_LOAD_ROW   xmm5,       rsi+rax,    rdi+rdx

        ; step both pointers down two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD8_SUM_ROW    xmm5

        paddw           xmm1,       xmm5
%endmacro
125
;-----------------------------------------------------------------------
; SAD4_LOAD_ROW acc, src_addr, ref_addr
;   Fetches one 4-pixel row.
;     xmm0 = 4 bytes at [src_addr] (remaining bytes zeroed by movd)
;     acc  = bytes 0..15 at [ref_addr]
;   Clobbers xmm3 (carries reference bytes 8..15).
;-----------------------------------------------------------------------
%macro SAD4_LOAD_ROW 3
        movd            xmm0,       [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;   Processes two consecutive rows of a 4-wide block and accumulates the
;   eight per-offset SAD words in xmm1.  A single mpsadbw with imm8 = 0
;   (source bytes 0..3, window starting at +0) covers the whole row:
;   word i is the SAD of the 4 source bytes against reference bytes
;   i..i+3.
;
;   first != 0 : xmm1 is initialised from the first row.
;   first == 0 : both rows are added to the existing contents of xmm1.
;
;   In:   rsi = source row pointer,    rax = source stride
;         rdi = reference row pointer, rdx = reference stride
;   Out:  xmm1 updated; rsi += 2*rax; rdi += 2*rdx
;   Clobbers: xmm0, xmm3, xmm5
;   Each row reads 4 bytes at the source pointer and 16 bytes at the
;   reference pointer.
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        ; first row of the block: result goes straight into the accumulator
        SAD4_LOAD_ROW   xmm1,       rsi,        rdi

        mpsadbw         xmm1,       xmm0,  0x0
%else
        SAD4_LOAD_ROW   xmm5,       rsi,        rdi

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endif
        ; second row of the pair
        SAD4_LOAD_ROW   xmm5,       rsi+rax,    rdi+rdx

        ; step both pointers down two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endmacro
156
157
;-----------------------------------------------------------------------
;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Sum of absolute differences of one 16x16 source block against eight
; reference blocks at ref_ptr+0 .. ref_ptr+7 (same rows, shifted one byte
; at a time).  sad_array[i] receives the SAD for byte offset i.
;
; Requirements visible in the code:
;   - every source row is read with movdqa, so src_ptr and src_stride
;     must keep each row 16-byte aligned;
;   - the result is stored with movdqa, so sad_array must be 16-byte
;     aligned;
;   - 24 bytes are read from each reference row.
; The sums are kept in 16-bit lanes; the worst case 16*16*255 = 65280
; still fits.
;
; Registers: rsi/rax = source pointer/stride, rdi/rdx = reference
; pointer/stride, xmm1 = accumulator, xmm0 and xmm2-xmm5 = scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x8_sse4) PRIVATE
sym(vp8_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; fetch the four inputs in argument order
        mov             rsi,        arg(0)           ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1) ; rax = src_stride (sign-extended)
        mov             rdi,        arg(2)           ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3) ; rdx = ref_stride (sign-extended)

        ; 8 row pairs = 16 rows; only the first pair initialises xmm1
        PROCESS_16X2X8 1
%rep 7
        PROCESS_16X2X8 0
%endrep

        ; write the eight 16-bit results
        mov             rdi,        arg(4)           ; rdi = sad_array
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
197
198
;-----------------------------------------------------------------------
;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sum of absolute differences of one 16-wide, 8-row source block against
; eight reference blocks at ref_ptr+0 .. ref_ptr+7.  sad_array[i]
; receives the SAD for byte offset i.
;
; Requirements visible in the code:
;   - every source row is read with movdqa, so src_ptr and src_stride
;     must keep each row 16-byte aligned;
;   - the result is stored with movdqa, so sad_array must be 16-byte
;     aligned;
;   - 24 bytes are read from each reference row.
;
; Registers: rsi/rax = source pointer/stride, rdi/rdx = reference
; pointer/stride, xmm1 = accumulator, xmm0 and xmm2-xmm5 = scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x8_sse4) PRIVATE
sym(vp8_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; fetch the four inputs in argument order
        mov             rsi,        arg(0)           ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1) ; rax = src_stride (sign-extended)
        mov             rdi,        arg(2)           ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3) ; rdx = ref_stride (sign-extended)

        ; 4 row pairs = 8 rows; only the first pair initialises xmm1
        PROCESS_16X2X8 1
%rep 3
        PROCESS_16X2X8 0
%endrep

        ; write the eight 16-bit results
        mov             rdi,        arg(4)           ; rdi = sad_array
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
235
236
;-----------------------------------------------------------------------
;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sum of absolute differences of one 8x8 source block against eight
; reference blocks at ref_ptr+0 .. ref_ptr+7.  sad_array[i] receives the
; SAD for byte offset i.
;
; Requirements visible in the code:
;   - the result is stored with movdqa, so sad_array must be 16-byte
;     aligned (source and reference rows are read with movq and have no
;     alignment requirement);
;   - 16 bytes are read from each reference row.
;
; Registers: rsi/rax = source pointer/stride, rdi/rdx = reference
; pointer/stride, xmm1 = accumulator, xmm0/xmm2/xmm3/xmm5 = scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8x8_sse4) PRIVATE
sym(vp8_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; fetch the four inputs in argument order
        mov             rsi,        arg(0)           ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1) ; rax = src_stride (sign-extended)
        mov             rdi,        arg(2)           ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3) ; rdx = ref_stride (sign-extended)

        ; 4 row pairs = 8 rows; only the first pair initialises xmm1
        PROCESS_8X2X8 1
%rep 3
        PROCESS_8X2X8 0
%endrep

        ; write the eight 16-bit results
        mov             rdi,        arg(4)           ; rdi = sad_array
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
273
274
;-----------------------------------------------------------------------
;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sum of absolute differences of one 8-wide, 16-row source block against
; eight reference blocks at ref_ptr+0 .. ref_ptr+7.  sad_array[i]
; receives the SAD for byte offset i.
;
; Requirements visible in the code:
;   - the result is stored with movdqa, so sad_array must be 16-byte
;     aligned (source and reference rows are read with movq and have no
;     alignment requirement);
;   - 16 bytes are read from each reference row.
;
; Registers: rsi/rax = source pointer/stride, rdi/rdx = reference
; pointer/stride, xmm1 = accumulator, xmm0/xmm2/xmm3/xmm5 = scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16x8_sse4) PRIVATE
sym(vp8_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; fetch the four inputs in argument order
        mov             rsi,        arg(0)           ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1) ; rax = src_stride (sign-extended)
        mov             rdi,        arg(2)           ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3) ; rdx = ref_stride (sign-extended)

        ; 8 row pairs = 16 rows; only the first pair initialises xmm1
        PROCESS_8X2X8 1
%rep 7
        PROCESS_8X2X8 0
%endrep

        ; write the eight 16-bit results
        mov             rdi,        arg(4)           ; rdi = sad_array
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
314
315
;-----------------------------------------------------------------------
;void vp8_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sum of absolute differences of one 4x4 source block against eight
; reference blocks at ref_ptr+0 .. ref_ptr+7.  sad_array[i] receives the
; SAD for byte offset i.
;
; Requirements visible in the code:
;   - the result is stored with movdqa, so sad_array must be 16-byte
;     aligned (source rows are read with movd, reference rows with movq;
;     neither has an alignment requirement);
;   - 16 bytes are read from each reference row.
;
; Registers: rsi/rax = source pointer/stride, rdi/rdx = reference
; pointer/stride, xmm1 = accumulator, xmm0/xmm3/xmm5 = scratch.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4x8_sse4) PRIVATE
sym(vp8_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; fetch the four inputs in argument order
        mov             rsi,        arg(0)           ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1) ; rax = src_stride (sign-extended)
        mov             rdi,        arg(2)           ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3) ; rdx = ref_stride (sign-extended)

        ; 2 row pairs = 4 rows; only the first pair initialises xmm1
        PROCESS_4X2X8 1
%rep 1
        PROCESS_4X2X8 0
%endrep

        ; write the eight 16-bit results
        mov             rdi,        arg(4)           ; rdi = sad_array
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
350
351
352
353
354