1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; SAD_ROW_16X1X8 acc, src, ref
;   Private helper for PROCESS_16X2X8: SAD of one 16-pixel source row
;   against eight horizontally adjacent reference positions.
;
;   acc : xmm register that receives eight 16-bit sums; word i is the
;         SAD of src[0..15] against ref[i..i+15], i = 0..7
;   src : address expression of the source row (loaded with movdqa, so
;         it must be 16-byte aligned)
;   ref : address expression of the reference row; 24 bytes are read
;
;   Clobbers xmm0, xmm2, xmm3, xmm4 (and acc).
;-----------------------------------------------------------------------
%macro SAD_ROW_16X1X8 3
        movdqa          xmm0,       XMMWORD PTR [%2]        ; 16 source pixels
        movq            %1,         MMWORD PTR [%3]         ; ref bytes  0..7
        movq            xmm3,       MMWORD PTR [%3 + 8]     ; ref bytes  8..15
        movq            xmm2,       MMWORD PTR [%3 + 16]    ; ref bytes 16..23
        punpcklqdq      %1,         xmm3                    ; acc  = ref bytes 0..15
        punpcklqdq      xmm3,       xmm2                    ; xmm3 = ref bytes 8..23

        ; Left half of the row (source pixels 0..7).
        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0              ; src 0..3 vs ref i..i+3
        mpsadbw         xmm2,       xmm0,  0x5              ; src 4..7 vs ref i+4..i+7

        psrldq          xmm0,       8                       ; source pixels 8..15 -> low qword

        ; Right half of the row (source pixels 8..15).
        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0              ; src 8..11  vs ref i+8..i+11
        mpsadbw         xmm4,       xmm0,  0x5              ; src 12..15 vs ref i+12..i+15

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;   Accumulates the 8-position SAD of two consecutive 16-pixel rows
;   into xmm1 and advances both row pointers by two rows.
;
;   first : non-zero -> the first row's sums overwrite xmm1 (start of a
;                       block, no prior zeroing of xmm1 needed)
;           zero     -> both rows are added to the running sums in xmm1
;
;   In:    rsi = source row, rax = source stride
;          rdi = reference row, rdx = reference stride
;   Out:   xmm1 = eight 16-bit running sums
;          rsi += 2*rax, rdi += 2*rdx
;   Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
;
;   The pointer advance is issued after the second row instead of in the
;   middle of it; it has no data dependency on the SAD arithmetic, so
;   the results are unchanged.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        SAD_ROW_16X1X8  xmm1, rsi, rdi
%else
        SAD_ROW_16X1X8  xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_16X1X8  xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
85
;-----------------------------------------------------------------------
; SAD_ROW_8X1X8 acc, src, ref
;   Private helper for PROCESS_8X2X8: SAD of one 8-pixel source row
;   against eight horizontally adjacent reference positions.
;
;   acc : xmm register that receives eight 16-bit sums; word i is the
;         SAD of src[0..7] against ref[i..i+7], i = 0..7
;   src : address expression of the source row (8 bytes read)
;   ref : address expression of the reference row (16 bytes read)
;
;   Clobbers xmm0, xmm2, xmm3 (and acc).
;-----------------------------------------------------------------------
%macro SAD_ROW_8X1X8 3
        movq            xmm0,       MMWORD PTR [%2]         ; 8 source pixels
        movq            %1,         MMWORD PTR [%3]         ; ref bytes 0..7
        movq            xmm3,       MMWORD PTR [%3 + 8]     ; ref bytes 8..15
        punpcklqdq      %1,         xmm3                    ; acc = ref bytes 0..15

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0              ; src 0..3 vs ref i..i+3
        mpsadbw         xmm2,       xmm0,  0x5              ; src 4..7 vs ref i+4..i+7
        paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;   Accumulates the 8-position SAD of two consecutive 8-pixel rows into
;   xmm1 and advances both row pointers by two rows.
;
;   first : non-zero -> the first row's sums overwrite xmm1
;           zero     -> both rows are added to the running sums in xmm1
;
;   In:    rsi = source row, rax = source stride
;          rdi = reference row, rdx = reference stride
;   Out:   xmm1 = eight 16-bit running sums
;          rsi += 2*rax, rdi += 2*rdx
;   Clobb: xmm0, xmm2, xmm3, xmm5
;
;   The pointer advance is issued after the second row instead of in the
;   middle of it; it has no data dependency on the SAD arithmetic, so
;   the results are unchanged.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        SAD_ROW_8X1X8   xmm1, rsi, rdi
%else
        SAD_ROW_8X1X8   xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_8X1X8   xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
125
;-----------------------------------------------------------------------
; SAD_ROW_4X1X8 acc, src, ref
;   Private helper for PROCESS_4X2X8: SAD of one 4-pixel source row
;   against eight horizontally adjacent reference positions.
;
;   acc : xmm register that receives eight 16-bit sums; word i is the
;         SAD of src[0..3] against ref[i..i+3], i = 0..7
;   src : address expression of the source row (4 bytes read)
;   ref : address expression of the reference row (16 bytes read, of
;         which the SAD uses the first 11)
;
;   Clobbers xmm0, xmm3 (and acc).
;-----------------------------------------------------------------------
%macro SAD_ROW_4X1X8 3
        movd            xmm0,       [%2]                    ; 4 source pixels
        movq            %1,         MMWORD PTR [%3]         ; ref bytes 0..7
        movq            xmm3,       MMWORD PTR [%3 + 8]     ; ref bytes 8..15
        punpcklqdq      %1,         xmm3                    ; acc = ref bytes 0..15

        mpsadbw         %1,         xmm0,  0x0              ; src 0..3 vs ref i..i+3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;   Accumulates the 8-position SAD of two consecutive 4-pixel rows into
;   xmm1 and advances both row pointers by two rows.
;
;   first : non-zero -> the first row's sums overwrite xmm1
;           zero     -> both rows are added to the running sums in xmm1
;
;   In:    rsi = source row, rax = source stride
;          rdi = reference row, rdx = reference stride
;   Out:   xmm1 = eight 16-bit running sums
;          rsi += 2*rax, rdi += 2*rdx
;   Clobb: xmm0, xmm3, xmm5
;
;   The pointer advance is issued after the second row instead of in the
;   middle of it; it has no data dependency on the SAD arithmetic, so
;   the results are unchanged.
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        SAD_ROW_4X1X8   xmm1, rsi, rdi
%else
        SAD_ROW_4X1X8   xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_4X1X8   xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro
156
;-----------------------------------------------------------------------
; WRITE_AS_INTS
;   Zero-extends the eight 16-bit sums held in xmm1 to 32 bits and
;   stores them, in order, to the buffer passed as argument 4
;   (32 bytes written in total).
;
;   In:    xmm1   = eight packed 16-bit sums
;          arg(4) = destination pointer
;   Clobb: rdi, xmm0, xmm1, xmm2
;
;   The stores use movdqu: the destination is a plain pointer supplied
;   by the caller, so 16-byte alignment cannot be assumed here. Aligned
;   buffers keep working unchanged.
;-----------------------------------------------------------------------
%macro WRITE_AS_INTS 0
    mov             rdi,        arg(4)           ;Results
    pxor            xmm0,       xmm0             ; zero source for the widening unpacks
    movdqa          xmm2,       xmm1
    punpcklwd       xmm1,       xmm0             ; sums 0..3 -> four dwords
    punpckhwd       xmm2,       xmm0             ; sums 4..7 -> four dwords

    movdqu          [rdi],      xmm1
    movdqu          [rdi + 16], xmm2
%endmacro
167
;-----------------------------------------------------------------------
; void vp9_sad16x16x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned int        *sad_array);
;
; Sum of absolute differences of one 16x16 source block against the
; eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
;
; sad_array receives eight 32-bit results: WRITE_AS_INTS widens the
; 16-bit running sums before storing them, so 32 bytes are written.
; NOTE(review): the C declaration is not visible from this file -
; confirm the element type of sad_array is 32 bits wide.
;
; Each source row is loaded with movdqa, so src_ptr and src_stride must
; keep every row 16-byte aligned. 24 bytes are read from each reference
; row. The 16-bit sums cannot overflow: 16 * 16 * 255 = 65280.
;
; Clobbers rax, rdx, xmm0-xmm5; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x8_sse4) PRIVATE
sym(vp9_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; Row pointers and their (sign-extended) strides.
    mov             rdi,        arg(2)           ;ref_ptr
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    mov             rsi,        arg(0)           ;src_ptr
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; 16 rows, two per macro call; the first call seeds xmm1.
    PROCESS_16X2X8 1
%rep 7
    PROCESS_16X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
206
207
;-----------------------------------------------------------------------
; void vp9_sad16x8x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned int        *sad_array);
;
; Sum of absolute differences of one 16-wide, 8-row source block against
; the eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
;
; sad_array receives eight 32-bit results: WRITE_AS_INTS widens the
; 16-bit running sums before storing them, so 32 bytes are written.
; NOTE(review): the C declaration is not visible from this file -
; confirm the element type of sad_array is 32 bits wide.
;
; Each source row is loaded with movdqa, so src_ptr and src_stride must
; keep every row 16-byte aligned. 24 bytes are read from each reference
; row.
;
; Clobbers rax, rdx, xmm0-xmm5; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x8_sse4) PRIVATE
sym(vp9_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; Row pointers and their (sign-extended) strides.
    mov             rdi,        arg(2)           ;ref_ptr
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    mov             rsi,        arg(0)           ;src_ptr
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; 8 rows, two per macro call; the first call seeds xmm1.
    PROCESS_16X2X8 1
%rep 3
    PROCESS_16X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
243
244
;-----------------------------------------------------------------------
; void vp9_sad8x8x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned int        *sad_array);
;
; Sum of absolute differences of one 8x8 source block against the eight
; reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
;
; sad_array receives eight 32-bit results: WRITE_AS_INTS widens the
; 16-bit running sums before storing them, so 32 bytes are written.
; NOTE(review): the C declaration is not visible from this file -
; confirm the element type of sad_array is 32 bits wide.
;
; 8 bytes are read from each source row and 16 bytes from each
; reference row.
;
; Clobbers rax, rdx, xmm0-xmm3, xmm5; rsi and rdi are saved and
; restored.
;-----------------------------------------------------------------------
global sym(vp9_sad8x8x8_sse4) PRIVATE
sym(vp9_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; Row pointers and their (sign-extended) strides.
    mov             rdi,        arg(2)           ;ref_ptr
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    mov             rsi,        arg(0)           ;src_ptr
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; 8 rows, two per macro call; the first call seeds xmm1.
    PROCESS_8X2X8 1
%rep 3
    PROCESS_8X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
280
281
;-----------------------------------------------------------------------
; void vp9_sad8x16x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned int        *sad_array);
;
; Sum of absolute differences of one 8-wide, 16-row source block against
; the eight reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
;
; sad_array receives eight 32-bit results: WRITE_AS_INTS widens the
; 16-bit running sums before storing them, so 32 bytes are written.
; NOTE(review): the C declaration is not visible from this file -
; confirm the element type of sad_array is 32 bits wide.
;
; 8 bytes are read from each source row and 16 bytes from each
; reference row.
;
; Clobbers rax, rdx, xmm0-xmm3, xmm5; rsi and rdi are saved and
; restored.
;-----------------------------------------------------------------------
global sym(vp9_sad8x16x8_sse4) PRIVATE
sym(vp9_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; Row pointers and their (sign-extended) strides.
    mov             rdi,        arg(2)           ;ref_ptr
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    mov             rsi,        arg(0)           ;src_ptr
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; 16 rows, two per macro call; the first call seeds xmm1.
    PROCESS_8X2X8 1
%rep 7
    PROCESS_8X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
321
322
;-----------------------------------------------------------------------
; void vp9_sad4x4x8_sse4(
;     const unsigned char *src_ptr,
;     int                  src_stride,
;     const unsigned char *ref_ptr,
;     int                  ref_stride,
;     unsigned int        *sad_array);
;
; Sum of absolute differences of one 4x4 source block against the eight
; reference blocks starting at ref_ptr + 0 .. ref_ptr + 7.
;
; sad_array receives eight 32-bit results: WRITE_AS_INTS widens the
; 16-bit running sums before storing them, so 32 bytes are written.
; NOTE(review): the C declaration is not visible from this file -
; confirm the element type of sad_array is 32 bits wide.
;
; 4 bytes are read from each source row and 16 bytes from each
; reference row.
;
; Clobbers rax, rdx, xmm0-xmm3, xmm5; rsi and rdi are saved and
; restored.
;-----------------------------------------------------------------------
global sym(vp9_sad4x4x8_sse4) PRIVATE
sym(vp9_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; Row pointers and their (sign-extended) strides.
    mov             rdi,        arg(2)           ;ref_ptr
    movsxd          rdx,        dword ptr arg(3) ;ref_stride
    mov             rsi,        arg(0)           ;src_ptr
    movsxd          rax,        dword ptr arg(1) ;src_stride

    ; 4 rows, two per macro call; the first call seeds xmm1.
    PROCESS_4X2X8 1                              ; rows 0-1
    PROCESS_4X2X8 0                              ; rows 2-3

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
356
357
358
359
360