1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x16_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute differences between a 16-byte-wide, 16-row source
; block and the matching reference block. Two rows are handled per
; loop iteration.
;
; Register roles:
;   rsi  = current source row         rax = src_stride
;   rdi  = current reference row      rdx = ref_stride
;   rcx  = src_ptr + 16*src_stride    (loop end marker)
;   xmm6 = running sums, one partial sum per 64-bit half
; The total is returned in rax.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        movsxd          rax,        dword ptr arg(1)    ; src_stride
        mov             rdi,        arg(2)              ; ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        pxor            xmm6,       xmm6                ; sums = 0

        ; rcx = rsi + 16 * src_stride, built as two steps of 8 * stride
        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]

.sad16x16_row_pair:

        ; ---- first row of the pair ----
        ; The two 8-byte halves of each row are interleaved into one
        ; 16-byte register. Source and reference are interleaved the
        ; same way, so the byte pairing seen by psadbw is unchanged.
        movq            xmm0,       QWORD PTR [rsi]
        movq            xmm2,       QWORD PTR [rsi+8]
        punpcklbw       xmm0,       xmm2

        movq            xmm1,       QWORD PTR [rdi]
        movq            xmm3,       QWORD PTR [rdi+8]
        punpcklbw       xmm1,       xmm3

        psadbw          xmm0,       xmm1
        paddw           xmm6,       xmm0

        ; ---- second row of the pair ----
        movq            xmm4,       QWORD PTR [rsi+rax]
        movq            xmm2,       QWORD PTR [rsi+rax+8]
        punpcklbw       xmm4,       xmm2

        movq            xmm5,       QWORD PTR [rdi+rdx]
        movq            xmm3,       QWORD PTR [rdi+rdx+8]
        punpcklbw       xmm5,       xmm3

        psadbw          xmm4,       xmm5
        paddw           xmm6,       xmm4

        ; step both pointers down two rows (lea leaves flags alone)
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad16x16_row_pair

        ; fold the high-half partial sum into the low half
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6

        movq            rax,        xmm0                ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
86
;-----------------------------------------------------------------------
; unsigned int vp8_sad8x16_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute differences between an 8-byte-wide, 16-row source
; block and the matching reference block, two rows per iteration.
;
; Early exit: before each row pair the running sum is compared with
; max_sad (unsigned compare). If it is already above max_sad the
; function returns that partial sum immediately.
;
; Register roles:
;   rsi = current source row          rbx = src_stride
;   rdi = current reference row       rdx = ref_stride
;   rcx = src_ptr + 16*src_stride     (loop end marker)
;   mm7 = running sum                 rax = scratch / return value
;
; Uses MMX registers; no emms is executed in this function.
; NOTE(review): callers presumably clear MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        movsxd          rbx,        dword ptr arg(1)    ; src_stride
        mov             rdi,        arg(2)              ; ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        pxor            mm7,        mm7                 ; sum = 0

        ; rcx = rsi + 16 * src_stride, built as two steps of 8 * stride
        lea             rcx,        [rsi+rbx*8]
        lea             rcx,        [rcx+rbx*8]

.sad8x16_row_pair:

        ; give up once the partial sum exceeds max_sad; rax already
        ; holds the value that will be returned
        movq            rax,        mm7
        cmp             eax,        arg(4)
        ja              .sad8x16_return

        ; first row of the pair
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1
        paddw           mm7,        mm0

        ; second row of the pair
        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]
        psadbw          mm2,        mm3
        paddw           mm7,        mm2

        ; advance two rows
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad8x16_row_pair

        movq            rax,        mm7                 ; full-block sum

.sad8x16_return:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
149
150
;-----------------------------------------------------------------------
; unsigned int vp8_sad8x8_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute differences between an 8-byte-wide, 8-row source
; block and the matching reference block, one row per iteration.
;
; The code reads a fifth argument, arg(4), as an early-exit threshold:
; before each row the running sum is compared with it (unsigned
; compare) and, if already above it, that partial sum is returned.
;
; Register roles:
;   rsi = current source row          rbx = src_stride
;   rdi = current reference row       rdx = ref_stride
;   rcx = src_ptr + 8*src_stride      (loop end marker)
;   mm7 = running sum                 rax = scratch / return value
;
; Uses MMX registers; no emms is executed in this function.
; NOTE(review): callers presumably clear MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        movsxd          rbx,        dword ptr arg(1)    ; src_stride
        mov             rdi,        arg(2)              ; ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        pxor            mm7,        mm7                 ; sum = 0
        lea             rcx,        [rsi+rbx*8]         ; end = src + 8 rows

.sad8x8_row:

        ; stop as soon as the partial sum is above the threshold
        movq            rax,        mm7
        cmp             eax,        arg(4)
        ja              .sad8x8_return

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1
        paddw           mm7,        mm0

        ; next row; the flags written here are replaced by the cmp below
        add             rsi,        rbx
        add             rdi,        rdx

        cmp             rsi,        rcx
        jne             .sad8x8_row

        movq            rax,        mm7                 ; full-block sum

.sad8x8_return:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
203
;-----------------------------------------------------------------------
; unsigned int vp8_sad4x4_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute differences between a 4-byte-wide, 4-row source block
; and the matching reference block. Straight-line code: rows are taken
; in pairs, each pair is interleaved into one 8-byte MMX register so a
; single psadbw covers both rows.
;
; Register roles:
;   rsi = source row pointer          rax = src_stride, then result
;   rdi = reference row pointer       rdx = ref_stride
;
; Uses MMX registers; no emms is executed in this function.
; NOTE(review): callers presumably clear MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        movsxd          rax,        dword ptr arg(1)    ; src_stride
        mov             rdi,        arg(2)              ; ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [rsi]
        movd            mm2,        DWORD PTR [rsi+rax]
        punpcklbw       mm0,        mm2

        movd            mm1,        DWORD PTR [rdi]
        movd            mm3,        DWORD PTR [rdi+rdx]
        punpcklbw       mm1,        mm3

        psadbw          mm0,        mm1

        ; move both pointers down to row 2
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        ; ---- rows 2 and 3 ----
        movd            mm4,        DWORD PTR [rsi]
        movd            mm6,        DWORD PTR [rsi+rax]
        punpcklbw       mm4,        mm6

        movd            mm5,        DWORD PTR [rdi]
        movd            mm7,        DWORD PTR [rdi+rdx]
        punpcklbw       mm5,        mm7

        psadbw          mm4,        mm5

        ; combine the two partial sums and return
        paddw           mm0,        mm4
        movq            rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
257
258
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x8_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute differences between a 16-byte-wide, 8-row source
; block and the matching reference block. Each row is read as two
; 8-byte MMX loads; two rows are handled per loop iteration.
;
; The code reads a fifth argument, arg(4), as an early-exit threshold:
; before each row pair the running sum is compared with it (unsigned
; compare) and, if already above it, that partial sum is returned.
;
; Register roles:
;   rsi = current source row          rbx = src_stride
;   rdi = current reference row       rdx = ref_stride
;   rcx = src_ptr + 8*src_stride      (loop end marker)
;   mm7 = running sum                 rax = scratch / return value
;
; Uses MMX registers; no emms is executed in this function.
; NOTE(review): callers presumably clear MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; src_ptr
        movsxd          rbx,        dword ptr arg(1)    ; src_stride
        mov             rdi,        arg(2)              ; ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; ref_stride

        pxor            mm7,        mm7                 ; sum = 0
        lea             rcx,        [rsi+rbx*8]         ; end = src + 8 rows

.sad16x8_row_pair:

        ; stop as soon as the partial sum is above the threshold
        movq            rax,        mm7
        cmp             eax,        arg(4)
        ja              .sad16x8_return

        ; ---- first row: left and right 8-byte halves ----
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1

        movq            mm2,        QWORD PTR [rsi+8]
        movq            mm3,        QWORD PTR [rdi+8]
        psadbw          mm2,        mm3

        paddw           mm0,        mm2                 ; row total

        ; ---- second row: left and right 8-byte halves ----
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]
        psadbw          mm4,        mm5

        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]
        psadbw          mm1,        mm3

        paddw           mm4,        mm1                 ; row total

        ; accumulate both rows
        paddw           mm7,        mm0
        paddw           mm7,        mm4

        ; advance two rows
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad16x8_row_pair

        movq            rax,        mm7                 ; full-block sum

.sad16x8_return:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
331
;-----------------------------------------------------------------------
; void vp8_copy32xn_sse2(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *dst_ptr,
;     int            dst_stride,
;     int            height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr.
;
; Source rows are read with unaligned loads (movdqu). Destination rows
; are written with aligned stores (movdqa), so dst_ptr and dst_stride
; must keep every destination row 16-byte aligned.
;
; Rows are copied four at a time while at least four remain, then one
; at a time. The row count is checked before the four-row loop is
; entered, so a height of 1..3 is handled by the single-row loop only
; and a height <= 0 copies nothing.
;
; Register roles:
;   rsi = current source row          rax = src_stride
;   rdi = current destination row     rdx = dst_stride
;   rcx = rows still to copy
;-----------------------------------------------------------------------
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

        ; Do not enter the four-row loop unless four rows really exist;
        ; otherwise it would read and write rows outside the block.
        cmp             rcx,        4
        jl              .block_copy_sse2_remainder

.block_copy_sse2_loopx4:
        ; load four source rows (2 x 16 bytes each) into xmm0..xmm7
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        ; store the four rows to the destination
        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,        [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,        [rdi+rdx*2]

        sub             rcx,        4
        cmp             rcx,        4
        jge             .block_copy_sse2_loopx4

.block_copy_sse2_remainder:
        ; rcx < 4 here; zero or negative means there is nothing left
        test            rcx,        rcx
        jle             .copy_is_done

.block_copy_sse2_loop:
        ; copy the remaining rows one at a time
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,        [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,        [rdi+rdx]

        sub             rcx,        1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
411