1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
;
; Sum of squared differences between the 16 signed 16-bit values at
; coeff_ptr and the 16 at dcoef_ptr (32 bytes are read from each).
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr. Both are accessed with
;        aligned 128-bit operations, so both must be 16-byte aligned.
; Out:   rax = sum, zero-extended from 32 bits.
; Clobb: xmm0-xmm2 only. xmm6 and above are callee-saved under the
;        Microsoft x64 ABI and nothing here saves XMM state, so they are
;        deliberately left untouched.
global sym(vp8_block_error_xmm)
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        movdqa      xmm0,       [rsi]           ; coeff[0..7]
        movdqa      xmm1,       [rsi+16]        ; coeff[8..15]

        psubw       xmm0,       [rdi]           ; d = coeff - dcoef
        psubw       xmm1,       [rdi+16]

        ; pmaddwd x,x: each dword = d[2i]*d[2i] + d[2i+1]*d[2i+1]
        pmaddwd     xmm0,       xmm0
        pmaddwd     xmm1,       xmm1

        paddd       xmm0,       xmm1            ; four partial sums s0..s3

        ; horizontal add of the four dwords
        pxor        xmm2,       xmm2
        movdqa      xmm1,       xmm0

        punpckldq   xmm0,       xmm2            ; [s0, 0, s1, 0]
        punpckhdq   xmm1,       xmm2            ; [s2, 0, s3, 0]

        paddd       xmm0,       xmm1            ; [s0+s2, 0, s1+s3, 0]
        movdqa      xmm1,       xmm0

        psrldq      xmm0,       8               ; [s1+s3, 0, 0, 0]
        paddd       xmm0,       xmm1            ; low dword = s0+s1+s2+s3

        movq        rax,        xmm0            ; dword 1 is zero, so rax = sum

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
61
;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
;
; Sum of squared differences between the 16 signed 16-bit values at
; coeff_ptr and the 16 at dcoef_ptr (32 bytes are read from each).
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr.
; Out:   low 32 bits of rax = sum. The 64-bit move also copies the upper
;        dword of mm0, which holds a partial sum, into the upper half.
; Clobb: mm0-mm3. No emms is executed here, so the MMX state is left
;        as-is for the caller to clear.
;
; No per-element mask is applied in this routine: every one of the 16
; differences contributes to the sum.
global sym(vp8_block_error_mmx)
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        movq        mm0,        [rsi]           ; coeff[0..3]
        movq        mm1,        [rsi+8]         ; coeff[4..7]
        movq        mm2,        [rsi+16]        ; coeff[8..11]
        movq        mm3,        [rsi+24]        ; coeff[12..15]

        psubw       mm0,        [rdi]           ; d = coeff - dcoef
        psubw       mm1,        [rdi+8]
        psubw       mm2,        [rdi+16]
        psubw       mm3,        [rdi+24]

        ; pmaddwd x,x: each dword = d[2i]*d[2i] + d[2i+1]*d[2i+1]
        pmaddwd     mm0,        mm0
        pmaddwd     mm1,        mm1
        pmaddwd     mm2,        mm2
        pmaddwd     mm3,        mm3

        paddd       mm0,        mm1
        paddd       mm2,        mm3
        paddd       mm0,        mm2             ; [lo, hi] partial sums

        movq        mm1,        mm0
        psrlq       mm1,        32              ; [hi, 0]
        paddd       mm0,        mm1             ; [lo+hi, hi]

        movq        rax,        mm0

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
126
127
;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Accumulates squared differences over 16 consecutive groups of 16 signed
; 16-bit values (32 bytes per group from each pointer).
;
; dc is turned into a word mask applied to elements 0..3 of every group:
; a mask word is 0xFFFF where the matching 16-bit half of dc is zero and
; 0 otherwise; mask words 2 and 3 are always 0xFFFF. With dc == 0 nothing
; is masked; when the low 16 bits of dc are non-zero, element 0 of every
; group is excluded from the sum.
;
; Out:   low 32 bits of rax = sum (the upper half receives the upper
;        dword of mm0, a partial sum).
; Clobb: rcx, mm0-mm7. No emms is executed here.
global sym(vp8_mbblock_error_mmx_impl)
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr
        movd        mm1,        dword ptr arg(2) ;dc

        pxor        mm7,        mm7             ; constant zero
        pxor        mm2,        mm2             ; running sum, two dwords

        por         mm1,        mm2
        pcmpeqw     mm1,        mm7             ; mm1 = mask for elements 0..3

        mov         rcx,        16              ; groups remaining

.next_group:
        ; elements 0..3 (masked)
        movq        mm3,        [rsi]
        movq        mm4,        [rdi]
        psubw       mm3,        mm4
        pand        mm3,        mm1
        pmaddwd     mm3,        mm3

        ; elements 4..7
        movq        mm5,        [rsi+8]
        movq        mm6,        [rdi+8]
        psubw       mm5,        mm6
        pmaddwd     mm5,        mm5

        paddd       mm2,        mm3
        paddd       mm2,        mm5

        ; elements 8..11
        movq        mm3,        [rsi+16]
        movq        mm4,        [rdi+16]
        psubw       mm3,        mm4
        pmaddwd     mm3,        mm3

        ; elements 12..15
        movq        mm5,        [rsi+24]
        movq        mm6,        [rdi+24]
        psubw       mm5,        mm6
        pmaddwd     mm5,        mm5

        paddd       mm2,        mm3
        paddd       mm2,        mm5

        add         rsi,        32
        add         rdi,        32

        dec         rcx
        jnz         .next_group

        ; fold the two dword partial sums together
        movq        mm0,        mm2
        psrlq       mm2,        32
        paddd       mm0,        mm2

        movq        rax,        mm0

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
203
204
;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Accumulates squared differences over 16 consecutive groups of 16 signed
; 16-bit values (32 bytes per group from each pointer).
;
; dc is turned into a word mask applied to elements 0..7 of every group:
; a mask word is 0xFFFF where the matching 16-bit half of dc is zero and
; 0 otherwise; mask words 2..7 are always 0xFFFF because movd clears the
; rest of the register. With dc == 0 nothing is masked; when the low 16
; bits of dc are non-zero, element 0 of every group is excluded.
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr (both must be 16-byte
;        aligned: they are accessed with aligned 128-bit operations),
;        arg(2) = dc.
; Out:   rax = sum, zero-extended from 32 bits.
; Clobb: rcx, xmm0-xmm4 only. xmm6 and above are callee-saved under the
;        Microsoft x64 ABI and nothing here saves XMM state, so they are
;        deliberately left untouched.
global sym(vp8_mbblock_error_xmm_impl)
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        pxor        xmm2,       xmm2            ; running sum, four dwords
        pxor        xmm4,       xmm4            ; constant zero

        movd        xmm1,       dword ptr arg(2) ;dc
        pcmpeqw     xmm1,       xmm4            ; xmm1 = mask for elements 0..7

        mov         rcx,        16              ; groups remaining

.next_group:
        movdqa      xmm0,       [rsi]           ; elements 0..7
        movdqa      xmm3,       [rsi+16]        ; elements 8..15

        psubw       xmm0,       [rdi]
        psubw       xmm3,       [rdi+16]

        pand        xmm0,       xmm1            ; apply dc mask

        ; pmaddwd x,x: each dword = d[2i]*d[2i] + d[2i+1]*d[2i+1]
        pmaddwd     xmm0,       xmm0
        pmaddwd     xmm3,       xmm3

        paddd       xmm2,       xmm0
        paddd       xmm2,       xmm3

        add         rsi,        32
        add         rdi,        32

        dec         rcx
        jnz         .next_group

        ; horizontal add of the four dword partial sums s0..s3
        movdqa      xmm0,       xmm2
        punpckldq   xmm0,       xmm4            ; [s0, 0, s1, 0]
        punpckhdq   xmm2,       xmm4            ; [s2, 0, s3, 0]
        paddd       xmm0,       xmm2            ; [s0+s2, 0, s1+s3, 0]

        movdqa      xmm1,       xmm0
        psrldq      xmm0,       8               ; [s1+s3, 0, 0, 0]
        paddd       xmm0,       xmm1            ; low dword = total

        movq        rax,        xmm0            ; dword 1 is zero, so rax = sum

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
271
272
;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences between 128 signed 16-bit values at s_ptr
; and 128 at d_ptr, processed as 16 iterations of 8 values (16 bytes).
;
; Out:   low 32 bits of rax = sum (the upper half receives the upper
;        dword of mm0, a partial sum).
; Clobb: rcx, mm0-mm4, mm7. No emms is executed here.
global sym(vp8_mbuverror_mmx_impl)
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;s_ptr
        mov         rdi,        arg(1) ;d_ptr

        pxor        mm7,        mm7             ; running sum, two dwords
        mov         rcx,        16              ; iterations remaining

.next_chunk:
        movq        mm1,        [rsi]           ; s[0..3]
        movq        mm3,        [rsi+8]         ; s[4..7]

        movq        mm2,        [rdi]           ; d[0..3]
        movq        mm4,        [rdi+8]         ; d[4..7]

        psubw       mm1,        mm2
        psubw       mm3,        mm4

        ; pmaddwd x,x: each dword = e[2i]*e[2i] + e[2i+1]*e[2i+1]
        pmaddwd     mm1,        mm1
        pmaddwd     mm3,        mm3

        paddd       mm7,        mm1
        paddd       mm7,        mm3

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         .next_chunk

        ; fold the two dword partial sums together
        movq        mm0,        mm7
        psrlq       mm7,        32
        paddd       mm0,        mm7

        movq        rax,        mm0

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
328
329
;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences between 128 signed 16-bit values at s_ptr
; and 128 at d_ptr, processed as 16 iterations of 8 values (16 bytes).
;
; In:    arg(0) = s_ptr, arg(1) = d_ptr. Both are accessed with aligned
;        128-bit operations, so both must be 16-byte aligned.
; Out:   rax = sum, zero-extended from 32 bits.
; Clobb: rcx, xmm0-xmm3 only. xmm6 and above are callee-saved under the
;        Microsoft x64 ABI and nothing here saves XMM state, so they are
;        deliberately left untouched.
global sym(vp8_mbuverror_xmm_impl)
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;s_ptr
        mov         rdi,        arg(1) ;d_ptr

        pxor        xmm3,       xmm3            ; running sum, four dwords
        mov         rcx,        16              ; iterations remaining

.next_chunk:
        movdqa      xmm1,       [rsi]
        psubw       xmm1,       [rdi]

        ; pmaddwd x,x: each dword = e[2i]*e[2i] + e[2i+1]*e[2i+1]
        pmaddwd     xmm1,       xmm1
        paddd       xmm3,       xmm1

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         .next_chunk

        ; horizontal add of the four dword partial sums s0..s3
        pxor        xmm0,       xmm0
        movdqa      xmm1,       xmm3

        punpckldq   xmm1,       xmm0            ; [s0, 0, s1, 0]
        punpckhdq   xmm3,       xmm0            ; [s2, 0, s3, 0]
        paddd       xmm1,       xmm3            ; [s0+s2, 0, s1+s3, 0]

        movdqa      xmm2,       xmm1
        psrldq      xmm1,       8               ; [s1+s3, 0, 0, 0]
        paddd       xmm1,       xmm2            ; low dword = total

        movq        rax,        xmm1            ; dword 1 is zero, so rax = sum

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
385