1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------
; int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences over 16 words (32 bytes):
;     for i in 0..15: sum += (coeff_ptr[i] - dcoef_ptr[i])^2
; The per-word subtraction is a 16-bit wrapping psubw; the squares are
; accumulated in 32-bit lanes.
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr
;        (both are accessed with aligned 128-bit loads/operands)
; Out:   rax = sum, zero-extended from 32 bits
; Uses:  rsi, rdi (saved/restored), xmm1, xmm3, xmm4
;-----------------------------------------------------------------------
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(1)          ; rdi = dcoef_ptr
        mov         rsi,        arg(0)          ; rsi = coeff_ptr
        pxor        xmm4,       xmm4            ; zero, used to widen dwords below

        ; words 0..7: difference, then pairwise sum of squares
        movdqa      xmm1,       [rsi]
        psubw       xmm1,       [rdi]
        pmaddwd     xmm1,       xmm1

        ; words 8..15
        movdqa      xmm3,       [rsi+16]
        psubw       xmm3,       [rdi+16]
        pmaddwd     xmm3,       xmm3

        paddd       xmm1,       xmm3            ; xmm1 = four dword partial sums

        ; horizontal add: interleave with zero so the low qword ends up
        ; holding the 32-bit total with a zero upper half
        movdqa      xmm3,       xmm1
        punpckhdq   xmm3,       xmm4            ; [p2, 0, p3, 0]
        punpckldq   xmm1,       xmm4            ; [p0, 0, p1, 0]
        paddd       xmm1,       xmm3            ; [p0+p2, 0, p1+p3, 0]

        movdqa      xmm3,       xmm1
        psrldq      xmm3,       8               ; [p1+p3, 0, 0, 0]
        paddd       xmm1,       xmm3            ; low dword = total

        movq        rax,        xmm1

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
61
;-----------------------------------------------------------------------
; int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences over 16 words (32 bytes):
;     for i in 0..15: sum += (coeff_ptr[i] - dcoef_ptr[i])^2
; The per-word subtraction is a 16-bit wrapping psubw; the squares are
; accumulated in 32-bit lanes.
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr
; Out:   low 32 bits of rax = sum (the int result). Bits above that
;        hold the upper-lane partial sum and are not meaningful.
; Uses:  rsi, rdi (saved/restored), mm0, mm1, mm3, mm5
; Note:  no emms is executed here, so MMX state is live on return.
;        NOTE(review): presumably callers clear it - confirm.
;
; All four words of the first quadword are always included: this
; routine has no dc argument, so no mask is built or applied.
;-----------------------------------------------------------------------
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0)          ; rsi = coeff_ptr
        mov         rdi,        arg(1)          ; rdi = dcoef_ptr

        ; words 0..7
        movq        mm1,        [rsi]
        movq        mm3,        [rsi+8]
        psubw       mm1,        [rdi]
        psubw       mm3,        [rdi+8]
        pmaddwd     mm1,        mm1
        pmaddwd     mm3,        mm3
        paddd       mm1,        mm3             ; mm1 = running [lo, hi] sums

        ; words 8..15
        movq        mm3,        [rsi+16]
        movq        mm5,        [rsi+24]
        psubw       mm3,        [rdi+16]
        psubw       mm5,        [rdi+24]
        pmaddwd     mm3,        mm3
        pmaddwd     mm5,        mm5
        paddd       mm3,        mm5
        paddd       mm1,        mm3

        ; fold the high dword lane into the low one
        movq        mm0,        mm1
        psrlq       mm1,        32
        paddd       mm0,        mm1             ; low dword = total

        movq        rax,        mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
126
127
;-----------------------------------------------------------------------
; int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared word differences over 16 consecutive groups of
; 16 words (32 bytes per group, 512 bytes per buffer in total).
;
; dc handling: dc is loaded into the low dword of an MMX register and
; each of its 16-bit words is compared against zero, giving a 4-word
; mask (0xFFFF where that word of dc is zero; the two upper words are
; always 0xFFFF). The mask is ANDed with the differences of the first
; four words of every group, so a dc whose low 16 bits are non-zero
; drops word 0 of each group from the sum.
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr, arg(2) = dc
; Out:   low 32 bits of rax = sum (the int result). Bits above that
;        hold the upper-lane partial sum and are not meaningful.
; Uses:  rsi, rdi (saved/restored), rcx, mm0, mm1, mm2, mm6, mm7
; Note:  no emms is executed here, so MMX state is live on return.
;        NOTE(review): presumably callers clear it - confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0)          ; rsi = coeff_ptr
        mov         rdi,        arg(1)          ; rdi = dcoef_ptr

        pxor        mm6,        mm6             ; zero reference for the compare
        movd        mm7,        dword ptr arg(2) ;dc
        pcmpeqw     mm7,        mm6             ; mm7 = mask for words 0..3

        pxor        mm0,        mm0             ; mm0 = [lo, hi] dword accumulator
        mov         rcx,        16              ; groups remaining

.mb_mmx_group:
        ; words 0..7 of this group (words 0..3 go through the dc mask)
        movq        mm1,        [rsi]
        movq        mm2,        [rsi+8]
        psubw       mm1,        [rdi]
        psubw       mm2,        [rdi+8]
        pand        mm1,        mm7
        pmaddwd     mm1,        mm1
        pmaddwd     mm2,        mm2
        paddd       mm0,        mm1
        paddd       mm0,        mm2

        ; words 8..15 of this group
        movq        mm1,        [rsi+16]
        movq        mm2,        [rsi+24]
        psubw       mm1,        [rdi+16]
        psubw       mm2,        [rdi+24]
        pmaddwd     mm1,        mm1
        pmaddwd     mm2,        mm2
        paddd       mm0,        mm1
        paddd       mm0,        mm2

        add         rsi,        32
        add         rdi,        32

        sub         rcx,        1
        jnz         .mb_mmx_group

        ; fold the high dword lane into the low one
        movq        mm1,        mm0
        psrlq       mm1,        32
        paddd       mm0,        mm1             ; low dword = total

        movq        rax,        mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
203
204
;-----------------------------------------------------------------------
; int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared word differences over 16 consecutive groups of
; 16 words (32 bytes per group, 512 bytes per buffer in total).
;
; dc handling: dc is loaded into the low dword of an XMM register and
; each of its 16-bit words is compared against zero, giving an 8-word
; mask (0xFFFF where that word of dc is zero; words 2..7 are always
; 0xFFFF). The mask is ANDed with the differences of the first eight
; words of every group, so a dc whose low 16 bits are non-zero drops
; word 0 of each group from the sum.
;
; In:    arg(0) = coeff_ptr, arg(1) = dcoef_ptr, arg(2) = dc
;        (both buffers are accessed with aligned 128-bit loads/operands)
; Out:   rax = sum, zero-extended from 32 bits
; Uses:  rsi, rdi (saved/restored), rcx, xmm0, xmm1, xmm2, xmm4, xmm5
;        Only xmm0-xmm5 are touched, so no XMM save/restore is needed.
;-----------------------------------------------------------------------
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0)          ; rsi = coeff_ptr
        mov         rdi,        arg(1)          ; rdi = dcoef_ptr

        pxor        xmm5,       xmm5            ; zero: compare reference and widening
        movd        xmm4,       dword ptr arg(2) ;dc
        pcmpeqw     xmm4,       xmm5            ; xmm4 = mask for words 0..7

        pxor        xmm0,       xmm0            ; xmm0 = four dword accumulators
        mov         rcx,        16              ; groups remaining

.mb_xmm_group:
        movdqa      xmm1,       [rsi]
        movdqa      xmm2,       [rsi+16]
        psubw       xmm1,       [rdi]
        psubw       xmm2,       [rdi+16]

        pand        xmm1,       xmm4            ; apply dc mask to words 0..7
        pmaddwd     xmm1,       xmm1
        pmaddwd     xmm2,       xmm2

        paddd       xmm0,       xmm1
        paddd       xmm0,       xmm2

        add         rsi,        32
        add         rdi,        32

        sub         rcx,        1
        jnz         .mb_xmm_group

        ; horizontal add: interleave with zero so the low qword ends up
        ; holding the 32-bit total with a zero upper half
        movdqa      xmm1,       xmm0
        punpckldq   xmm0,       xmm5            ; [s0, 0, s1, 0]
        punpckhdq   xmm1,       xmm5            ; [s2, 0, s3, 0]
        paddd       xmm0,       xmm1

        movdqa      xmm1,       xmm0
        psrldq      xmm1,       8
        paddd       xmm0,       xmm1            ; low dword = total

        movq        rax,        xmm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
273
274
;-----------------------------------------------------------------------
; int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared word differences over 128 words (256 bytes), handled
; as 16 iterations of 8 words each:
;     for i in 0..127: sum += (s_ptr[i] - d_ptr[i])^2
; The per-word subtraction is a 16-bit wrapping psubw; the squares are
; accumulated in 32-bit lanes.
;
; In:    arg(0) = s_ptr, arg(1) = d_ptr
; Out:   low 32 bits of rax = sum (the int result). Bits above that
;        hold the upper-lane partial sum and are not meaningful.
; Uses:  rsi, rdi (saved/restored), rcx, mm0, mm1, mm2
; Note:  no emms is executed here, so MMX state is live on return.
;        NOTE(review): presumably callers clear it - confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(1)          ; rdi = d_ptr
        mov         rsi,        arg(0)          ; rsi = s_ptr

        pxor        mm0,        mm0             ; mm0 = [lo, hi] dword accumulator
        mov         rcx,        16              ; iterations remaining

.uv_mmx_step:
        movq        mm1,        [rsi]
        movq        mm2,        [rsi+8]

        psubw       mm1,        [rdi]
        psubw       mm2,        [rdi+8]

        pmaddwd     mm1,        mm1
        pmaddwd     mm2,        mm2

        paddd       mm0,        mm1
        paddd       mm0,        mm2

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         .uv_mmx_step

        ; fold the high dword lane into the low one
        movq        mm1,        mm0
        psrlq       mm1,        32
        paddd       mm0,        mm1             ; low dword = total

        movq        rax,        mm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
330
331
;-----------------------------------------------------------------------
; int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared word differences over 128 words (256 bytes), handled
; as 16 iterations of 8 words each:
;     for i in 0..127: sum += (s_ptr[i] - d_ptr[i])^2
; The per-word subtraction is a 16-bit wrapping psubw; the squares are
; accumulated in 32-bit lanes.
;
; In:    arg(0) = s_ptr, arg(1) = d_ptr
;        (both are accessed with aligned 128-bit loads/operands)
; Out:   rax = sum, zero-extended from 32 bits
; Uses:  rsi, rdi (saved/restored), rcx, xmm0, xmm1, xmm2
;-----------------------------------------------------------------------
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(1)          ; rdi = d_ptr
        mov         rsi,        arg(0)          ; rsi = s_ptr

        pxor        xmm0,       xmm0            ; xmm0 = four dword accumulators
        mov         rcx,        16              ; iterations remaining

.uv_xmm_step:
        movdqa      xmm1,       [rsi]
        psubw       xmm1,       [rdi]
        pmaddwd     xmm1,       xmm1

        paddd       xmm0,       xmm1

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         .uv_xmm_step

        ; horizontal add: interleave with zero so the low qword ends up
        ; holding the 32-bit total with a zero upper half
        pxor        xmm2,       xmm2
        movdqa      xmm1,       xmm0

        punpckhdq   xmm1,       xmm2            ; [s2, 0, s3, 0]
        punpckldq   xmm0,       xmm2            ; [s0, 0, s1, 0]
        paddd       xmm0,       xmm1

        movdqa      xmm1,       xmm0
        psrldq      xmm1,       8
        paddd       xmm0,       xmm1            ; low dword = total

        movq        rax,        xmm0

    pop         rdi
    pop         rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
387