;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of the 256 signed 16-bit values stored
; contiguously at src_ptr (16 loop passes x 32 bytes per pass).
;
;   In:   arg(0) = src_ptr
;   Out:  rax    = sum[0] + sum[1]; the low 32 bits are the result
;   Uses: rcx, mm0-mm4, flags, 8 bytes of stack scratch
;   Note: no emms is executed in this routine.
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1              ; one argument, so one shadow slot (the
                                        ; other routines in this file likewise
                                        ; pass their own argument count)
    GET_GOT     rbx                     ; NOTE(review): no global data is referenced
                                        ; below; kept paired with RESTORE_GOT
    push rsi
    push rdi
    sub         rsp, 8                  ; scratch qword used to spill mm4
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16             ; passes remaining, 16 words per pass
        pxor        mm4, mm4            ; mm4 = two 32-bit running partial sums

.NEXTROW:
        movq        mm0, [rax]          ; 4 words each
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0            ; w0*w0 + w1*w1 | w2*w2 + w3*w3
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32             ; next 16 words
        dec         rcx
        jnz         .NEXTROW            ; dec does not write CF, so branch on ZF
                                        ; alone rather than on a stale carry
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
64
65
;unsigned int vp9_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Compares the 8x8 block of bytes at src_ptr with the 8x8 block at ref_ptr.
;   *Sum = sum over the 64 positions of (src - ref)
;   *SSE = sum over the 64 positions of (src - ref)^2
; Always returns 0 in rax.
;
; Register roles inside the body:
;   rax / rcx = current src row pointer / source_stride
;   rbx / rdx = current ref row pointer / recon_stride
;   mm5 = four 16-bit running sums of differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit running sums of squared differences
; No emms is executed in this routine.

; Accumulate one 8-pixel row from [rax] (src) and [rbx] (ref) into mm5/mm7.
; Expects mm6 == 0.  Clobbers mm0-mm3.  Does not advance the pointers.
%macro VP9_GET8X8VAR_ROW_MMX 0
        movq        mm0, [rax]                  ; 8 src bytes
        movq        mm1, [rbx]                  ; 8 ref bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low 4 bytes  -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high 4 bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low half)
        psubsw      mm2, mm3                    ; src - ref (high half)

        paddw       mm5, mm0                    ; running sum of differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        paddd       mm7, mm0                    ; running sum of squares
        paddd       mm7, mm2
%endmacro

global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; [rsp] <- mm7, [rsp+8] <- mm5
    ; end prolog


        pxor        mm5, mm5                    ; difference accumulator
        pxor        mm6, mm6                    ; constant zero for unpacking
        pxor        mm7, mm7                    ; squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-7: accumulate, then step both pointers to the next row.
%rep 7
        VP9_GET8X8VAR_ROW_MMX
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
%endrep

        ; Row 8: last row, so the pointers are not advanced afterwards.
        VP9_GET8X8VAR_ROW_MMX

        ; Now accumulate the final results.
        ; Each 16-bit lane of mm5 holds the sum of 16 differences, each in
        ; [-255, 255], so the lanes cannot have overflowed.
        movq        QWORD PTR [rsp+8], mm5      ; spill the four word sums
        movq        QWORD PTR [rsp], mm7        ; spill the two dword sums
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
310
311
312
;unsigned int
;vp9_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; 4x4 counterpart of the block-difference statistics:
;   *Sum = sum over the 16 positions of (src - ref)
;   *SSE = sum over the 16 positions of (src - ref)^2
; rax is 0 on return.  No emms is executed in this routine.
;
; rax/rbx walk the src/ref rows, rcx/rdx hold the two strides,
; mm5 collects word differences, mm7 collects dword squares, mm6 stays zero.

; Fold one 4-pixel row at [rax] (src) / [rbx] (ref) into mm5 and mm7.
; Needs mm6 == 0.  Clobbers mm0 and mm1.  Pointers are left untouched.
%macro VP9_GET4X4VAR_ROW_MMX 0
        movd        mm0, [rax]                  ; 4 src bytes
        movd        mm1, [rbx]                  ; 4 ref bytes
        punpcklbw   mm0, mm6                    ; widen to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; add to difference totals
        pmaddwd     mm0, mm0                    ; square, pairwise add
        paddd       mm7, mm0                    ; add to square totals
%endmacro

global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; two qwords of spill space
    ; end prolog


        pxor        mm5, mm5                    ; difference totals = 0
        pxor        mm6, mm6                    ; zero for byte->word unpack
        pxor        mm7, mm7                    ; square totals = 0

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-3, each followed by a step to the next row.
%rep 3
        VP9_GET4X4VAR_ROW_MMX
        add         rbx, rdx                    ; ref += recon_stride
        add         rax, rcx                    ; src += source_stride
%endrep

        ; Row 4 (no pointer step needed after the last row).
        VP9_GET4X4VAR_ROW_MMX

        ; Horizontal reduction through the stack scratch area.
        movq        QWORD PTR [rsp], mm7        ; two dword square totals
        movq        QWORD PTR [rsp+8], mm5      ; four word difference totals
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax        ; *SSE is written first,
        mov         dword ptr [rdi], edx        ; then *Sum
        xor         rax, rax    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
425
426
427
;unsigned int
;vp9_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns (in the low 32 bits of rax) the sum of squared byte differences
; between the 4x4 block at src_ptr and the 4x4 block at ref_ptr.
; No emms is executed in this routine.
;
; rax/rbx walk the src/ref rows, rcx/rdx hold the two strides,
; mm6 stays zero, mm7 holds two 32-bit partial sums.

; Add the squared differences of one 4-pixel row at [rax] / [rbx] to mm7.
; Needs mm6 == 0.  Clobbers mm0 and mm1.  Pointers are left untouched.
%macro VP9_GET4X4SSE_ROW_MMX 0
        movd        mm0, [rax]                  ; 4 src bytes
        movd        mm1, [rbx]                  ; 4 ref bytes
        punpcklbw   mm0, mm6                    ; widen to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, pairwise add
        paddd       mm7, mm0                    ; add to the partial sums
%endmacro

global sym(vp9_get4x4sse_cs_mmx) PRIVATE
sym(vp9_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi
    push rdi
    push rbx
    ; end prolog


        pxor        mm6, mm6                    ; zero for byte->word unpack
        pxor        mm7, mm7                    ; partial sums = 0

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-3, each followed by a step to the next row.
%rep 3
        VP9_GET4X4SSE_ROW_MMX
        add         rbx, rdx                    ; ref += recon_stride
        add         rax, rcx                    ; src += source_stride
%endrep

        ; Row 4 (no pointer step needed after the last row).
        VP9_GET4X4SSE_ROW_MMX

        ; Fold the high dword of mm7 onto the low dword and return it.
        movq        mm0,    mm7
        psrlq       mm7,    32                  ; mm7 = high partial sum

        paddd       mm0,    mm7                 ; low dword = total
        movq        rax,    mm0


    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
511