;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

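; PROCESS_16X2X3: accumulate SADs for two rows of a 16-wide source block
; against the reference block at three horizontal positions (ref, ref+1,
; ref+2).  Unaligned loads (lddqu) fetch the reference rows; psadbw forms
; the per-row sums, which are kept in xmm5/xmm6/xmm7.  %1 != 0 marks the
; first invocation, which initialises the accumulators instead of adding
; to them.  rsi/rdi advance by two strides per invocation.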
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

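; PROCESS_16X2X3_OFFSET: same two-row, three-candidate SAD step, but for
; the case where the reference pointer has been rounded down to a 16-byte
; boundary.  Two aligned loads ([rdi] and [rdi+16]) are combined with
; palignr using the constant byte offset %2 to reconstruct the three
; candidate rows, avoiding unaligned loads.  %1 selects initialise vs.
; accumulate, as in PROCESS_16X2X3.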
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

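; PROCESS_16X16X3_OFFSET: emit the alignment-specific entry point
; %2_aligned_by_%1.  rdi is backed up by %1 bytes so that it is 16-byte
; aligned, then eight PROCESS_16X2X3_OFFSET steps cover all 16 rows before
; jumping to the shared store code at %2_store_off.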
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

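; PROCESS_16X8X3_OFFSET: the 16x8 counterpart of PROCESS_16X16X3_OFFSET;
; four two-row steps cover the 8 rows.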
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
;  Computes the 16x16 SAD of src_ptr against ref_ptr, ref_ptr+1 and
;  ref_ptr+2 and stores the three sums in results[0..2].
global sym(vp8_sad16x16x3_ssse3)
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

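        ; rdx now holds ref_ptr & 15.  The table below stores, for each of
        ; the 16 possible alignments, the 32-bit offset of the matching
        ; aligned_by_N code path relative to the do_jump label.  The
        ; call/pop pair fetches the address of do_jump at run time, so the
        ; dispatch is position independent.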
        jmp vp8_sad16x16x3_ssse3_skiptable
vp8_sad16x16x3_ssse3_jumptable:
        dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
vp8_sad16x16x3_ssse3_skiptable:

        call vp8_sad16x16x3_ssse3_do_jump
vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3

vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

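; psadbw leaves two 64-bit partial sums in each accumulator; the store code
; below adds the upper quadword onto the lower one and writes the low
; 32 bits of each total to results[0..2].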
vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
;  16x8 variant: SAD of src_ptr against ref_ptr, ref_ptr+1 and ref_ptr+2,
;  with the three sums stored in results[0..2].
global sym(vp8_sad16x8x3_ssse3)
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

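        ; Same position-independent jump-table dispatch on ref_ptr & 15 as
        ; in vp8_sad16x16x3_ssse3 above.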
        jmp vp8_sad16x8x3_ssse3_skiptable
vp8_sad16x8x3_ssse3_jumptable:
        dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
vp8_sad16x8x3_ssse3_skiptable:

        call vp8_sad16x8x3_ssse3_do_jump
vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3

vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret