;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; PROCESS_16X2X3 %1
;
; Accumulates SADs for two 16-pixel rows against three reference
; candidates (ref, ref+1, ref+2) using unaligned lddqu loads.
;   %1 = 1 : first pair of rows; initialize accumulators xmm5/xmm6/xmm7
;   %1 = 0 : accumulate into xmm5/xmm6/xmm7
; In:   rsi = src_ptr, rax = src_stride, rdi = ref_ptr, rdx = ref_stride
; Out:  rsi/rdi advanced by two rows; xmm5/xmm6/xmm7 hold per-lane
;       partial SADs (psadbw leaves one 16-bit sum in each 64-bit lane)
; Clobbers: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; src row 0
        lddqu           xmm5,       XMMWORD PTR [rdi]       ; ref row 0, offset 0
        lddqu           xmm6,       XMMWORD PTR [rdi+1]     ; ref row 0, offset 1
        lddqu           xmm7,       XMMWORD PTR [rdi+2]     ; ref row 0, offset 2

        psadbw          xmm5,       xmm0                    ; initialize accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; accumulate
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]             ; advance two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
54
;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET %1, %2
;
; Same as PROCESS_16X2X3, but for a reference pointer whose misalignment
; is known to be %2: the caller has pre-decremented rdi by %2 so that
; all loads here are 16-byte aligned, and palignr re-synthesizes the
; three candidate refs at byte offsets %2, %2+1 and %2+2.
;   %1 = 1 : initialize accumulators xmm5/xmm6/xmm7
;   %1 = 0 : accumulate into xmm5/xmm6/xmm7
;   %2     : the alignment offset (1..14)
; In:   rsi = src_ptr, rax = src_stride, rdi = ref_ptr (aligned),
;       rdx = ref_stride
; Out:  rsi/rdi advanced by two rows; xmm5/xmm6/xmm7 updated
; Clobbers: xmm0-xmm4 (and xmm7 as scratch in the init case)
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; src row 0
        movdqa          xmm4,       XMMWORD PTR [rdi]       ; aligned ref, low 16
        movdqa          xmm7,       XMMWORD PTR [rdi+16]    ; aligned ref, high 16

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2          ; ref + %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)      ; ref + %2 + 1

        palignr         xmm7,       xmm4,       (%2+2)      ; ref + %2 + 2

        psadbw          xmm5,       xmm0                    ; initialize accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; accumulate
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]             ; advance two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
116
;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET %1, %2
;
; Emits the %2_aligned_by_%1 jump-table target: aligns rdi down by %1,
; then runs eight PROCESS_16X2X3_OFFSET iterations (16 rows total) and
; jumps to the shared %2_store_off epilogue.
;   %1 : alignment offset (0..14)
;   %2 : function label prefix (e.g. .vp8_sad16x16x3_ssse3)
;-----------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                      ; align ref_ptr down

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro
134
;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET %1, %2
;
; Emits the %2_aligned_by_%1 jump-table target: aligns rdi down by %1,
; then runs four PROCESS_16X2X3_OFFSET iterations (8 rows total) and
; jumps to the shared %2_store_off epilogue.
;   %1 : alignment offset (0..14)
;   %2 : function label prefix (e.g. .vp8_sad16x8x3_ssse3)
;-----------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                      ; align ref_ptr down

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro
148
;-----------------------------------------------------------------------
;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs, against ref_ptr, ref_ptr+1 and ref_ptr+2,
; and stores them to results[0..2].  The low 4 bits of ref_ptr select,
; through a position-independent jump table of 32-bit offsets, a
; palignr-based path that only performs aligned loads; misalignment 15
; falls back to the generic lddqu path.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x3_ssse3) PRIVATE
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi                 ; rdx = ref_ptr & 15

        jmp .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:

        ; call/pop to discover our own address (position-independent)
        call .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute target address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; one 16-row body per possible alignment (0..14), aligned loads
        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3

.vp8_sad16x16x3_ssse3_aligned_by_15:
        ; palignr needs an immediate <= 15, so offset 15 (where ref+2
        ; crosses into a third 16-byte chunk) uses the unaligned path
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial sum per 64-bit lane;
        ; fold the high lane into the low and store the 32-bit total
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]: SAD at ref
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]: SAD at ref+1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]: SAD at ref+2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
261
;-----------------------------------------------------------------------
;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 variant of vp8_sad16x16x3_ssse3: computes three 16x8 SADs,
; against ref_ptr, ref_ptr+1 and ref_ptr+2, and stores them to
; results[0..2].  Dispatch on (ref_ptr & 15) through a PIC jump table,
; exactly as in the 16x16 version, but with half as many row pairs.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x3_ssse3) PRIVATE
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi                 ; rdx = ref_ptr & 15

        jmp .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:

        ; call/pop to discover our own address (position-independent)
        call .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute target address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; one 8-row body per possible alignment (0..14), aligned loads
        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3

.vp8_sad16x8x3_ssse3_aligned_by_15:
        ; palignr needs an immediate <= 15, so offset 15 uses the
        ; unaligned lddqu path
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; fold each accumulator's high 64-bit lane into the low lane
        ; and store the 32-bit total
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]: SAD at ref
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]: SAD at ref+1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]: SAD at ref+2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
371