;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;void vp9_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide difference statistics against a prediction that is averaged
; both horizontally and vertically. For every row i in [0, Height):
;     h(i)    = pavgb(ref[i][0..15], ref[i][1..16])    (rounding byte average)
;     pred(i) = pavgb(h(i), h(i+1))
;     diff    = pred(i) - src[i][0..15]
; *sum receives the sum of all diffs, *sumsquared the sum of diff*diff.
;
; Memory read: Height+1 rows of 17 bytes from ref_ptr, Height rows of 16
; bytes from src_ptr.
;
; The sum is accumulated in eight signed 16-bit lanes, two diffs per lane per
; row, each diff within [-255, 255]; paddw wraps, so the result is only
; exact while 510 * Height <= 32767, i.e. Height <= 64.
;
; Height == 0 stores 0 to both outputs without touching either pixel buffer.
;
; Register roles: rsi = ref row, rdi = src row, rax/rdx = ref/src stride,
; rcx = rows left, xmm0 = zero, xmm5 = h() of the previous ref row,
; xmm6 = sum accumulator, xmm7 = sse accumulator.
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
    ; Prologue/epilogue macros come from x86_abi_support.asm; they are kept
    ; exactly as the sibling functions in this file use them.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
        mov             rsi,            arg(0)              ;ref_ptr

        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        ; Nothing to do for zero rows. Without this check the sub/jnz loop
        ; below would wrap rcx and keep reading past both buffers.
        test            rcx,            rcx
        jz              .half_horiz_vert_variance16x_h_done

        movdqu          xmm5,           XMMWORD PTR [rsi]
        movdqu          xmm3,           XMMWORD PTR [rsi+1]
        pavgb           xmm5,           xmm3                ;  xmm5 = h(0): horizontal average of ref row 0

        lea             rsi,            [rsi + rax]         ;  rsi -> ref row 1

.half_horiz_vert_variance16x_h_1:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        pavgb           xmm1,           xmm2                ;  xmm1 = h(i+1): horizontal average of the next ref row

        pavgb           xmm5,           xmm1                ;  xmm5 = pred(i) = avg(h(i), h(i+1))

        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = pred pixels 0..7 as words
        punpckhbw       xmm4,           xmm0                ;  xmm4 = pred pixels 8..15 as words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src pixels 0..7
        punpcklbw       xmm3,           xmm0                ;  widen to words
        psubw           xmm5,           xmm3                ;  xmm5 = diff for pixels 0..7

        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = src pixels 8..15
        punpcklbw       xmm3,           xmm0
        psubw           xmm4,           xmm3                ;  xmm4 = diff for pixels 8..15

        paddw           xmm6,           xmm5                ;  sum += diffs (per-lane, 16-bit wrapping)
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> dwords
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ;  sse += squared diffs
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm1                ;  h(i+1) becomes h(i) for the next row

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_horiz_vert_variance16x_h_1

.half_horiz_vert_variance16x_h_done:
        ; Horizontal reduction. xmm0 is still zero here (never written in the loop).
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        punpcklwd   xmm0,           xmm6                    ;  sum words into the high half of each dword
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16                      ;  arithmetic shift = sign-extend word -> dword
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ;  four dword partial sums
        movdqa      xmm1,           xmm0

        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5                    ;  sse dwords 0,1 interleaved with zero
        punpckhdq   xmm7,           xmm5                    ;  sse dwords 2,3 interleaved with zero
        paddd       xmm6,           xmm7                    ;  dwords 0 and 2 hold the pairwise sse sums

        punpckldq   xmm0,           xmm5                    ;  same folding for the sum
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8                       ;  bring the upper qword down
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7                    ;  low dword of xmm6 = total sse
        paddd       xmm0,           xmm1                    ;  low dword of xmm0 = total sum

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide difference statistics against a vertically averaged
; prediction. For every row i in [0, Height):
;     pred(i) = pavgb(ref[i][0..15], ref[i+1][0..15])  (rounding byte average)
;     diff    = pred(i) - src[i][0..15]
; *sum receives the sum of all diffs, *sumsquared the sum of diff*diff.
;
; Memory read: Height+1 rows of 16 bytes from ref_ptr, Height rows of 16
; bytes from src_ptr.
;
; The sum is accumulated in eight signed 16-bit lanes, two diffs per lane per
; row, each diff within [-255, 255]; paddw wraps, so the result is only
; exact while 510 * Height <= 32767, i.e. Height <= 64.
;
; Height == 0 stores 0 to both outputs without touching either pixel buffer.
;
; Register roles: rsi = ref row, rdi = src row, rax/rdx = ref/src stride,
; rcx = rows left, xmm0 = zero, xmm5 = previous ref row,
; xmm6 = sum accumulator, xmm7 = sse accumulator.
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
    ; Prologue/epilogue macros come from x86_abi_support.asm; they are kept
    ; exactly as the sibling functions in this file use them.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
        mov             rsi,            arg(0)              ;ref_ptr

        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        ; Nothing to do for zero rows. Without this check the sub/jnz loop
        ; below would wrap rcx and keep reading past both buffers.
        test            rcx,            rcx
        jz              .half_vert_variance16x_h_done

        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  xmm5 = ref row 0
        lea             rsi,            [rsi + rax]         ;  rsi -> ref row 1

.half_vert_variance16x_h_1:
        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  xmm3 = ref row i+1

        pavgb           xmm5,           xmm3                ;  xmm5 = pred(i) = avg(row i, row i+1)
        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = pred pixels 0..7 as words
        punpckhbw       xmm4,           xmm0                ;  xmm4 = pred pixels 8..15 as words

        movq            xmm2,           QWORD PTR [rdi]     ;  src pixels 0..7
        punpcklbw       xmm2,           xmm0
        psubw           xmm5,           xmm2                ;  xmm5 = diff for pixels 0..7
        movq            xmm2,           QWORD PTR [rdi+8]   ;  src pixels 8..15
        punpcklbw       xmm2,           xmm0
        psubw           xmm4,           xmm2                ;  xmm4 = diff for pixels 8..15

        paddw           xmm6,           xmm5                ;  sum += diffs (per-lane, 16-bit wrapping)
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> dwords
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ;  sse += squared diffs
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm3                ;  row i+1 becomes row i for the next pass

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_vert_variance16x_h_1

.half_vert_variance16x_h_done:
        ; Horizontal reduction. xmm0 is still zero here (never written in the loop).
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        punpcklwd   xmm0,           xmm6                    ;  sum words into the high half of each dword
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16                      ;  arithmetic shift = sign-extend word -> dword
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ;  four dword partial sums
        movdqa      xmm1,           xmm0

        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5                    ;  sse dwords 0,1 interleaved with zero
        punpckhdq   xmm7,           xmm5                    ;  sse dwords 2,3 interleaved with zero
        paddd       xmm6,           xmm7                    ;  dwords 0 and 2 hold the pairwise sse sums

        punpckldq   xmm0,           xmm5                    ;  same folding for the sum
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8                       ;  bring the upper qword down
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7                    ;  low dword of xmm6 = total sse
        paddd       xmm0,           xmm1                    ;  low dword of xmm0 = total sum

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide difference statistics against a horizontally averaged
; prediction. For every row i in [0, Height):
;     pred(i) = pavgb(ref[i][0..15], ref[i][1..16])    (rounding byte average)
;     diff    = pred(i) - src[i][0..15]
; *sum receives the sum of all diffs, *sumsquared the sum of diff*diff.
;
; Memory read: Height rows of 17 bytes from ref_ptr, Height rows of 16
; bytes from src_ptr.
;
; The sum is accumulated in eight signed 16-bit lanes, two diffs per lane per
; row, each diff within [-255, 255]; paddw wraps, so the result is only
; exact while 510 * Height <= 32767, i.e. Height <= 64.
;
; Height == 0 stores 0 to both outputs without touching either pixel buffer.
;
; Register roles: rsi = ref row, rdi = src row, rax/rdx = ref/src stride,
; rcx = rows left, xmm0 = zero, xmm6 = sum accumulator,
; xmm7 = sse accumulator.
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
    ; Prologue/epilogue macros come from x86_abi_support.asm; they are kept
    ; exactly as the sibling functions in this file use them.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
        mov             rsi,            arg(0)              ;ref_ptr

        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        ; Nothing to do for zero rows. Without this check the sub/jnz loop
        ; below would wrap rcx and keep reading past both buffers.
        test            rcx,            rcx
        jz              .half_horiz_variance16x_h_done

.half_horiz_variance16x_h_1:
        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = ref pixels 0..15
        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = ref pixels 1..16

        pavgb           xmm5,           xmm3                ;  xmm5 = pred(i) = avg(pixel, right neighbour)
        movdqa          xmm1,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = pred pixels 0..7 as words
        punpckhbw       xmm1,           xmm0                ;  xmm1 = pred pixels 8..15 as words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src pixels 0..7
        punpcklbw       xmm3,           xmm0                ;  widen to words
        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = src pixels 8..15
        punpcklbw       xmm2,           xmm0

        psubw           xmm5,           xmm3                ;  xmm5 = diff for pixels 0..7
        psubw           xmm1,           xmm2                ;  xmm1 = diff for pixels 8..15
        paddw           xmm6,           xmm5                ;  sum += diffs (per-lane, 16-bit wrapping)
        paddw           xmm6,           xmm1
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs -> dwords
        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm5                ;  sse += squared diffs
        paddd           xmm7,           xmm1

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_horiz_variance16x_h_1

.half_horiz_variance16x_h_done:
        ; Horizontal reduction. xmm0 is still zero here (never written in the loop).
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        punpcklwd   xmm0,           xmm6                    ;  sum words into the high half of each dword
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16                      ;  arithmetic shift = sign-extend word -> dword
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ;  four dword partial sums
        movdqa      xmm1,           xmm0

        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5                    ;  sse dwords 0,1 interleaved with zero
        punpckhdq   xmm7,           xmm5                    ;  sse dwords 2,3 interleaved with zero
        paddd       xmm6,           xmm7                    ;  dwords 0 and 2 hold the pairwise sse sums

        punpckldq   xmm0,           xmm5                    ;  same folding for the sum
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8                       ;  bring the upper qword down
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7                    ;  low dword of xmm6 = total sse
        paddd       xmm0,           xmm1                    ;  low dword of xmm0 = total sum

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
