1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; TABULATE_SSIM - fold one batch of pixel pairs into the five running totals.
;   In:       xmm3 = source pixels, xmm4 = reference pixels, one pixel per
;             16-bit word (the callers in this file widen bytes with
;             punpck{l,h}bw against a zero register before invoking this).
;   Updates:  xmm15 += s        (saturating unsigned word add)  ; sum_s
;             xmm14 += r        (saturating unsigned word add)  ; sum_r
;             xmm13 += s*s      (dword add of pmaddwd pairs)    ; sum_sq_s
;             xmm12 += r*r      (dword add of pmaddwd pairs)    ; sum_sq_r
;             xmm11 += s*r      (dword add of pmaddwd pairs)    ; sum_sxr
;   Clobbers: xmm1, xmm2, xmm3 (xmm4 is left untouched)
%macro TABULATE_SSIM 0
        ; squares are computed on copies so xmm3/xmm4 stay usable below
        movdqa          xmm1, xmm3
        movdqa          xmm2, xmm4
        pmaddwd         xmm1, xmm1      ; s*s, adjacent products pair-summed
        pmaddwd         xmm2, xmm2      ; r*r, adjacent products pair-summed
        paddd           xmm13, xmm1     ; sum_sq_s
        paddd           xmm12, xmm2     ; sum_sq_r

        ; plain sums must be taken before xmm3 is overwritten by the product
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r

        pmaddwd         xmm3, xmm4      ; s*r, adjacent products pair-summed
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro
26
; SUM_ACROSS_Q - horizontal add of the four dword lanes of register %1.
; Each dword is first zero-extended to a qword, so the total is formed with
; 64-bit adds and ends up in the low qword of %1 (the high qword becomes 0).
;   Requires: xmm0 == 0
;   Clobbers: xmm2
;   %1 must be neither xmm0 nor xmm2.
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; xmm2 = { d2, d3 } widened to qwords
        punpckldq       %1, xmm0        ; %1   = { d0, d1 } widened to qwords
        paddq           %1, xmm2        ; %1   = { d0+d2, d1+d3 }

        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; xmm2 = { high qword, 0 }
        punpcklqdq      %1, xmm0        ; %1   = { low qword,  0 }
        paddq           %1, xmm2        ; %1   = { total, 0 }
%endmacro
38
; SUM_ACROSS_W - horizontal add of the eight 16-bit word lanes of register %1.
; The words are zero-extended to dwords and added pairwise, then the four
; resulting dwords are reduced by SUM_ACROSS_Q; the total is left in the low
; qword of %1.
;   Requires: xmm0 == 0
;   Clobbers: xmm1, plus xmm2 via SUM_ACROSS_Q
;   %1 must not be xmm0, xmm1 or xmm2.
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; xmm1 = { w4..w7 } widened to dwords
        punpcklwd       %1, xmm0        ; %1   = { w0..w3 } widened to dwords
        paddd           %1, xmm1        ; four dword partial sums
        SUM_ACROSS_Q    %1
%endmacro
;void vp8_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 16 rows of 16 bytes from s (row stride sp) and r (row stride rp) and
; writes five totals through the output pointers:
;   *sum_s    = sum of s pixels        *sum_r    = sum of r pixels
;   *sum_sq_s = sum of s*s             *sum_sq_r = sum of r*r
;   *sum_sxr  = sum of s*r
;
; The rows are read with unaligned loads, so s and r need no alignment.
; Each result is stored with movd, i.e. exactly 32 bits are written per
; output. The largest possible total here is 256 * 255 * 255, which fits.
; NOTE(review): the prototype above says unsigned long; where that type is
; 64 bits wide the upper half of each output is NOT written by this routine,
; so the caller has to zero-initialize it - confirm against the callers.
;
; Register use: xmm0 = zero, xmm11..xmm15 = accumulators, xmm1..xmm6 scratch.
; This is 64-bit-only code (it uses xmm11..xmm15).
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be a first pass for
; calculating all the parms needed for 16x16 ssim so we can play with dssim
; as distortion in mode selection code.
global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
sym(vp8_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined by the calling convention, so sign-extend them instead of
    ; loading the full 64-bit slot (which could carry garbage, or turn a
    ; negative stride into a huge positive one).
    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0   ;zero, used for widening and reductions
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ; upper eight pixels of the row, widened from bytes to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ; lower eight pixels of the row (TABULATE_SSIM destroyed xmm3, so reload
    ; both operands from the saved copies in xmm5/xmm6)
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; collapse the per-lane partial sums: word lanes for the plain sums,
    ; dword lanes for the product sums
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
136
;void vp8_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 8 rows of 8 bytes from s (row stride sp) and r (row stride rp) and
; writes five totals through the output pointers:
;   *sum_s    = sum of s pixels        *sum_r    = sum of r pixels
;   *sum_sq_s = sum of s*s             *sum_sq_r = sum of r*r
;   *sum_sxr  = sum of s*r
;
; Each row is read with an 8-byte movq load, so s and r need no alignment.
; Each result is stored with movd, i.e. exactly 32 bits are written per
; output. The largest possible total here is 64 * 255 * 255, which fits.
; NOTE(review): the prototype above says unsigned long; where that type is
; 64 bits wide the upper half of each output is NOT written by this routine,
; so the caller has to zero-initialize it - confirm against the callers.
;
; Register use: xmm0 = zero, xmm11..xmm15 = accumulators, xmm1..xmm4 scratch.
; This is 64-bit-only code (it uses xmm11..xmm15).
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be a first pass for
; calculating all the parms needed for 8x8 ssim so we can play with dssim
; as distortion in mode selection code.
global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
sym(vp8_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined by the calling convention, so sign-extend them instead of
    ; loading the full 64-bit slot (which could carry garbage, or turn a
    ; negative stride into a huge positive one).
    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0   ;zero, used for widening and reductions
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8      ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each; movq clears the upper
    ;half of the register) and widen them from bytes to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; collapse the per-lane partial sums: word lanes for the plain sums,
    ; dword lanes for the product sums
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
217