1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; TABULATE_SSIM - fold one batch of eight pixel pairs into the running totals.
;
; In:       xmm3 = eight source (s) samples as 16-bit words
;           xmm4 = eight reference (r) samples as 16-bit words
; Updates:  xmm15 += s        (sum_s,    eight word lanes, saturating add)
;           xmm14 += r        (sum_r,    eight word lanes, saturating add)
;           xmm13 += s*s      (sum_sq_s, pmaddwd pair sums)
;           xmm12 += r*r      (sum_sq_r, pmaddwd pair sums)
;           xmm11 += s*r      (sum_sxr,  pmaddwd pair sums)
; Clobbers: xmm1, xmm2, xmm3 (xmm4 is left untouched)
;
; pmaddwd yields four dword pair-sums; they are accumulated with paddq and
; later split back into dwords by SUM_ACROSS_Q, so the result is only right
; while no dword lane carries into its neighbour.
%macro TABULATE_SSIM 0
        movdqa          xmm1, xmm3      ; private copies for the squares
        movdqa          xmm2, xmm4
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r
        pmaddwd         xmm3, xmm4      ; s*r pair sums (consumes xmm3)
        pmaddwd         xmm1, xmm1      ; s*s pair sums
        pmaddwd         xmm2, xmm2      ; r*r pair sums
        paddq           xmm11, xmm3     ; sum_sxr
        paddq           xmm13, xmm1     ; sum_sq_s
        paddq           xmm12, xmm2     ; sum_sq_r
%endmacro
26
; SUM_ACROSS_Q reg - horizontally add the four dwords held in %1.
;
; The dwords are zero-extended to qwords before being added, so the total is
; a 64-bit value.  On exit the low qword of %1 holds d0+d1+d2+d3 and the high
; qword of %1 is zero.
;
; Requires: xmm0 == 0
; Clobbers: xmm2 (so %1 must be neither xmm0 nor xmm2)
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; xmm2 = { d2, d3 } widened to qwords
        punpckldq       %1, xmm0        ; %1   = { d0, d1 } widened to qwords
        paddq           %1, xmm2        ; %1   = { d0+d2, d1+d3 }
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; xmm2 = { high qword, 0 }
        punpcklqdq      %1, xmm0        ; %1   = { low qword,  0 }
        paddq           %1, xmm2        ; low qword of %1 = grand total
%endmacro
38
; SUM_ACROSS_W reg - horizontally add the eight words held in %1.
;
; The words are zero-extended to dwords and added pairwise, then the four
; dword partial sums are reduced by SUM_ACROSS_Q.  On exit the low qword of
; %1 holds the total of all eight words.
;
; Requires: xmm0 == 0
; Clobbers: xmm1, xmm2 (so %1 must be none of xmm0, xmm1, xmm2)
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; xmm1 = words 4..7 widened to dwords
        punpcklwd       %1, xmm0        ; %1   = words 0..3 widened to dwords
        paddd           %1, xmm1        ; four dword partial sums
        SUM_ACROSS_Q    %1
%endmacro
;------------------------------------------------------------------------------
; void vp8_ssim_parms_16x16_sse3(
;     unsigned char *s,          arg(0)  first row of the 16x16 source block
;     int            sp,         arg(1)  byte offset between source rows
;     unsigned char *r,          arg(2)  first row of the 16x16 reference block
;     int            rp,         arg(3)  byte offset between reference rows
;     unsigned long *sum_s,      arg(4)  out: sum of s
;     unsigned long *sum_r,      arg(5)  out: sum of r
;     unsigned long *sum_sq_s,   arg(6)  out: sum of s*s
;     unsigned long *sum_sq_r,   arg(7)  out: sum of r*r
;     unsigned long *sum_sxr);   arg(8)  out: sum of s*r
;
; Accumulates the five SSIM sums over all 256 pixel pairs of a 16x16 block
; and overwrites each output location with its total.
;
; Target:   x86-64 only (uses xmm11-xmm15).  Despite the _sse3 suffix, every
;           instruction used here is SSE2.
; Clobbers: rax, rcx, rdx, xmm0-xmm5, flags.
; Saved:    rsi, rdi, xmm6 and xmm11-xmm15 are restored before returning;
;           they are callee-saved under the Microsoft x64 ABI.
;
; NOTE(review): each result is written with a 64-bit movq, so every output
; pointer must address at least 8 writable bytes.  `unsigned long` is only
; 4 bytes on Win64 - confirm the callers' storage.
;
; TODO: Pass the parameters through a structure.  The pxors are probably not
; needed (the calling app will initialize to 0).  Everything could easily fit
; in SSE2 without too much hassle, and better estimates can probably be had
; with psadbw or pavgb.  At this point this is just meant to be a first pass
; at calculating all the parms needed for 16x16 ssim so we can play with
; dssim as distortion in mode selection code.
;------------------------------------------------------------------------------
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; Preserve the callee-saved xmm registers this routine overwrites.
    ; movdqu is used because rsp is not known to be 16-byte aligned here.
    sub         rsp, 16 * 6
    movdqu      [rsp + 16 * 0], xmm6
    movdqu      [rsp + 16 * 1], xmm11
    movdqu      [rsp + 16 * 2], xmm12
    movdqu      [rsp + 16 * 3], xmm13
    movdqu      [rsp + 16 * 4], xmm14
    movdqu      [rsp + 16 * 5], xmm15
    ; end prolog

    mov             rsi,        arg(0)        ; s
    movsxd          rcx,        dword arg(1)  ; sp: int, sign-extend to 64 bits
    mov             rdi,        arg(2)        ; r
    movsxd          rax,        dword arg(3)  ; rp: int, sign-extend to 64 bits

    pxor            xmm0,  xmm0     ; constant zero used by every unpack
    pxor            xmm15, xmm15    ; sum_s
    pxor            xmm14, xmm14    ; sum_r
    pxor            xmm13, xmm13    ; sum_sq_s
    pxor            xmm12, xmm12    ; sum_sq_r
    pxor            xmm11, xmm11    ; sum_sxr

    mov             rdx, 16         ; rows left to process
.next_row:

    ; grab 16 source and 16 reference pixels (no alignment assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ; pixels 8..15, widened from bytes to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0      ; high_s
    punpckhbw       xmm4, xmm0      ; high_r

    TABULATE_SSIM

    ; pixels 0..7, widened from bytes to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0      ; low_s
    punpcklbw       xmm4, xmm0      ; low_r

    TABULATE_SSIM

    add             rsi, rcx        ; next s row
    add             rdi, rax        ; next r row

    dec             rdx
    jnz             .next_row

    ; collapse each vector accumulator to a single total in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov             rdi, arg(4)
    movq            [rdi], xmm15    ; *sum_s
    mov             rdi, arg(5)
    movq            [rdi], xmm14    ; *sum_r
    mov             rdi, arg(6)
    movq            [rdi], xmm13    ; *sum_sq_s
    mov             rdi, arg(7)
    movq            [rdi], xmm12    ; *sum_sq_r
    mov             rdi, arg(8)
    movq            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    movdqu      xmm6,  [rsp + 16 * 0]
    movdqu      xmm11, [rsp + 16 * 1]
    movdqu      xmm12, [rsp + 16 * 2]
    movdqu      xmm13, [rsp + 16 * 3]
    movdqu      xmm14, [rsp + 16 * 4]
    movdqu      xmm15, [rsp + 16 * 5]
    add         rsp, 16 * 6
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
134
;------------------------------------------------------------------------------
; void vp8_ssim_parms_8x8_sse3(
;     unsigned char *s,          arg(0)  first row of the 8x8 source block
;     int            sp,         arg(1)  byte offset between source rows
;     unsigned char *r,          arg(2)  first row of the 8x8 reference block
;     int            rp,         arg(3)  byte offset between reference rows
;     unsigned long *sum_s,      arg(4)  out: sum of s
;     unsigned long *sum_r,      arg(5)  out: sum of r
;     unsigned long *sum_sq_s,   arg(6)  out: sum of s*s
;     unsigned long *sum_sq_r,   arg(7)  out: sum of r*r
;     unsigned long *sum_sxr);   arg(8)  out: sum of s*r
;
; Accumulates the five SSIM sums over all 64 pixel pairs of an 8x8 block and
; overwrites each output location with its total.
;
; Target:   x86-64 only (uses xmm11-xmm15).  Despite the _sse3 suffix, every
;           instruction used here is SSE2.
; Clobbers: rax, rcx, rdx, xmm0-xmm5, flags.
; Saved:    rsi, rdi, xmm6 and xmm11-xmm15 are restored before returning;
;           they are callee-saved under the Microsoft x64 ABI.
;
; NOTE(review): each result is written with a 64-bit movq, so every output
; pointer must address at least 8 writable bytes.  `unsigned long` is only
; 4 bytes on Win64 - confirm the callers' storage.
;
; TODO: Pass the parameters through a structure.  The pxors are probably not
; needed (the calling app will initialize to 0).  Everything could easily fit
; in SSE2 without too much hassle, and better estimates can probably be had
; with psadbw or pavgb.  At this point this is just meant to be a first pass
; at calculating all the parms needed for 8x8 ssim so we can play with dssim
; as distortion in mode selection code.
;------------------------------------------------------------------------------
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; Preserve the callee-saved xmm registers this routine overwrites.
    ; movdqu is used because rsp is not known to be 16-byte aligned here.
    sub         rsp, 16 * 6
    movdqu      [rsp + 16 * 0], xmm6
    movdqu      [rsp + 16 * 1], xmm11
    movdqu      [rsp + 16 * 2], xmm12
    movdqu      [rsp + 16 * 3], xmm13
    movdqu      [rsp + 16 * 4], xmm14
    movdqu      [rsp + 16 * 5], xmm15
    ; end prolog

    mov             rsi,        arg(0)        ; s
    movsxd          rcx,        dword arg(1)  ; sp: int, sign-extend to 64 bits
    mov             rdi,        arg(2)        ; r
    movsxd          rax,        dword arg(3)  ; rp: int, sign-extend to 64 bits

    pxor            xmm0,  xmm0     ; constant zero used by every unpack
    pxor            xmm15, xmm15    ; sum_s
    pxor            xmm14, xmm14    ; sum_r
    pxor            xmm13, xmm13    ; sum_sq_s
    pxor            xmm12, xmm12    ; sum_sq_r
    pxor            xmm11, xmm11    ; sum_sxr

    mov             rdx, 8          ; rows left to process
.next_row:

    ; grab 8 source and 8 reference pixels; movq zeroes the upper half
    movq            xmm5, [rsi]
    movq            xmm6, [rdi]

    ; widen the 8 pixels from bytes to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0      ; low_s
    punpcklbw       xmm4, xmm0      ; low_r

    TABULATE_SSIM

    add             rsi, rcx        ; next s row
    add             rdi, rax        ; next r row

    dec             rdx
    jnz             .next_row

    ; collapse each vector accumulator to a single total in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov             rdi, arg(4)
    movq            [rdi], xmm15    ; *sum_s
    mov             rdi, arg(5)
    movq            [rdi], xmm14    ; *sum_r
    mov             rdi, arg(6)
    movq            [rdi], xmm13    ; *sum_sq_s
    mov             rdi, arg(7)
    movq            [rdi], xmm12    ; *sum_sq_r
    mov             rdi, arg(8)
    movq            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    movdqu      xmm6,  [rsp + 16 * 0]
    movdqu      xmm11, [rsp + 16 * 1]
    movdqu      xmm12, [rsp + 16 * 2]
    movdqu      xmm13, [rsp + 16 * 3]
    movdqu      xmm14, [rsp + 16 * 4]
    movdqu      xmm15, [rsp + 16 * 5]
    add         rsp, 16 * 6
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
216