1ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;
2ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;
4ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;  Use of this source code is governed by a BSD-style license
5ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;  that can be found in the LICENSE file in the root of the source
6ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;  tree. An additional intellectual property rights grant can be found
7ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;  in the file PATENTS.  All contributing project authors may
8ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;  be found in the AUTHORS file in the root of the source tree.
9ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%include "vpx_ports/x86_abi_support.asm"
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;-----------------------------------------------------------------------
; TABULATE_SSIM - fold one batch of eight source/reference pixels into
; the five running SSIM accumulators.
;
; In:    xmm3 = eight source    pixels, one per 16-bit word lane
;        xmm4 = eight reference pixels, one per 16-bit word lane
; Acc:   xmm15 += s      (saturating word adds)      ; sum_s
;        xmm14 += r      (saturating word adds)      ; sum_r
;        xmm13 += s*s    (dword lanes, via pmaddwd)  ; sum_sq_s
;        xmm12 += r*r    (dword lanes, via pmaddwd)  ; sum_sq_r
;        xmm11 += s*r    (dword lanes, via pmaddwd)  ; sum_sxr
; Clobb: xmm1, xmm2, xmm3 (left holding the s*s, r*r and s*r products)
;-----------------------------------------------------------------------
%macro TABULATE_SSIM 0
        ; keep copies of both inputs for the squares; xmm3 itself is
        ; consumed by the cross product below
        movdqa          xmm1,  xmm3
        movdqa          xmm2,  xmm4

        ; plain sums
        paddusw         xmm15, xmm3             ; sum_s
        paddusw         xmm14, xmm4             ; sum_r

        ; pmaddwd multiplies word lanes and adds adjacent pairs into dwords
        pmaddwd         xmm1,  xmm1             ; s * s
        pmaddwd         xmm2,  xmm2             ; r * r
        pmaddwd         xmm3,  xmm4             ; s * r

        paddd           xmm13, xmm1             ; sum_sq_s
        paddd           xmm12, xmm2             ; sum_sq_r
        paddd           xmm11, xmm3             ; sum_sxr
%endmacro
26ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;-----------------------------------------------------------------------
; SUM_ACROSS_Q reg - horizontally add the four 32-bit lanes of %1.
;
; The dwords are first widened to qwords (interleaved with xmm0) and
; added pairwise, then the two qword halves are added together.
;
; In:    %1   = register holding four dword partial sums
;        xmm0 = all zeroes (used as the widening source)
; Out:   %1   = total in the low qword, high qword taken from xmm0
; Clobb: xmm2
; Note:  %1 must not be xmm0 or xmm2.
;-----------------------------------------------------------------------
%macro SUM_ACROSS_Q 1
        ; step 1: 4 x dword -> 2 x qword
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0              ; upper two dwords, widened
        punpckldq       %1,   xmm0              ; lower two dwords, widened
        paddq           %1,   xmm2

        ; step 2: 2 x qword -> 1 x qword
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0              ; high qword moved down
        punpcklqdq      %1,   xmm0              ; low qword kept
        paddq           %1,   xmm2
%endmacro
38ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;-----------------------------------------------------------------------
; SUM_ACROSS_W reg - horizontally add the eight 16-bit lanes of %1.
;
; The words are widened to dwords (interleaved with xmm0) and added
; pairwise; SUM_ACROSS_Q then finishes the reduction.
;
; In:    %1   = register holding eight word partial sums
;        xmm0 = all zeroes (used as the widening source)
; Out:   %1   = total in the low qword
; Clobb: xmm1, plus xmm2 through SUM_ACROSS_Q
; Note:  %1 must not be xmm0, xmm1 or xmm2.
;-----------------------------------------------------------------------
%macro SUM_ACROSS_W 1
        ; 8 x word -> 4 x dword
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0              ; upper four words, widened
        punpcklwd       %1,   xmm0              ; lower four words, widened
        paddd           %1,   xmm1

        ; 4 x dword -> single total
        SUM_ACROSS_Q    %1
%endmacro
;-----------------------------------------------------------------------
; void vp9_ssim_parms_16x16_sse2(
;     unsigned char *s,
;     int sp,
;     unsigned char *r,
;     int rp,
;     unsigned long *sum_s,
;     unsigned long *sum_r,
;     unsigned long *sum_sq_s,
;     unsigned long *sum_sq_r,
;     unsigned long *sum_sxr);
;
; Walks a 16x16 block of source (s, row stride sp) and reference
; (r, row stride rp) pixels and writes five totals through the output
; pointers: sum of s, sum of r, sum of s*s, sum of r*r and sum of s*r.
;
; Register roles:
;   rsi / rdi    current source / reference row
;   rcx / rax    source / reference stride, sign-extended from int
;   edx          rows remaining
;   xmm0         constant zero (byte->word widening, horizontal sums)
;   xmm5 / xmm6  raw 16-byte source / reference row
;   xmm3 / xmm4  eight widened source / reference pixels
;   xmm11-xmm15  accumulators (see TABULATE_SSIM)
;   xmm1 / xmm2  scratch used by the helper macros
;
; Each word lane of sum_s / sum_r receives 32 byte values (16 rows x 2
; halves), at most 32 * 255 = 8160, so the saturating adds never clamp.
;
; NOTE(review): each result is stored with movd, i.e. only 32 bits are
; written. If the caller really passes pointers to 64-bit unsigned long
; it must zero them beforehand - confirm against the C prototype.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
sym(vp9_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots
    ; are defined, so sign-extend them before using them in 64-bit
    ; pointer arithmetic.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0                  ;zero for unpacking
    pxor            xmm15,xmm15                 ;sum_s
    pxor            xmm14,xmm14                 ;sum_r
    pxor            xmm13,xmm13                 ;sum_sq_s
    pxor            xmm12,xmm12                 ;sum_sq_r
    pxor            xmm11,xmm11                 ;sum_sxr

    mov             edx, 16                     ;row counter
.NextRow:

    ;grab source and reference pixels (no alignment assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ;pixels 8..15 of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0                  ; high_s
    punpckhbw       xmm4, xmm0                  ; high_r

    TABULATE_SSIM

    ;pixels 0..7 of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0                  ; low_s
    punpcklbw       xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add             rsi, rcx                    ; next s row
    add             rdi, rax                    ; next r row

    dec             edx                         ; counter
    jnz .NextRow

    ;collapse every accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15                ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14                ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13                ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12                ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
136ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;-----------------------------------------------------------------------
; void vp9_ssim_parms_8x8_sse2(
;     unsigned char *s,
;     int sp,
;     unsigned char *r,
;     int rp,
;     unsigned long *sum_s,
;     unsigned long *sum_r,
;     unsigned long *sum_sq_s,
;     unsigned long *sum_sq_r,
;     unsigned long *sum_sxr);
;
; Walks an 8x8 block of source (s, row stride sp) and reference
; (r, row stride rp) pixels and writes five totals through the output
; pointers: sum of s, sum of r, sum of s*s, sum of r*r and sum of s*r.
;
; Register roles:
;   rsi / rdi    current source / reference row
;   rcx / rax    source / reference stride, sign-extended from int
;   edx          rows remaining
;   xmm0         constant zero (byte->word widening, horizontal sums)
;   xmm3 / xmm4  eight widened source / reference pixels
;   xmm11-xmm15  accumulators (see TABULATE_SSIM)
;   xmm1 / xmm2  scratch used by the helper macros
;
; Each word lane of sum_s / sum_r receives 8 byte values (one per row),
; at most 8 * 255 = 2040, so the saturating adds never clamp.
;
; NOTE(review): each result is stored with movd, i.e. only 32 bits are
; written. If the caller really passes pointers to 64-bit unsigned long
; it must zero them beforehand - confirm against the C prototype.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
;-----------------------------------------------------------------------
global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
sym(vp9_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots
    ; are defined, so sign-extend them before using them in 64-bit
    ; pointer arithmetic.
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0                  ;zero for unpacking
    pxor            xmm15,xmm15                 ;sum_s
    pxor            xmm14,xmm14                 ;sum_r
    pxor            xmm13,xmm13                 ;sum_sq_s
    pxor            xmm12,xmm12                 ;sum_sq_r
    pxor            xmm11,xmm11                 ;sum_sxr

    mov             edx, 8                      ;row counter
.NextRow:

    ;grab eight source and reference pixels and widen them to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0                  ; low_s
    punpcklbw       xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add             rsi, rcx                    ; next s row
    add             rdi, rax                    ; next r row

    dec             edx                         ; counter
    jnz .NextRow

    ;collapse every accumulator to a single total in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15                ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14                ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13                ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12                ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
217