;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Register conventions shared by everything in this file:
;   xmm0        must be all zeroes (used as the zero-extension operand)
;   xmm1-xmm4   scratch
;   xmm5, xmm6  raw source / reference bytes of the current row
;   xmm15       sum_s     (8 word lanes)
;   xmm14       sum_r     (8 word lanes)
;   xmm13       sum_sq_s  (4 dword lanes)
;   xmm12       sum_sq_r  (4 dword lanes)
;   xmm11       sum_sxr   (4 dword lanes)

; TABULATE_SSIM - accumulate sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr.
;
; In:       xmm3 = 8 source pixels, xmm4 = 8 reference pixels, both already
;           zero-extended from bytes to words by the caller.
; Out:      xmm11-xmm15 updated as listed above.
; Clobbers: xmm1, xmm2, xmm3.
;
; pmaddwd produces four dword lanes, so the squared/cross sums are
; accumulated with paddd to match both the producer and the dword-based
; reduction in SUM_ACROSS_Q. The macro is expanded at most 32 times per call
; (16 rows x 2 halves), so a dword lane never exceeds 32 * 2 * 255 * 255 and
; a word lane never exceeds 32 * 255; no lane can overflow.
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r
        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1      ; s * s, adjacent pairs added
        paddd           xmm13, xmm1     ; sum_sq_s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2      ; r * r, adjacent pairs added
        paddd           xmm12, xmm2     ; sum_sq_r
        pmaddwd         xmm3, xmm4      ; s * r, adjacent pairs added
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontally add the four dword lanes of register %1.
;
; The dwords are zero-extended to qwords first, so the additions are done
; at 64-bit width. On exit the low qword of %1 holds the total and the high
; qword is zero.
;
; Requires: xmm0 == 0.
; Clobbers: xmm2.
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckldq       %1, xmm0        ; (d0, d1) as qwords
        punpckhdq       xmm2, xmm0      ; (d2, d3) as qwords
        paddq           %1, xmm2        ; (d0 + d2, d1 + d3)
        movdqa          xmm2, %1
        punpcklqdq      %1, xmm0        ; (d0 + d2, 0)
        punpckhqdq      xmm2, xmm0      ; (d1 + d3, 0)
        paddq           %1, xmm2        ; (total, 0)
%endmacro

; SUM_ACROSS_W - horizontally add the eight word lanes of register %1.
;
; The words are zero-extended to dwords and folded into four dword lanes,
; which are then reduced by SUM_ACROSS_Q. On exit the low qword of %1 holds
; the total and the high qword is zero.
;
; Requires: xmm0 == 0.
; Clobbers: xmm1, xmm2.
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpcklwd       %1, xmm0        ; (w0, w1, w2, w3) as dwords
        punpckhwd       xmm1, xmm0      ; (w4, w5, w6, w7) as dwords
        paddd           %1, xmm1
        SUM_ACROSS_Q    %1
%endmacro

;void vp8_ssim_parms_16x16_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 16 rows of 16 pixels from s (stride sp) and r (stride rp) and
; stores, through the five output pointers, the sum of s, the sum of r, the
; sum of s*s, the sum of r*r and the sum of s*r. Each output is overwritten
; (not accumulated into) with an 8-byte store.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;
; NOTE(review): xmm6 and xmm11-xmm15 are modified and never saved/restored
; here. The Microsoft x64 calling convention treats xmm6-xmm15 as
; callee-saved - confirm whether this file is built for Win64.
; NOTE(review): sp and rp are declared int in the prototype above but are
; loaded as full 64-bit values - confirm arg() yields properly extended
; values for them.
; NOTE(review): every result is written with a 64-bit movq - confirm that
; unsigned long is 8 bytes wide on every target that uses this routine.
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s
    mov         rcx, arg(1)             ; sp
    mov         rdi, arg(2)             ; r
    mov         rax, arg(3)             ; rp

    pxor        xmm0, xmm0              ; zero, for the unpack instructions
    pxor        xmm15, xmm15            ; sum_s
    pxor        xmm14, xmm14            ; sum_r
    pxor        xmm13, xmm13            ; sum_sq_s
    pxor        xmm12, xmm12            ; sum_sq_r
    pxor        xmm11, xmm11            ; sum_sxr

    mov         rdx, 16                 ; row counter
.next_row:
    ; grab 16 source and 16 reference pixels (unaligned loads)
    movdqu      xmm5, [rsi]
    movdqu      xmm6, [rdi]

    ; upper 8 pixels of the row, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpckhbw   xmm3, xmm0              ; high_s
    punpckhbw   xmm4, xmm0              ; high_r

    TABULATE_SSIM

    ; lower 8 pixels of the row, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0              ; low_s
    punpcklbw   xmm4, xmm0              ; low_r

    TABULATE_SSIM

    add         rsi, rcx                ; next s row
    add         rdi, rax                ; next r row

    dec         rdx                     ; counter
    jnz         .next_row

    ; collapse every accumulator to a single value in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov         rdi, arg(4)
    movq        [rdi], xmm15            ; *sum_s
    mov         rdi, arg(5)
    movq        [rdi], xmm14            ; *sum_r
    mov         rdi, arg(6)
    movq        [rdi], xmm13            ; *sum_sq_s
    mov         rdi, arg(7)
    movq        [rdi], xmm12            ; *sum_sq_r
    mov         rdi, arg(8)
    movq        [rdi], xmm11            ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_ssim_parms_8x8_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Same computation as the 16x16 routine, but over 8 rows of 8 pixels: each
; row is fetched with a single 8-byte load and tabulated once.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;
; NOTE(review): xmm6 and xmm11-xmm15 are modified and never saved/restored
; here. The Microsoft x64 calling convention treats xmm6-xmm15 as
; callee-saved - confirm whether this file is built for Win64.
; NOTE(review): sp and rp are declared int in the prototype above but are
; loaded as full 64-bit values - confirm arg() yields properly extended
; values for them.
; NOTE(review): every result is written with a 64-bit movq - confirm that
; unsigned long is 8 bytes wide on every target that uses this routine.
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s
    mov         rcx, arg(1)             ; sp
    mov         rdi, arg(2)             ; r
    mov         rax, arg(3)             ; rp

    pxor        xmm0, xmm0              ; zero, for the unpack instructions
    pxor        xmm15, xmm15            ; sum_s
    pxor        xmm14, xmm14            ; sum_r
    pxor        xmm13, xmm13            ; sum_sq_s
    pxor        xmm12, xmm12            ; sum_sq_r
    pxor        xmm11, xmm11            ; sum_sxr

    mov         rdx, 8                  ; row counter
.next_row:
    ; grab 8 source and 8 reference pixels (upper half of xmm5/xmm6 is zeroed)
    movq        xmm5, [rsi]
    movq        xmm6, [rdi]

    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0              ; low_s
    punpcklbw   xmm4, xmm0              ; low_r

    TABULATE_SSIM

    add         rsi, rcx                ; next s row
    add         rdi, rax                ; next r row

    dec         rdx                     ; counter
    jnz         .next_row

    ; collapse every accumulator to a single value in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov         rdi, arg(4)
    movq        [rdi], xmm15            ; *sum_s
    mov         rdi, arg(5)
    movq        [rdi], xmm14            ; *sum_r
    mov         rdi, arg(6)
    movq        [rdi], xmm13            ; *sum_sq_s
    mov         rdi, arg(7)
    movq        [rdi], xmm12            ; *sum_sq_r
    mov         rdi, arg(8)
    movq        [rdi], xmm11            ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret