;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; TABULATE_SSIM - fold eight source/reference pixel pairs into the
;                 running SSIM sums.
;
; In:    xmm3  = eight source pixels, one per 16-bit lane
;        xmm4  = eight reference pixels, one per 16-bit lane
; Accum: xmm15 += s      (eight word lanes, saturating)   sum_s
;        xmm14 += r      (eight word lanes, saturating)   sum_r
;        xmm13 += s*s    (four dword lanes)               sum_sq_s
;        xmm12 += r*r    (four dword lanes)               sum_sq_r
;        xmm11 += s*r    (four dword lanes)               sum_sxr
; Clobb: xmm1, xmm2, xmm3
;
; pmaddwd produces four 32-bit lanes, so the products are accumulated
; with paddd. A 64-bit add would let a carry out of an even dword lane
; spill into the odd lane next to it.
;-----------------------------------------------------------------------
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r

        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1      ; s*s, adjacent pairs added
        paddd           xmm13, xmm1     ; sum_sq_s

        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2      ; r*r, adjacent pairs added
        paddd           xmm12, xmm2     ; sum_sq_r

        pmaddwd         xmm3, xmm4      ; s*r, adjacent pairs added
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro

;-----------------------------------------------------------------------
; SUM_ACROSS_Q - horizontally add the four dword lanes of %1.
;
; In:    %1    = xmm register holding four unsigned dword lanes
;        xmm0  = 0
; Out:   %1    = total in the low qword, high qword zero
; Clobb: xmm2
;-----------------------------------------------------------------------
%macro SUM_ACROSS_Q 1
        ; widen the four dwords to qwords (two per register) and add
        movdqa          xmm2, %1
        punpckldq       %1, xmm0
        punpckhdq       xmm2, xmm0
        paddq           %1, xmm2

        ; add the high qword onto the low qword
        movdqa          xmm2, %1
        punpcklqdq      %1, xmm0
        punpckhqdq      xmm2, xmm0
        paddq           %1, xmm2
%endmacro

;-----------------------------------------------------------------------
; SUM_ACROSS_W - horizontally add the eight word lanes of %1.
;
; In:    %1    = xmm register holding eight unsigned word lanes
;        xmm0  = 0
; Out:   %1    = total in the low qword, high qword zero
; Clobb: xmm1, xmm2
;-----------------------------------------------------------------------
%macro SUM_ACROSS_W 1
        ; widen the eight words to dwords (four per register) and add
        movdqa          xmm1, %1
        punpcklwd       %1, xmm0
        punpckhwd       xmm1, xmm0
        paddd           %1, xmm1

        SUM_ACROSS_Q    %1
%endmacro

;void vp8_ssim_parms_16x16_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 16 rows of 16 pixels from s and r and stores, through the five
; output pointers, the sums of s, r, s*s, r*r and s*r over the block.
;
; Register roles:
;    rsi   = current source row          rcx = source stride (sp)
;    rdi   = current reference row       rax = reference stride (rp)
;    rdx   = rows remaining              xmm0 = zero (for unpacking)
;    xmm15/14/13/12/11 = running sum_s / sum_r / sum_sq_s / sum_sq_r / sum_sxr
; Clobb: rax, rcx, rdx, xmm0-xmm6, xmm11-xmm15, flags
;
; NOTE(review): xmm6 and xmm11-xmm15 are modified without being saved
; here. The Microsoft x64 convention treats xmm6-xmm15 as callee-saved;
; confirm whether this file is built for that ABI.
; NOTE(review): each result is written as 8 bytes (movq); confirm that
; unsigned long is 8 bytes wide on every target this is built for.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; s
    mov         rdi,        arg(2)          ; r
    ; sp and rp are ints: only the low 32 bits of their argument slots
    ; are defined, so sign-extend instead of loading all 64 bits.
    movsxd      rcx,        dword arg(1)    ; sp
    movsxd      rax,        dword arg(3)    ; rp

    pxor        xmm0, xmm0                  ; zero for byte->word unpacking
    pxor        xmm15, xmm15                ; sum_s
    pxor        xmm14, xmm14                ; sum_r
    pxor        xmm13, xmm13                ; sum_sq_s
    pxor        xmm12, xmm12                ; sum_sq_r
    pxor        xmm11, xmm11                ; sum_sxr

    mov         rdx, 16                     ; row counter
.next_row:

    ; grab 16 source and 16 reference pixels (no alignment assumed)
    movdqu      xmm5, [rsi]
    movdqu      xmm6, [rdi]

    ; upper eight pixels of the row, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpckhbw   xmm3, xmm0                  ; high_s
    punpckhbw   xmm4, xmm0                  ; high_r

    TABULATE_SSIM

    ; lower eight pixels of the row, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0                  ; low_s
    punpcklbw   xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add         rsi, rcx                    ; next s row
    add         rdi, rax                    ; next r row

    dec         rdx                         ; counter
    jnz         .next_row

    ; collapse every accumulator to a single total in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov         rdi, arg(4)
    movq        [rdi], xmm15                ; *sum_s
    mov         rdi, arg(5)
    movq        [rdi], xmm14                ; *sum_r
    mov         rdi, arg(6)
    movq        [rdi], xmm13                ; *sum_sq_s
    mov         rdi, arg(7)
    movq        [rdi], xmm12                ; *sum_sq_r
    mov         rdi, arg(8)
    movq        [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_ssim_parms_8x8_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for ssim (8x8 here) so we can play with dssim as
; distortion in mode selection code.
;
; Walks 8 rows of 8 pixels from s and r and stores, through the five
; output pointers, the sums of s, r, s*s, r*r and s*r over the block.
;
; Register roles:
;    rsi   = current source row          rcx = source stride (sp)
;    rdi   = current reference row       rax = reference stride (rp)
;    rdx   = rows remaining              xmm0 = zero (for unpacking)
;    xmm15/14/13/12/11 = running sum_s / sum_r / sum_sq_s / sum_sq_r / sum_sxr
; Clobb: rax, rcx, rdx, xmm0-xmm6, xmm11-xmm15, flags
;
; NOTE(review): xmm6 and xmm11-xmm15 are modified without being saved
; here. The Microsoft x64 convention treats xmm6-xmm15 as callee-saved;
; confirm whether this file is built for that ABI.
; NOTE(review): each result is written as 8 bytes (movq); confirm that
; unsigned long is 8 bytes wide on every target this is built for.
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; s
    mov         rdi,        arg(2)          ; r
    ; sp and rp are ints: only the low 32 bits of their argument slots
    ; are defined, so sign-extend instead of loading all 64 bits.
    movsxd      rcx,        dword arg(1)    ; sp
    movsxd      rax,        dword arg(3)    ; rp

    pxor        xmm0, xmm0                  ; zero for byte->word unpacking
    pxor        xmm15, xmm15                ; sum_s
    pxor        xmm14, xmm14                ; sum_r
    pxor        xmm13, xmm13                ; sum_sq_s
    pxor        xmm12, xmm12                ; sum_sq_r
    pxor        xmm11, xmm11                ; sum_sxr

    mov         rdx, 8                      ; row counter
.next_row:

    ; grab 8 source and 8 reference pixels (movq clears the upper half)
    movq        xmm5, [rsi]
    movq        xmm6, [rdi]

    ; widen the eight pixels of the row to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0                  ; low_s
    punpcklbw   xmm4, xmm0                  ; low_r

    TABULATE_SSIM

    add         rsi, rcx                    ; next s row
    add         rdi, rax                    ; next r row

    dec         rdx                         ; counter
    jnz         .next_row

    ; collapse every accumulator to a single total in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov         rdi, arg(4)
    movq        [rdi], xmm15                ; *sum_s
    mov         rdi, arg(5)
    movq        [rdi], xmm14                ; *sum_r
    mov         rdi, arg(6)
    movq        [rdi], xmm13                ; *sum_sq_s
    mov         rdi, arg(7)
    movq        [rdi], xmm12                ; *sum_sq_r
    mov         rdi, arg(8)
    movq        [rdi], xmm11                ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret