# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)
#
# Syntax: GNU as, AT&T operand order (op src,dst).
# Target: 32-bit x86 ELF, arguments passed on the stack.

	.text
# Evaluate sum of squares of signed 16-bit input samples
# long long sumsq_sse2_assist(signed short *in,int cnt);
#
# In:	 8(%ebp) = in	pointer to the samples; must be 16-byte aligned,
#			because the samples are fetched with movdqa
#	12(%ebp) = cnt	number of samples; only whole groups of 8 are
#			summed, the trailing cnt%8 samples are ignored and
#			cnt < 8 (including negative values) yields 0
# Out:	%edx:%eax = 64-bit sum of squares
# Clobbers: %xmm0-%xmm3, flags (%esi and %ecx are saved and restored)
	.global	sumsq_sse2_assist
	.type	sumsq_sse2_assist,@function
	.align	16
sumsq_sse2_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%ecx

	movl	8(%ebp),%esi		# esi = in
	movl	12(%ebp),%ecx		# ecx = cnt (samples still to process)
	pxor	%xmm2,%xmm2		# zero sum (two 64-bit partial sums)

	# Build the "low dword of each qword" mask in a register instead of
	# loading it from memory, so the routine has no absolute data reference
	# (no text relocation when linked into position-independent code).
	pcmpeqd	%xmm3,%xmm3		# all ones
	psrlq	$32,%xmm3		# FFFFFFFF 0 FFFFFFFF 0

1:	subl	$8,%ecx			# one group = 8 samples = 16 bytes
	jl	2f			# fewer than 8 left: done
	movdqa	(%esi),%xmm0		# S0 S1 S2 S3 S4 S5 S6 S7
	pmaddwd	%xmm0,%xmm0		# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
	movdqa	%xmm0,%xmm1
	# Each pair sum is at most 2^31, so it is widened to 64 bits by
	# zero-extension (mask / logical shift), never by sign-extension.
	pand	%xmm3,%xmm1		# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
	paddq	%xmm1,%xmm2		# sum even-numbered dwords
	psrlq	$32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
	paddq	%xmm0,%xmm2		# sum odd-numbered dwords
	addl	$16,%esi
	jmp	1b

2:	movdqa	%xmm2,%xmm0
	psrldq	$8,%xmm0		# bring the upper 64-bit sum down
	paddq	%xmm2,%xmm0		# combine 64-bit sums

	movd	%xmm0,%eax		# low 32 bits of sum
	psrldq	$4,%xmm0
	movd	%xmm0,%edx		# high 32 bits of sum

	popl	%ecx
	popl	%esi
	popl	%ebp
	ret
	.size	sumsq_sse2_assist,.-sumsq_sse2_assist

# Tell the linker this object does not need an executable stack.
	.section	.note.GNU-stack,"",@progbits