# SSE2 assist routines for sumsq
# Copyright 2001 Phil Karn, KA9Q
# May be used under the terms of the GNU Public License (GPL)

# Evaluate the sum of squares of signed 16-bit input samples.
#
#   long long sumsq_sse2_assist(signed short *in,int cnt);
#
# ABI:    32-bit x86, arguments on the stack, 64-bit result in %edx:%eax
# In:     4(%esp) = in   pointer to the samples; read with aligned 16-byte
#                        loads, so it must be 16-byte aligned
#         8(%esp) = cnt  number of samples; consumed in groups of 8, a
#                        trailing remainder of fewer than 8 samples is not
#                        read (cnt < 8, including negative, returns 0)
# Out:    %edx:%eax = sum of in[i]*in[i] over the groups processed
# Clobb:  %eax, %ecx, %edx, %xmm0-%xmm3, flags
# Stack:  none used (leaf routine, no frame)
	.text
	.global sumsq_sse2_assist
	.type sumsq_sse2_assist,@function
	.p2align 4
sumsq_sse2_assist:
	movl	4(%esp),%edx		# edx = in
	movl	8(%esp),%ecx		# ecx = cnt
	pxor	%xmm2,%xmm2		# xmm2 = two 64-bit partial sums, start at zero

	# Build the "low dword of each qword" mask in a register instead of
	# loading it from memory, so the code needs no absolute data address.
	pcmpeqd	%xmm3,%xmm3		# all ones
	psrlq	$32,%xmm3		# 0x00000000ffffffff in each 64-bit lane

	subl	$8,%ecx			# is there a full group of 8 samples?
	jl	2f

	# Invariant at 1: ecx >= 0 is the count left after the group at (%edx).
1:	movdqa	(%edx),%xmm0		# S0 S1 S2 S3 S4 S5 S6 S7
	pmaddwd	%xmm0,%xmm0		# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
	movdqa	%xmm0,%xmm1
	pand	%xmm3,%xmm1		# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
	psrlq	$32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
	paddq	%xmm1,%xmm2		# accumulate even-numbered dwords as 64-bit
	paddq	%xmm0,%xmm2		# accumulate odd-numbered dwords as 64-bit
	addl	$16,%edx		# advance to the next 8 samples
	subl	$8,%ecx
	jge	1b			# another full group remains

2:	movdqa	%xmm2,%xmm0
	psrldq	$8,%xmm0		# bring the upper 64-bit partial sum down
	paddq	%xmm2,%xmm0		# low qword = total of both partial sums

	movd	%xmm0,%eax		# low 32 bits of sum
	psrldq	$4,%xmm0
	movd	%xmm0,%edx		# high 32 bits of sum
	ret
	.size sumsq_sse2_assist,.-sumsq_sse2_assist

# Mask that keeps the low 32 bits of each 64-bit lane of an XMM register
# (bytes ff ff ff ff 00 00 00 00, twice).  It is never written, so it is
# placed in read-only data rather than .data.
	.section .rodata
	.p2align 4
	.type low,@object
low:	.byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0
	.size low,.-low

# Mark this object as not requiring an executable stack; without this
# note GNU linkers assume hand-written assembly needs one.
	.section .note.GNU-stack,"",@progbits