1# SSE2 assist routines for sumsq
2# Copyright 2001 Phil Karn, KA9Q
3# May be used under the terms of the GNU Public License (GPL)
4
5	.text
# Evaluate sum of squares of signed 16-bit input samples
#  long long sumsq_sse2_assist(signed short *in,int cnt);
#
# ABI:   i386 cdecl; arguments on the stack, 64-bit result in %edx:%eax
# In:    in  = 8(%ebp), pointer to the samples. They are read with movaps,
#              so the pointer must sit on a 16-byte boundary.
#        cnt = 12(%ebp), number of samples. Samples are consumed in groups
#              of eight; a trailing partial group (cnt mod 8) is not summed.
# Out:   %edx:%eax = sum of squares of the samples that were consumed
#              (zero when cnt is below eight, including negative cnt)
# Clobb: %xmm0-%xmm3 and flags. %esi and %ecx are saved and restored.
	.global sumsq_sse2_assist
	.type sumsq_sse2_assist,@function
	.align 16
sumsq_sse2_assist:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %ecx		# caller-saved under the i386 ABI; still preserved so callers see no change

	movl 8(%ebp),%esi	# esi = in
	movl 12(%ebp),%ecx	# ecx = cnt
	pxor %xmm2,%xmm2		# zero sum (two 64-bit accumulators)

	# Mask selecting the low dword of each qword. It is generated in
	# registers rather than loaded from a memory constant, so the routine
	# makes no absolute data reference (no text relocation in PIC builds).
	pcmpeqd %xmm3,%xmm3	# all bits set
	psrlq $32,%xmm3		# FFFFFFFF 00000000 FFFFFFFF 00000000 (dwords, low first)

1:	subl $8,%ecx		# ecx = samples left after this group
	jl 2f			# fewer than eight left: stop
	movaps (%esi),%xmm0	# S0 S1 S2 S3 S4 S5 S6 S7
	pmaddwd %xmm0,%xmm0	# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
	# Each dword is zero-extended to 64 bits before it is accumulated, so
	# a pair of -32768 samples (dword 0x80000000) is counted as 2^31.
	movaps %xmm0,%xmm1
	pand %xmm3,%xmm1	# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
	paddq %xmm1,%xmm2	# sum even-numbered dwords
	psrlq $32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
	paddq %xmm0,%xmm2	# sum odd-numbered dwords
	addl $16,%esi		# advance past the eight samples just consumed
	jmp 1b

2:	movaps %xmm2,%xmm0
	psrldq $8,%xmm0		# bring the upper 64-bit accumulator down
	paddq %xmm2,%xmm0	# combine 64-bit sums

	movd %xmm0,%eax		# low 32 bits of sum
	psrldq $4,%xmm0
	movd %xmm0,%edx		# high 32 bits of sum

	popl %ecx
	popl %esi
	popl %ebp
	ret
	.size sumsq_sse2_assist,.-sumsq_sse2_assist

# Mark this object as not needing an executable stack. GNU ld may otherwise
# assume an assembly object without this note requires one.
	.pushsection .note.GNU-stack,"",@progbits
	.popsection
50