1/* 16-bit signed integer dot product
2 * Altivec-assisted version
3 * Copyright 2004 Phil Karn
4 * May be used under the terms of the GNU Lesser General Public License (LGPL)
5 */
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
8
struct dotprod {
  /* Number of coefficients */
  int len;

  /* Eight zero-padded copies of the coefficient array. Copy k starts
   * k 16-bit elements into its buffer, so whatever the offset of the
   * input data within a 16-byte block, one copy lines up with it.
   */
  signed short *coeffs[8];
};
17
18/* Create and return a descriptor for use with the dot product function */
19void *initdp_av(signed short coeffs[],int len){
20  struct dotprod *dp;
21  int i,j;
22
23  if(len == 0)
24    return NULL;
25
26  dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
27  dp->len = len;
28
29  /* Make 8 copies of coefficients, one for each data alignment,
30   * each aligned to 16-byte boundary
31   */
32  for(i=0;i<8;i++){
33    dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
34    for(j=0;j<len;j++)
35      dp->coeffs[i][j+i] = coeffs[j];
36  }
37  return (void *)dp;
38}
39
40
41/* Free a dot product descriptor created earlier */
42void freedp_av(void *p){
43  struct dotprod *dp = (struct dotprod *)p;
44  int i;
45
46  for(i=0;i<8;i++)
47    if(dp->coeffs[i] != NULL)
48      free(dp->coeffs[i]);
49  free(dp);
50}
51
52/* Compute a dot product given a descriptor and an input array
53 * The length is taken from the descriptor
54 */
55long dotprod_av(void *p,signed short a[]){
56  struct dotprod *dp = (struct dotprod *)p;
57  int al;
58  vector signed short *ar,*d;
59  vector signed int sums0,sums1,sums2,sums3;
60  union { vector signed int v; signed int w[4];} s;
61  int nblocks;
62
63  /* round ar down to beginning of 16-byte block containing 0th element of
64   * input buffer. Then set d to one of 8 sets of shifted coefficients
65   */
66  ar = (vector signed short *)((int)a & ~15);
67  al = ((int)a & 15)/sizeof(signed short);
68  d = (vector signed short *)dp->coeffs[al];
69
70  nblocks = (dp->len+al-1)/8+1;
71
72  /* Sum into four vectors each holding four 32-bit partial sums */
73  sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
74  while(nblocks >= 4){
75    sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
76    sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
77    sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
78    sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
79    nblocks -= 4;
80  }
81  sums0 = vec_adds(sums0,sums1);
82  sums2 = vec_adds(sums2,sums3);
83  sums0 = vec_adds(sums0,sums2);
84  while(nblocks-- > 0){
85    sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
86  }
87  /* Sum 4 partial sums into final result */
88  s.v = vec_sums(sums0,(vector signed int)(0));
89
90  return s.w[3];
91}
92
93
94