1/** Compute the matrix inverse via Gauss-Jordan elimination.
2 *  This program uses only barriers to separate computation steps but no
3 *  mutexes. It is an example of a race-free program on which no data races
4 *  are reported by the happens-before algorithm (drd), but a lot of data races
5 *  (all false positives) are reported by the Eraser-algorithm (helgrind).
6 */
7
8
9#define _GNU_SOURCE
10
11/***********************/
12/* Include directives. */
13/***********************/
14
15#include <assert.h>
16#include <math.h>
17#include <limits.h>  // PTHREAD_STACK_MIN
18#include <pthread.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <unistd.h>  // getopt()
22
23
24/*********************/
25/* Type definitions. */
26/*********************/
27
28typedef double elem_t;
29
30struct gj_threadinfo
31{
32  pthread_barrier_t* b;
33  pthread_t tid;
34  elem_t* a;
35  int rows;
36  int cols;
37  int r0;
38  int r1;
39};
40
41
42/********************/
43/* Local variables. */
44/********************/
45
46static int s_nthread = 1;
47
48
49/*************************/
50/* Function definitions. */
51/*************************/
52
53/** Allocate memory for a matrix with the specified number of rows and
54 *  columns.
55 */
56static elem_t* new_matrix(const int rows, const int cols)
57{
58  assert(rows > 0);
59  assert(cols > 0);
60  return malloc(rows * cols * sizeof(elem_t));
61}
62
63/** Free the memory that was allocated for a matrix. */
64static void delete_matrix(elem_t* const a)
65{
66  free(a);
67}
68
69/** Fill in some numbers in a matrix. */
70static void init_matrix(elem_t* const a, const int rows, const int cols)
71{
72  int i, j;
73  for (i = 0; i < rows; i++)
74  {
75    for (j = 0; j < rows; j++)
76    {
77      a[i * cols + j] = 1.0 / (1 + abs(i-j));
78    }
79  }
80}
81
82/** Print all elements of a matrix. */
83void print_matrix(const char* const label,
84                  const elem_t* const a, const int rows, const int cols)
85{
86  int i, j;
87  printf("%s:\n", label);
88  for (i = 0; i < rows; i++)
89  {
90    for (j = 0; j < cols; j++)
91    {
92      printf("%g ", a[i * cols + j]);
93    }
94    printf("\n");
95  }
96}
97
98/** Copy a subset of the elements of a matrix into another matrix. */
99static void copy_matrix(const elem_t* const from,
100                        const int from_rows,
101                        const int from_cols,
102                        const int from_row_first,
103                        const int from_row_last,
104                        const int from_col_first,
105                        const int from_col_last,
106                        elem_t* const to,
107                        const int to_rows,
108                        const int to_cols,
109                        const int to_row_first,
110                        const int to_row_last,
111                        const int to_col_first,
112                        const int to_col_last)
113{
114  int i, j;
115
116  assert(from_row_last - from_row_first == to_row_last - to_row_first);
117  assert(from_col_last - from_col_first == to_col_last - to_col_first);
118
119  for (i = from_row_first; i < from_row_last; i++)
120  {
121    assert(i < from_rows);
122    assert(i - from_row_first + to_row_first < to_rows);
123    for (j = from_col_first; j < from_col_last; j++)
124    {
125      assert(j < from_cols);
126      assert(j - from_col_first + to_col_first < to_cols);
127      to[(i - from_row_first + to_col_first) * to_cols
128         + (j - from_col_first + to_col_first)]
129        = from[i * from_cols + j];
130    }
131  }
132}
133
134/** Compute the matrix product of a1 and a2. */
135static elem_t* multiply_matrices(const elem_t* const a1,
136                                 const int rows1,
137                                 const int cols1,
138                                 const elem_t* const a2,
139                                 const int rows2,
140                                 const int cols2)
141{
142  int i, j, k;
143  elem_t* prod;
144
145  assert(cols1 == rows2);
146
147  prod = new_matrix(rows1, cols2);
148  for (i = 0; i < rows1; i++)
149  {
150    for (j = 0; j < cols2; j++)
151    {
152      prod[i * cols2 + j] = 0;
153      for (k = 0; k < cols1; k++)
154      {
155        prod[i * cols2 + j] += a1[i * cols1 + k] * a2[k * cols2 + j];
156      }
157    }
158  }
159  return prod;
160}
161
162/** Apply the Gauss-Jordan elimination algorithm on the matrix p->a starting
163 *  at row r0 and up to but not including row r1. It is assumed that as many
164 *  threads execute this function concurrently as the count barrier p->b was
165 *  initialized with. If the matrix p->a is nonsingular, and if matrix p->a
166 *  has at least as many columns as rows, the result of this algorithm is that
167 *  submatrix p->a[0..p->rows-1,0..p->rows-1] is the identity matrix.
168 * @see http://en.wikipedia.org/wiki/Gauss-Jordan_elimination
169 */
170static void gj_threadfunc(struct gj_threadinfo* p)
171{
172  int i, j, k;
173  elem_t* const a = p->a;
174  const int rows = p->rows;
175  const int cols = p->cols;
176  elem_t aii;
177
178  for (i = 0; i < p->rows; i++)
179  {
180    if (pthread_barrier_wait(p->b) == PTHREAD_BARRIER_SERIAL_THREAD)
181    {
182      // Pivoting.
183      j = i;
184      for (k = i + 1; k < rows; k++)
185      {
186        if (a[k * cols + i] > a[j * cols + i])
187        {
188          j = k;
189        }
190      }
191      if (j != i)
192      {
193        for (k = 0; k < cols; k++)
194        {
195          const elem_t t = a[i * cols + k];
196          a[i * cols + k] = a[j * cols + k];
197          a[j * cols + k] = t;
198        }
199      }
200      // Normalize row i.
201      aii = a[i * cols + i];
202      if (aii != 0)
203        for (k = i; k < cols; k++)
204          a[i * cols + k] /= aii;
205    }
206    pthread_barrier_wait(p->b);
207    // Reduce all rows j != i.
208    for (j = p->r0; j < p->r1; j++)
209    {
210      if (i != j)
211      {
212        const elem_t factor = a[j * cols + i];
213        for (k = 0; k < cols; k++)
214        {
215          a[j * cols + k] -= a[i * cols + k] * factor;
216        }
217      }
218    }
219  }
220}
221
222/** Multithreaded Gauss-Jordan algorithm. */
223static void gj(elem_t* const a, const int rows, const int cols)
224{
225  int i;
226  struct gj_threadinfo* t;
227  pthread_barrier_t b;
228  pthread_attr_t attr;
229  int err;
230
231  assert(rows <= cols);
232
233  t = malloc(sizeof(struct gj_threadinfo) * s_nthread);
234
235  pthread_barrier_init(&b, 0, s_nthread);
236
237  pthread_attr_init(&attr);
238  err = pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN + 4096);
239  assert(err == 0);
240
241  for (i = 0; i < s_nthread; i++)
242  {
243    t[i].b = &b;
244    t[i].a = a;
245    t[i].rows = rows;
246    t[i].cols = cols;
247    t[i].r0 = i * rows / s_nthread;
248    t[i].r1 = (i+1) * rows / s_nthread;
249    pthread_create(&t[i].tid, &attr, (void*(*)(void*))gj_threadfunc, &t[i]);
250  }
251
252  pthread_attr_destroy(&attr);
253
254  for (i = 0; i < s_nthread; i++)
255  {
256    pthread_join(t[i].tid, 0);
257  }
258
259  pthread_barrier_destroy(&b);
260
261  free(t);
262}
263
264/** Matrix inversion via the Gauss-Jordan algorithm. */
265static elem_t* invert_matrix(const elem_t* const a, const int n)
266{
267  int i, j;
268  elem_t* const inv = new_matrix(n, n);
269  elem_t* const tmp = new_matrix(n, 2*n);
270  copy_matrix(a, n, n, 0, n, 0, n, tmp, n, 2 * n, 0, n, 0, n);
271  for (i = 0; i < n; i++)
272    for (j = 0; j < n; j++)
273      tmp[i * 2 * n + n + j] = (i == j);
274  gj(tmp, n, 2*n);
275  copy_matrix(tmp, n, 2*n, 0, n, n, 2*n, inv, n, n, 0, n, 0, n);
276  delete_matrix(tmp);
277  return inv;
278}
279
280/** Compute the average square error between the identity matrix and the
281 * product of matrix a with its inverse matrix.
282 */
283static double identity_error(const elem_t* const a, const int n)
284{
285  int i, j;
286  elem_t e = 0;
287  for (i = 0; i < n; i++)
288  {
289    for (j = 0; j < n; j++)
290    {
291      const elem_t d = a[i * n + j] - (i == j);
292      e += d * d;
293    }
294  }
295  return sqrt(e / (n * n));
296}
297
298/** Compute epsilon for the numeric type elem_t. Epsilon is defined as the
299 *  smallest number for which the sum of one and that number is different of
300 *  one. It is assumed that the underlying representation of elem_t uses
301 *  base two.
302 */
303static elem_t epsilon()
304{
305  elem_t eps;
306  for (eps = 1; 1 + eps != 1; eps /= 2)
307    ;
308  return 2 * eps;
309}
310
311int main(int argc, char** argv)
312{
313  int matrix_size;
314  int silent = 0;
315  int optchar;
316  elem_t *a, *inv, *prod;
317  elem_t eps;
318  double error;
319  double ratio;
320
321  while ((optchar = getopt(argc, argv, "qt:")) != EOF)
322  {
323    switch (optchar)
324    {
325    case 'q': silent = 1; break;
326    case 't': s_nthread = atoi(optarg); break;
327    default:
328      fprintf(stderr, "Error: unknown option '%c'.\n", optchar);
329      return 1;
330    }
331  }
332
333  if (optind + 1 != argc)
334  {
335    fprintf(stderr, "Error: wrong number of arguments.\n");
336    return 1;
337  }
338  matrix_size = atoi(argv[optind]);
339
340  /* Error checking. */
341  assert(matrix_size >= 1);
342  assert(s_nthread >= 1);
343
344  eps = epsilon();
345  a = new_matrix(matrix_size, matrix_size);
346  init_matrix(a, matrix_size, matrix_size);
347  inv = invert_matrix(a, matrix_size);
348  prod = multiply_matrices(a, matrix_size, matrix_size,
349                           inv, matrix_size, matrix_size);
350  error = identity_error(prod, matrix_size);
351  ratio = error / (eps * matrix_size);
352  if (! silent)
353  {
354    printf("error = %g; epsilon = %g; error / (epsilon * n) = %g\n",
355           error, eps, ratio);
356  }
357  if (isfinite(ratio) && ratio < 100)
358    printf("Error within bounds.\n");
359  else
360    printf("Error out of bounds.\n");
361  delete_matrix(prod);
362  delete_matrix(inv);
363  delete_matrix(a);
364
365  return 0;
366}
367