1// Ceres Solver - A fast non-linear least squares minimizer
2// Copyright 2013 Google Inc. All rights reserved.
3// http://code.google.com/p/ceres-solver/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are met:
7//
8// * Redistributions of source code must retain the above copyright notice,
9//   this list of conditions and the following disclaimer.
10// * Redistributions in binary form must reproduce the above copyright notice,
11//   this list of conditions and the following disclaimer in the documentation
12//   and/or other materials provided with the distribution.
13// * Neither the name of Google Inc. nor the names of its contributors may be
14//   used to endorse or promote products derived from this software without
15//   specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27// POSSIBILITY OF SUCH DAMAGE.
28//
29// Author: sameeragarwal@google.com (Sameer Agarwal)
30//
31// Simple blas functions for use in the Schur Eliminator. These are
32// fairly basic implementations which already yield a significant
33// speedup in the eliminator performance.
34
35#ifndef CERES_INTERNAL_SMALL_BLAS_H_
36#define CERES_INTERNAL_SMALL_BLAS_H_
37
38#include "ceres/internal/eigen.h"
39#include "glog/logging.h"
40
41namespace ceres {
42namespace internal {
43
44// Remove the ".noalias()" annotation from the matrix matrix
45// mutliplies to produce a correct build with the Android NDK,
46// including versions 6, 7, 8, and 8b, when built with STLPort and the
47// non-standalone toolchain (i.e. ndk-build). This appears to be a
48// compiler bug; if the workaround is not in place, the line
49//
50//   block.noalias() -= A * B;
51//
52// gets compiled to
53//
54//   block.noalias() += A * B;
55//
56// which breaks schur elimination. Introducing a temporary by removing the
57// .noalias() annotation causes the issue to disappear. Tracking this
58// issue down was tricky, since the test suite doesn't run when built with
59// the non-standalone toolchain.
60//
61// TODO(keir): Make a reproduction case for this and send it upstream.
62#ifdef CERES_WORK_AROUND_ANDROID_NDK_COMPILER_BUG
63#define CERES_MAYBE_NOALIAS
64#else
65#define CERES_MAYBE_NOALIAS .noalias()
66#endif
67
68// The following three macros are used to share code and reduce
69// template junk across the various GEMM variants.
70#define CERES_GEMM_BEGIN(name)                                          \
71  template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>  \
72  inline void name(const double* A,                                     \
73                   const int num_row_a,                                 \
74                   const int num_col_a,                                 \
75                   const double* B,                                     \
76                   const int num_row_b,                                 \
77                   const int num_col_b,                                 \
78                   double* C,                                           \
79                   const int start_row_c,                               \
80                   const int start_col_c,                               \
81                   const int row_stride_c,                              \
82                   const int col_stride_c)
83
84#define CERES_GEMM_NAIVE_HEADER                                         \
85  DCHECK_GT(num_row_a, 0);                                              \
86  DCHECK_GT(num_col_a, 0);                                              \
87  DCHECK_GT(num_row_b, 0);                                              \
88  DCHECK_GT(num_col_b, 0);                                              \
89  DCHECK_GE(start_row_c, 0);                                            \
90  DCHECK_GE(start_col_c, 0);                                            \
91  DCHECK_GT(row_stride_c, 0);                                           \
92  DCHECK_GT(col_stride_c, 0);                                           \
93  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));            \
94  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));            \
95  DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));            \
96  DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));            \
97  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);  \
98  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);  \
99  const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b);  \
100  const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
101
102#define CERES_GEMM_EIGEN_HEADER                                         \
103  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef               \
104  Aref(A, num_row_a, num_col_a);                                        \
105  const typename EigenTypes<kRowB, kColB>::ConstMatrixRef               \
106  Bref(B, num_row_b, num_col_b);                                        \
107  MatrixRef Cref(C, row_stride_c, col_stride_c);                        \
108
109#define CERES_CALL_GEMM(name)                                           \
110  name<kRowA, kColA, kRowB, kColB, kOperation>(                         \
111      A, num_row_a, num_col_a,                                          \
112      B, num_row_b, num_col_b,                                          \
113      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
114
115
116// For the matrix-matrix functions below, there are three variants for
117// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
118// be called by the user. FooNaive is a basic loop based
119// implementation and FooEigen uses Eigen's implementation. Foo
120// chooses between FooNaive and FooEigen depending on how many of the
121// template arguments are fixed at compile time. Currently, FooEigen
122// is called if all matrix dimensions are compile time
123// constants. FooNaive is called otherwise. This leads to the best
124// performance currently.
125//
126// The MatrixMatrixMultiply variants compute:
127//
128//   C op A * B;
129//
130// The MatrixTransposeMatrixMultiply variants compute:
131//
132//   C op A' * B
133//
134// where op can be +=, -=, or =.
135//
136// The template parameters (kRowA, kColA, kRowB, kColB) allow
137// specialization of the loop at compile time. If this information is
138// not available, then Eigen::Dynamic should be used as the template
139// argument.
140//
141//   kOperation =  1  -> C += A * B
142//   kOperation = -1  -> C -= A * B
143//   kOperation =  0  -> C  = A * B
144//
145// The functions can write into matrices C which are larger than the
146// matrix A * B. This is done by specifying the true size of C via
147// row_stride_c and col_stride_c, and then indicating where A * B
148// should be written into by start_row_c and start_col_c.
149//
150// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
151// 4 and start_col_c = 5, then if A = 3x2 and B = 2x4, we get
152//
153//   ------------
154//   ------------
155//   ------------
156//   ------------
157//   -----xxxx---
158//   -----xxxx---
159//   -----xxxx---
160//   ------------
161//   ------------
162//   ------------
163//
164CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
165  CERES_GEMM_EIGEN_HEADER
166  Eigen::Block<MatrixRef, kRowA, kColB>
167    block(Cref, start_row_c, start_col_c, num_row_a, num_col_b);
168
169  if (kOperation > 0) {
170    block CERES_MAYBE_NOALIAS += Aref * Bref;
171  } else if (kOperation < 0) {
172    block CERES_MAYBE_NOALIAS -= Aref * Bref;
173  } else {
174    block CERES_MAYBE_NOALIAS = Aref * Bref;
175  }
176}
177
178CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
179  CERES_GEMM_NAIVE_HEADER
180  DCHECK_EQ(NUM_COL_A, NUM_ROW_B);
181
182  const int NUM_ROW_C = NUM_ROW_A;
183  const int NUM_COL_C = NUM_COL_B;
184  DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
185  DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
186
187  for (int row = 0; row < NUM_ROW_C; ++row) {
188    for (int col = 0; col < NUM_COL_C; ++col) {
189      double tmp = 0.0;
190      for (int k = 0; k < NUM_COL_A; ++k) {
191        tmp += A[row * NUM_COL_A + k] * B[k * NUM_COL_B + col];
192      }
193
194      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
195      if (kOperation > 0) {
196        C[index] += tmp;
197      } else if (kOperation < 0) {
198        C[index] -= tmp;
199      } else {
200        C[index] = tmp;
201      }
202    }
203  }
204}
205
206CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
207#ifdef CERES_NO_CUSTOM_BLAS
208
209  CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
210  return;
211
212#else
213
214  if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
215      kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
216    CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
217  } else {
218    CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
219  }
220
221#endif
222}
223
224CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
225  CERES_GEMM_EIGEN_HEADER
226  Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
227                                              start_row_c, start_col_c,
228                                              num_col_a, num_col_b);
229  if (kOperation > 0) {
230    block CERES_MAYBE_NOALIAS += Aref.transpose() * Bref;
231  } else if (kOperation < 0) {
232    block CERES_MAYBE_NOALIAS -= Aref.transpose() * Bref;
233  } else {
234    block CERES_MAYBE_NOALIAS = Aref.transpose() * Bref;
235  }
236}
237
238CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
239  CERES_GEMM_NAIVE_HEADER
240  DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);
241
242  const int NUM_ROW_C = NUM_COL_A;
243  const int NUM_COL_C = NUM_COL_B;
244  DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c);
245  DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c);
246
247  for (int row = 0; row < NUM_ROW_C; ++row) {
248    for (int col = 0; col < NUM_COL_C; ++col) {
249      double tmp = 0.0;
250      for (int k = 0; k < NUM_ROW_A; ++k) {
251        tmp += A[k * NUM_COL_A + row] * B[k * NUM_COL_B + col];
252      }
253
254      const int index = (row + start_row_c) * col_stride_c + start_col_c + col;
255      if (kOperation > 0) {
256        C[index]+= tmp;
257      } else if (kOperation < 0) {
258        C[index]-= tmp;
259      } else {
260        C[index]= tmp;
261      }
262    }
263  }
264}
265
266CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
267#ifdef CERES_NO_CUSTOM_BLAS
268
269  CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
270  return;
271
272#else
273
274  if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
275      kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
276    CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
277  } else {
278    CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
279  }
280
281#endif
282}
283
284// Matrix-Vector multiplication
285//
286// c op A * b;
287//
288// where op can be +=, -=, or =.
289//
290// The template parameters (kRowA, kColA) allow specialization of the
291// loop at compile time. If this information is not available, then
292// Eigen::Dynamic should be used as the template argument.
293//
294// kOperation =  1  -> c += A' * b
295// kOperation = -1  -> c -= A' * b
296// kOperation =  0  -> c  = A' * b
297template<int kRowA, int kColA, int kOperation>
298inline void MatrixVectorMultiply(const double* A,
299                                 const int num_row_a,
300                                 const int num_col_a,
301                                 const double* b,
302                                 double* c) {
303#ifdef CERES_NO_CUSTOM_BLAS
304  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef
305      Aref(A, num_row_a, num_col_a);
306  const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
307  typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
308
309  // lazyProduct works better than .noalias() for matrix-vector
310  // products.
311  if (kOperation > 0) {
312    cref += Aref.lazyProduct(bref);
313  } else if (kOperation < 0) {
314    cref -= Aref.lazyProduct(bref);
315  } else {
316    cref = Aref.lazyProduct(bref);
317  }
318#else
319
320  DCHECK_GT(num_row_a, 0);
321  DCHECK_GT(num_col_a, 0);
322  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
323  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
324
325  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
326  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
327
328  for (int row = 0; row < NUM_ROW_A; ++row) {
329    double tmp = 0.0;
330    for (int col = 0; col < NUM_COL_A; ++col) {
331      tmp += A[row * NUM_COL_A + col] * b[col];
332    }
333
334    if (kOperation > 0) {
335      c[row] += tmp;
336    } else if (kOperation < 0) {
337      c[row] -= tmp;
338    } else {
339      c[row] = tmp;
340    }
341  }
342#endif  // CERES_NO_CUSTOM_BLAS
343}
344
345// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
346//
347// c op A' * b;
348template<int kRowA, int kColA, int kOperation>
349inline void MatrixTransposeVectorMultiply(const double* A,
350                                          const int num_row_a,
351                                          const int num_col_a,
352                                          const double* b,
353                                          double* c) {
354#ifdef CERES_NO_CUSTOM_BLAS
355  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef
356      Aref(A, num_row_a, num_col_a);
357  const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
358  typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
359
360  // lazyProduct works better than .noalias() for matrix-vector
361  // products.
362  if (kOperation > 0) {
363    cref += Aref.transpose().lazyProduct(bref);
364  } else if (kOperation < 0) {
365    cref -= Aref.transpose().lazyProduct(bref);
366  } else {
367    cref = Aref.transpose().lazyProduct(bref);
368  }
369#else
370
371  DCHECK_GT(num_row_a, 0);
372  DCHECK_GT(num_col_a, 0);
373  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
374  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
375
376  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
377  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
378
379  for (int row = 0; row < NUM_COL_A; ++row) {
380    double tmp = 0.0;
381    for (int col = 0; col < NUM_ROW_A; ++col) {
382      tmp += A[col * NUM_COL_A + row] * b[col];
383    }
384
385    if (kOperation > 0) {
386      c[row] += tmp;
387    } else if (kOperation < 0) {
388      c[row] -= tmp;
389    } else {
390      c[row] = tmp;
391    }
392  }
393#endif  // CERES_NO_CUSTOM_BLAS
394}
395
396
397#undef CERES_MAYBE_NOALIAS
398#undef CERES_GEMM_BEGIN
399#undef CERES_GEMM_EIGEN_HEADER
400#undef CERES_GEMM_NAIVE_HEADER
401#undef CERES_CALL_GEMM
402
403}  // namespace internal
404}  // namespace ceres
405
406#endif  // CERES_INTERNAL_SMALL_BLAS_H_
407