TriangularMatrixMatrix.h revision c981c48f5bc9aefeffc0bcb0cc3934c2fae179dd
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This file is part of Eigen, a lightweight C++ template library
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for linear algebra.
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch//
6bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch// This Source Code Form is subject to the terms of the Mozilla
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Public License v. 2.0. If a copy of the MPL was not distributed
87d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace Eigen {
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace internal {
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// struct gemm_pack_lhs_triangular
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   Matrix<Scalar,mr,mr,
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   {
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     int count = 0;
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     const int peeled_mc = (rows/mr)*mr;
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     for(int i=0; i<peeled_mc; i+=mr)
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     {
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//       for(int k=0; k<depth; k++)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//         for(int w=0; w<mr; w++)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//           blockA[count++] = cj(lhs(i+w, k));
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     }
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     for(int i=peeled_mc; i<rows; i++)
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     {
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//       for(int k=0; k<depth; k++)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//         blockA[count++] = cj(lhs(i, k));
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     }
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   }
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// };
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * the general matrix matrix product.
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)template <typename Scalar, typename Index,
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int Mode, bool LhsIsTriangular,
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int LhsStorageOrder, bool ConjugateLhs,
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int RhsStorageOrder, bool ConjugateRhs,
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int ResStorageOrder, int Version = Specialized>
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct product_triangular_matrix_matrix;
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)template <typename Scalar, typename Index,
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int Mode, bool LhsIsTriangular,
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int LhsStorageOrder, bool ConjugateLhs,
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          int RhsStorageOrder, bool ConjugateRhs, int Version>
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                           LhsStorageOrder,ConjugateLhs,
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                           RhsStorageOrder,ConjugateRhs,RowMajor,Version>
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles){
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static EIGEN_STRONG_INLINE void run(
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Index rows, Index cols, Index depth,
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const Scalar* lhs, Index lhsStride,
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const Scalar* rhs, Index rhsStride,
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Scalar* res,       Index resStride,
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Scalar alpha, level3_blocking<Scalar,Scalar>& blocking)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    product_triangular_matrix_matrix<Scalar, Index,
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (!LhsIsTriangular),
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ConjugateRhs,
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ConjugateLhs,
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ColMajor>
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
75bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch  }
76};
77
78// implements col-major += alpha * op(triangular) * op(general)
79template <typename Scalar, typename Index, int Mode,
80          int LhsStorageOrder, bool ConjugateLhs,
81          int RhsStorageOrder, bool ConjugateRhs, int Version>
82struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
83                                           LhsStorageOrder,ConjugateLhs,
84                                           RhsStorageOrder,ConjugateRhs,ColMajor,Version>
85{
86
87  typedef gebp_traits<Scalar,Scalar> Traits;
88  enum {
89    SmallPanelWidth   = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
90    IsLower = (Mode&Lower) == Lower,
91    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
92  };
93
94  static EIGEN_DONT_INLINE void run(
95    Index _rows, Index _cols, Index _depth,
96    const Scalar* _lhs, Index lhsStride,
97    const Scalar* _rhs, Index rhsStride,
98    Scalar* res,        Index resStride,
99    Scalar alpha, level3_blocking<Scalar,Scalar>& blocking)
100  {
101    // strip zeros
102    Index diagSize  = (std::min)(_rows,_depth);
103    Index rows      = IsLower ? _rows : diagSize;
104    Index depth     = IsLower ? diagSize : _depth;
105    Index cols      = _cols;
106
107    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
108    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
109
110    Index kc = blocking.kc();                   // cache block size along the K direction
111    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
112
113    std::size_t sizeA = kc*mc;
114    std::size_t sizeB = kc*cols;
115    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
116
117    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
118    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
119    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
120
121    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
122    triangularBuffer.setZero();
123    if((Mode&ZeroDiag)==ZeroDiag)
124      triangularBuffer.diagonal().setZero();
125    else
126      triangularBuffer.diagonal().setOnes();
127
128    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
129    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
130    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
131
132    for(Index k2=IsLower ? depth : 0;
133        IsLower ? k2>0 : k2<depth;
134        IsLower ? k2-=kc : k2+=kc)
135    {
136      Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
137      Index actual_k2 = IsLower ? k2-actual_kc : k2;
138
139      // align blocks with the end of the triangular part for trapezoidal lhs
140      if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
141      {
142        actual_kc = rows-k2;
143        k2 = k2+actual_kc-kc;
144      }
145
146      pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
147
148      // the selected lhs's panel has to be split in three different parts:
149      //  1 - the part which is zero => skip it
150      //  2 - the diagonal block => special kernel
151      //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
152
153      // the block diagonal, if any:
154      if(IsLower || actual_k2<rows)
155      {
156        // for each small vertical panels of lhs
157        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
158        {
159          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
160          Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
161          Index startBlock   = actual_k2+k1;
162          Index blockBOffset = k1;
163
164          // => GEBP with the micro triangular block
165          // The trick is to pack this micro block while filling the opposite triangular part with zeros.
166          // To this end we do an extra triangular copy to a small temporary buffer
167          for (Index k=0;k<actualPanelWidth;++k)
168          {
169            if (SetDiag)
170              triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
171            for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
172              triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
173          }
174          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
175
176          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
177                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
178
179          // GEBP with remaining micro panel
180          if (lengthTarget>0)
181          {
182            Index startTarget  = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
183
184            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
185
186            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
187                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
188          }
189        }
190      }
191      // the part below (lower case) or above (upper case) the diagonal => GEPP
192      {
193        Index start = IsLower ? k2 : 0;
194        Index end   = IsLower ? rows : (std::min)(actual_k2,rows);
195        for(Index i2=start; i2<end; i2+=mc)
196        {
197          const Index actual_mc = (std::min)(i2+mc,end)-i2;
198          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
199            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
200
201          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
202        }
203      }
204    }
205  }
206};
207
208// implements col-major += alpha * op(general) * op(triangular)
209template <typename Scalar, typename Index, int Mode,
210          int LhsStorageOrder, bool ConjugateLhs,
211          int RhsStorageOrder, bool ConjugateRhs, int Version>
212struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
213                                           LhsStorageOrder,ConjugateLhs,
214                                           RhsStorageOrder,ConjugateRhs,ColMajor,Version>
215{
216  typedef gebp_traits<Scalar,Scalar> Traits;
217  enum {
218    SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
219    IsLower = (Mode&Lower) == Lower,
220    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
221  };
222
223  static EIGEN_DONT_INLINE void run(
224    Index _rows, Index _cols, Index _depth,
225    const Scalar* _lhs, Index lhsStride,
226    const Scalar* _rhs, Index rhsStride,
227    Scalar* res,        Index resStride,
228    Scalar alpha, level3_blocking<Scalar,Scalar>& blocking)
229  {
230    // strip zeros
231    Index diagSize  = (std::min)(_cols,_depth);
232    Index rows      = _rows;
233    Index depth     = IsLower ? _depth : diagSize;
234    Index cols      = IsLower ? diagSize : _cols;
235
236    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
237    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
238
239    Index kc = blocking.kc();                   // cache block size along the K direction
240    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
241
242    std::size_t sizeA = kc*mc;
243    std::size_t sizeB = kc*cols;
244    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
245
246    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
247    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
248    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
249
250    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
251    triangularBuffer.setZero();
252    if((Mode&ZeroDiag)==ZeroDiag)
253      triangularBuffer.diagonal().setZero();
254    else
255      triangularBuffer.diagonal().setOnes();
256
257    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
258    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
259    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
260    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
261
262    for(Index k2=IsLower ? 0 : depth;
263        IsLower ? k2<depth  : k2>0;
264        IsLower ? k2+=kc   : k2-=kc)
265    {
266      Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
267      Index actual_k2 = IsLower ? k2 : k2-actual_kc;
268
269      // align blocks with the end of the triangular part for trapezoidal rhs
270      if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
271      {
272        actual_kc = cols-k2;
273        k2 = actual_k2 + actual_kc - kc;
274      }
275
276      // remaining size
277      Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
278      // size of the triangular part
279      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
280
281      Scalar* geb = blockB+ts*ts;
282
283      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
284
285      // pack the triangular part of the rhs padding the unrolled blocks with zeros
286      if(ts>0)
287      {
288        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
289        {
290          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
291          Index actual_j2 = actual_k2 + j2;
292          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
293          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
294          // general part
295          pack_rhs_panel(blockB+j2*actual_kc,
296                         &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
297                         panelLength, actualPanelWidth,
298                         actual_kc, panelOffset);
299
300          // append the triangular part via a temporary buffer
301          for (Index j=0;j<actualPanelWidth;++j)
302          {
303            if (SetDiag)
304              triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
305            for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
306              triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
307          }
308
309          pack_rhs_panel(blockB+j2*actual_kc,
310                         triangularBuffer.data(), triangularBuffer.outerStride(),
311                         actualPanelWidth, actualPanelWidth,
312                         actual_kc, j2);
313        }
314      }
315
316      for (Index i2=0; i2<rows; i2+=mc)
317      {
318        const Index actual_mc = (std::min)(mc,rows-i2);
319        pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
320
321        // triangular kernel
322        if(ts>0)
323        {
324          for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
325          {
326            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
327            Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
328            Index blockOffset = IsLower ? j2 : 0;
329
330            gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
331                        blockA, blockB+j2*actual_kc,
332                        actual_mc, panelLength, actualPanelWidth,
333                        alpha,
334                        actual_kc, actual_kc,  // strides
335                        blockOffset, blockOffset,// offsets
336                        blockW); // workspace
337          }
338        }
339        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
340                    blockA, geb, actual_mc, actual_kc, rs,
341                    alpha,
342                    -1, -1, 0, 0, blockW);
343      }
344    }
345  }
346};
347
348/***************************************************************************
349* Wrapper to product_triangular_matrix_matrix
350***************************************************************************/
351
352template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
353struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
354  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
355{};
356
357} // end namespace internal
358
359template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
360struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
361  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
362{
363  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
364
365  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
366
367  template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
368  {
369    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
370    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
371
372    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
373                               * RhsBlasTraits::extractScalarFactor(m_rhs);
374
375    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
376              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
377
378    enum { IsLower = (Mode&Lower) == Lower };
379    Index stripedRows  = ((!LhsIsTriangular) || (IsLower))  ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
380    Index stripedCols  = ((LhsIsTriangular)  || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
381    Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
382                                         : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
383
384    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
385
386    internal::product_triangular_matrix_matrix<Scalar, Index,
387      Mode, LhsIsTriangular,
388      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
389      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
390      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
391      ::run(
392        stripedRows, stripedCols, stripedDepth,   // sizes
393        &lhs.coeffRef(0,0),    lhs.outerStride(), // lhs info
394        &rhs.coeffRef(0,0),    rhs.outerStride(), // rhs info
395        &dst.coeffRef(0,0), dst.outerStride(),    // result info
396        actualAlpha, blocking
397      );
398  }
399};
400
401} // end namespace Eigen
402
403#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
404