1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// if the rhs is row major, let's transpose the product 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder> 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor> 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez static void run( 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index size, Index cols, 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Scalar* tri, Index triStride, 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* _other, Index otherStride, 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath level3_blocking<Scalar,Scalar>& blocking) 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath triangular_solve_matrix< 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft, 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper), 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath NumTraits<Scalar>::IsComplex && Conjugate, 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor> 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ::run(size, cols, tri, triStride, _other, otherStride, blocking); 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_DONT_INLINE void run( 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index size, Index otherSize, 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Scalar* _tri, Index triStride, 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* _other, Index otherStride, 457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez level3_blocking<Scalar,Scalar>& blocking); 467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez}; 477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> 487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos HernandezEIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run( 497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index size, Index otherSize, 507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* _tri, Index triStride, 517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Scalar* _other, Index otherStride, 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath level3_blocking<Scalar,Scalar>& blocking) 53c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index cols = otherSize; 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef gebp_traits<Scalar,Scalar> Traits; 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower = (Mode&Lower) == Lower 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index kc = blocking.kc(); // cache block size along the K direction 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index mc = (std::min)(size,blocking.mc()); // cache block size along the M direction 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeA = kc*mc; 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeB = kc*cols; 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeW = kc*Traits::WorkSpaceFactor; 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath conj_if<Conjugate> conj; 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs; 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // the goal here is to subdivise the Rhs panels such that we keep some cache 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // coherence when accessing the rhs elements 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::ptrdiff_t l1, l2; 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath manage_caching_sizes(GetAction, &l1, &l2); 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0; 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr); 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index k2=IsLower ? 0 : size; 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower ? k2<size : k2>0; 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower ? k2+=kc : k2-=kc) 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc); 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel, 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // A11 (the triangular part) and A21 the remaining rectangular part. 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Then the high level algorithm is: 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // - B = R1 => general block copy (done during the next step) 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // - R1 = A11^-1 B => tricky part 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // - update B from the new R1 => actually this has to be performed continuously during the above step 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // - R2 -= A21 * B => GEPP 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // The tricky part: compute R1 = A11^-1 B while updating B from R1 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // The idea is to split A11 into multiple small vertical panels. 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Each panel can be split into a small triangular part T1k which is processed without optimization, 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // and the remaining small part T2k which is processed using gebp with appropriate block strides 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index j2=0; j2<cols; j2+=subcols) 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index actual_cols = (std::min)(cols-j2,subcols); 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // for each small vertical panels [T1k^T, T2k^T]^T of lhs 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // tr solve 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index k=0; k<actualPanelWidth; ++k) 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // TODO write a small kernel handling this (can be shared with trsv) 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index i = IsLower ? k2+k1+k : k2-k1-k-1; 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index s = IsLower ? k2+k1 : i+1; 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index rs = actualPanelWidth - k - 1; // remaining size 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i)); 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index j=j2; j<j2+actual_cols; ++j) 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (TriStorageOrder==RowMajor) 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar b(0); 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Scalar* l = &tri(i,s); 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* r = &other(s,j); 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index i3=0; i3<k; ++i3) 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath b += conj(l[i3]) * r[i3]; 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath other(i,j) = (other(i,j) - b)*a; 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath else 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index s = IsLower ? i+1 : i-rs; 137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar b = (other(i,j) *= a); 138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* r = &other(s,j); 139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Scalar* l = &tri(s,i); 140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index i3=0;i3<rs;++i3) 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath r[i3] -= b * conj(l[i3]); 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index lengthTarget = actual_kc-k1-actualPanelWidth; 147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth; 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index blockBOffset = IsLower ? k1 : lengthTarget; 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // update the respective rows of B from other 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset); 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // GEBP 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (lengthTarget>0) 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1), 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actualPanelWidth, actual_kc, 0, blockBOffset, blockW); 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // R2 -= A21 * B => GEPP 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index start = IsLower ? k2+kc : 0; 169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index end = IsLower ? size : k2-kc; 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index i2=start; i2<end; i2+=mc) 171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_mc = (std::min)(mc,end-i2); 173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (actual_mc>0) 174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); 176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW); 178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right 185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath */ 186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> 187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> 188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_DONT_INLINE void run( 190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index size, Index otherSize, 191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Scalar* _tri, Index triStride, 192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* _other, Index otherStride, 1937faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez level3_blocking<Scalar,Scalar>& blocking); 1947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez}; 1957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> 1967faaa9f3f0df9d23790277834d426c3d992ac3baCarlos HernandezEIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run( 1977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index size, Index otherSize, 1987faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* _tri, Index triStride, 1997faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Scalar* _other, Index otherStride, 200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath level3_blocking<Scalar,Scalar>& blocking) 201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index rows = otherSize; 203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); 204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); 205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 206c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef gebp_traits<Scalar,Scalar> Traits; 207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath RhsStorageOrder = TriStorageOrder, 209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), 210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower = (Mode&Lower) == Lower 211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index kc = blocking.kc(); // cache block size along the K direction 214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction 215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeA = kc*mc; 217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeB = kc*size; 218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeW = kc*Traits::WorkSpaceFactor; 219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); 221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); 222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); 223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath conj_if<Conjugate> conj; 225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; 226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; 227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; 228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel; 229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index k2=IsLower ? size : 0; 231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower ? k2>0 : k2<size; 232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower ? k2-=kc : k2+=kc) 233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc); 235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index actual_k2 = IsLower ? k2-actual_kc : k2 ; 236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index startPanel = IsLower ? 0 : k2+actual_kc; 238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc; 239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* geb = blockB+actual_kc*actual_kc; 240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); 242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // triangular packing (we only pack the panels off the diagonal, 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // neglecting the blocks overlapping the diagonal 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index actual_j2 = actual_k2 + j2; 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index panelOffset = IsLower ? j2+actualPanelWidth : 0; 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (panelLength>0) 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_rhs_panel(blockB+j2*actual_kc, 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath &rhs(actual_k2+panelOffset, actual_j2), triStride, 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath panelLength, actualPanelWidth, 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actual_kc, panelOffset); 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index i2=0; i2<rows; i2+=mc) 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_mc = (std::min)(mc,rows-i2); 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // triangular solver kernel 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // for each small block of the diagonal (=> vertical panels of rhs) 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index j2 = IsLower 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth) 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : Index(SmallPanelWidth))) 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : 0; 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower ? j2>=0 : j2<actual_kc; 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth) 274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index absolute_j2 = actual_k2 + j2; 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index panelOffset = IsLower ? j2+actualPanelWidth : 0; 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2; 279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // GEBP 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(panelLength>0) 282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel(&lhs(i2,absolute_j2), otherStride, 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath blockA, blockB+j2*actual_kc, 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actual_mc, panelLength, actualPanelWidth, 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar(-1), 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actual_kc, actual_kc, // strides 288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath panelOffset, panelOffset, // offsets 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath blockW); // workspace 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // unblocked triangular solve 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index k=0; k<actualPanelWidth; ++k) 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k; 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* r = &lhs(i2,j); 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index k3=0; k3<k; ++k3) 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j)); 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3); 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index i=0; i<actual_mc; ++i) 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath r[i] -= a[i] * b; 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j)); 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for (Index i=0; i<actual_mc; ++i) 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath r[i] *= b; 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // pack the just computed part of lhs to A 311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride, 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actualPanelWidth, actual_mc, 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actual_kc, j2); 314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if (rs>0) 318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, 319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath actual_mc, actual_kc, rs, Scalar(-1), 320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath -1, -1, 0, 0, blockW); 321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H 330