GeneralMatrixMatrix.h revision 7faaa9f3f0df9d23790277834d426c3d992ac3ba
1c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This file is part of Eigen, a lightweight C++ template library 2c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// for linear algebra. 3c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 4c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> 5c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// 6c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// This Source Code Form is subject to the terms of the Mozilla 7c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// Public License v. 2.0. If a copy of the MPL was not distributed 8c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 10c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H 11c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#define EIGEN_GENERAL_MATRIX_MATRIX_H 12c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 13c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace Eigen { 14c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 15c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathnamespace internal { 16c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 17c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename _LhsScalar, typename _RhsScalar> class level3_blocking; 18c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 19c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* Specialization for a row-major destination matrix => simple transposition of the product */ 20c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate< 21c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename Index, 22c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, 23c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> 24c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> 25c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 26c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; 27c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath static EIGEN_STRONG_INLINE void run( 28c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index rows, Index cols, Index depth, 29c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const LhsScalar* lhs, Index lhsStride, 30c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const RhsScalar* rhs, Index rhsStride, 31c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ResScalar* res, Index resStride, 32c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ResScalar alpha, 33c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath level3_blocking<RhsScalar,LhsScalar>& blocking, 34c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath GemmParallelInfo<Index>* info = 0) 35c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 36c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // transpose the product such that the result is column major 37c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath general_matrix_matrix_product<Index, 38c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, 39c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, 40c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ColMajor> 41c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); 42c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 43c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 44c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 45c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/* Specialization for a col-major destination matrix 46c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * => Blocking algorithm following Goto's paper */ 47c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate< 48c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename Index, 49c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, 50c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> 51c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> 52c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 54c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtypedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; 55c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstatic void run(Index rows, Index cols, Index depth, 56c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const LhsScalar* _lhs, Index lhsStride, 57c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const RhsScalar* _rhs, Index rhsStride, 58c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ResScalar* res, Index resStride, 59c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ResScalar alpha, 60c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath level3_blocking<LhsScalar,RhsScalar>& blocking, 61c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath GemmParallelInfo<Index>* info = 0) 62c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 63c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); 64c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); 65c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 66c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef gebp_traits<LhsScalar,RhsScalar> Traits; 67c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 68c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index kc = blocking.kc(); // cache block size along the K direction 69c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction 70c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath //Index nc = blocking.nc(); // cache block size along the N direction 71c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 72c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; 73c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; 74c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; 75c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 76c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#ifdef EIGEN_HAS_OPENMP 77c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(info) 78c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 79c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // this is the parallel version! 80c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index tid = omp_get_thread_num(); 81c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index threads = omp_get_num_threads(); 82c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 83c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeA = kc*mc; 84c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeW = kc*Traits::WorkSpaceFactor; 85c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); 86c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); 87c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 88c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath RhsScalar* blockB = blocking.blockB(); 89c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath eigen_internal_assert(blockB!=0); 90c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 91c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... 92c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index k=0; k<depth; k+=kc) 93c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 94c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A' 95c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 96c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // In order to reduce the chance that a thread has to wait for the other, 97c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // let's start by packing A'. 98c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); 99c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 100c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Pack B_k to B' in a parallel fashion: 101c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // each thread packs the sub block B_k,j to B'_j where j is the thread id. 102c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 103c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // However, before copying to B'_j, we have to make sure that no other thread is still using it, 104c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // i.e., we test that info[tid].users equals 0. 105c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. 106c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath while(info[tid].users!=0) {} 107c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath info[tid].users += threads; 108c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 109c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); 110c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 111c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Notify the other threads that the part B'_j is ready to go. 112c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath info[tid].sync = k; 113c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 114c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Computes C_i += A' * B' per B'_j 115c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index shift=0; shift<threads; ++shift) 116c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 117c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index j = (tid+shift)%threads; 118c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 119c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // At this point we have to make sure that B'_j has been updated by the thread j, 120c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // we use testAndSetOrdered to mimic a volatile access. 121c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // However, no need to wait for the B' part which has been updated by the current thread! 122c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(shift>0) 123c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath while(info[j].sync!=k) {} 124c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 125c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); 126c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 127c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 128c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Then keep going as usual with the remaining A' 129c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index i=mc; i<rows; i+=mc) 130c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 131c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_mc = (std::min)(i+mc,rows)-i; 132c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 133c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // pack A_i,k to A' 134c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); 135c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 136c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // C_i += A' * B' 137c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); 138c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 139c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 140c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Release all the sub blocks B'_j of B' for the current thread, 141c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // i.e., we simply decrement the number of users by 1 142c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index j=0; j<threads; ++j) 143c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath #pragma omp atomic 144c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath --(info[j].users); 145c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 146c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 147c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath else 148c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_HAS_OPENMP 149c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 150c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_UNUSED_VARIABLE(info); 151c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 152c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // this is the sequential version! 153c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeA = kc*mc; 154c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeB = kc*cols; 155c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath std::size_t sizeW = kc*Traits::WorkSpaceFactor; 156c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 157c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); 158c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); 159c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); 160c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 161c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // For each horizontal panel of the rhs, and corresponding panel of the lhs... 162c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // (==GEMM_VAR1) 163c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index k2=0; k2<depth; k2+=kc) 164c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 165c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_kc = (std::min)(k2+kc,depth)-k2; 166c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 167c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. 168c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // => Pack rhs's panel into a sequential chunk of memory (L2 caching) 169c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Note that this panel will be read as many times as the number of blocks in the lhs's 170c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // vertical panel which is, in practice, a very low number. 171c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); 172c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 173c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // For each mc x kc block of the lhs's vertical panel... 174c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // (==GEPP_VAR1) 175c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath for(Index i2=0; i2<rows; i2+=mc) 176c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 177c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Index actual_mc = (std::min)(i2+mc,rows)-i2; 178c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 179c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // We pack the lhs's block into a sequential chunk of memory (L1 caching) 180c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Note that this block will be read a very high number of times, which is equal to the number of 181c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // micro vertical panel of the large rhs's panel (e.g., cols/4 times). 182c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); 183c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 184c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath // Everything is packed, we can now call the block * panel kernel: 185c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); 186c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 187c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 188c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 189c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} 190c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 191c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 192c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 193c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath/********************************************************************************* 194c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath* Specialization of GeneralProduct<> for "large" GEMM, i.e., 195c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath* implementation of the high level wrapper to general_matrix_matrix_product 196c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath**********************************************************************************/ 197c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 198c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename Lhs, typename Rhs> 199c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct traits<GeneralProduct<Lhs,Rhs,GemmProduct> > 200c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> > 201c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{}; 202c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 203c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType> 204c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathstruct gemm_functor 205c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 2067faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, 207c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath BlockingType& blocking) 208c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) 209c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath {} 210c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 211c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath void initParallelSession() const 212c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 213c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath m_blocking.allocateB(); 214c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 215c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 216c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const 217c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 218c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(cols==-1) 219c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath cols = m_rhs.cols(); 220c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 221c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Gemm::run(rows, cols, m_lhs.cols(), 222c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(), 223c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(), 224c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), 225c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath m_actualAlpha, m_blocking, info); 226c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 227c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 228c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath protected: 229c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Lhs& m_lhs; 230c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath const Rhs& m_rhs; 231c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Dest& m_dest; 232c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar m_actualAlpha; 233c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath BlockingType& m_blocking; 234c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 235c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 236c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1, 237c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathbool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space; 238c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 239c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename _LhsScalar, typename _RhsScalar> 240c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathclass level3_blocking 241c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 242c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef _LhsScalar LhsScalar; 243c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef _RhsScalar RhsScalar; 244c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 245c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath protected: 246c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LhsScalar* m_blockA; 247c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath RhsScalar* m_blockB; 248c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath RhsScalar* m_blockW; 249c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 250c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath DenseIndex m_mc; 251c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath DenseIndex m_nc; 252c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath DenseIndex m_kc; 253c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 254c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath public: 255c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 256c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath level3_blocking() 257c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) 258c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath {} 259c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 260c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline DenseIndex mc() const { return m_mc; } 261c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline DenseIndex nc() const { return m_nc; } 262c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline DenseIndex kc() const { return m_kc; } 263c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 264c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline LhsScalar* blockA() { return m_blockA; } 265c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline RhsScalar* blockB() { return m_blockB; } 266c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline RhsScalar* blockW() { return m_blockW; } 267c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 268c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 269c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> 270c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathclass gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true> 271c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : public level3_blocking< 272c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, 273c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> 274c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 275c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 276c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Transpose = StorageOrder==RowMajor, 277c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ActualRows = Transpose ? MaxCols : MaxRows, 278c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ActualCols = Transpose ? MaxRows : MaxCols 279c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 280c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar; 281c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; 282c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef gebp_traits<LhsScalar,RhsScalar> Traits; 283c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 284c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath SizeA = ActualRows * MaxDepth, 285c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath SizeB = ActualCols * MaxDepth, 286c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath SizeW = MaxDepth * Traits::WorkSpaceFactor 287c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 288c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 289c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; 290c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; 291c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; 292c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 293c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath public: 294c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 295c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) 296c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 297c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_mc = ActualRows; 298c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_nc = ActualCols; 299c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_kc = MaxDepth; 300c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_blockA = m_staticA; 301c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_blockB = m_staticB; 302c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_blockW = m_staticW; 303c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 304c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 305c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline void allocateA() {} 306c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline void allocateB() {} 307c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline void allocateW() {} 308c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath inline void allocateAll() {} 309c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 310c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 311c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> 312c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathclass gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false> 313c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : public level3_blocking< 314c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, 315c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> 316c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 317c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 318c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Transpose = StorageOrder==RowMajor 319c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 320c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar; 321c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; 322c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef gebp_traits<LhsScalar,RhsScalar> Traits; 323c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 324c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath DenseIndex m_sizeA; 325c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath DenseIndex m_sizeB; 326c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath DenseIndex m_sizeW; 327c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 328c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath public: 329c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 330c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth) 331c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 332c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_mc = Transpose ? cols : rows; 333c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_nc = Transpose ? rows : cols; 334c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_kc = depth; 335c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 336c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc); 337c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath m_sizeA = this->m_mc * this->m_kc; 338c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath m_sizeB = this->m_kc * this->m_nc; 339c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath m_sizeW = this->m_kc*Traits::WorkSpaceFactor; 340c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 341c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 342c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath void allocateA() 343c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 344c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(this->m_blockA==0) 345c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_blockA = aligned_new<LhsScalar>(m_sizeA); 346c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 347c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 348c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath void allocateB() 349c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 350c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(this->m_blockB==0) 351c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_blockB = aligned_new<RhsScalar>(m_sizeB); 352c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 353c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 354c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath void allocateW() 355c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 356c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath if(this->m_blockW==0) 357c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath this->m_blockW = aligned_new<RhsScalar>(m_sizeW); 358c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 359c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 360c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath void allocateAll() 361c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 362c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath allocateA(); 363c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath allocateB(); 364c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath allocateW(); 365c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 366c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 367c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath ~gemm_blocking_space() 368c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 369c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath aligned_delete(this->m_blockA, m_sizeA); 370c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath aligned_delete(this->m_blockB, m_sizeB); 371c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath aligned_delete(this->m_blockW, m_sizeW); 372c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 373c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 374c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 375c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace internal 376c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 377c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathtemplate<typename Lhs, typename Rhs> 378c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamathclass GeneralProduct<Lhs, Rhs, GemmProduct> 379c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> 380c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath{ 381c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath enum { 382c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) 383c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath }; 384c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath public: 385c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) 386c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 387c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename Lhs::Scalar LhsScalar; 388c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef typename Rhs::Scalar RhsScalar; 389c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef Scalar ResScalar; 390c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 391c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) 392c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 393c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp; 394c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); 395c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 396c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 3977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const 398c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath { 399c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); 400c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 401c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs); 402c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs); 403c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 404c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) 405c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath * RhsBlasTraits::extractScalarFactor(m_rhs); 406c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 407c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, 408c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; 409c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 410c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath typedef internal::gemm_functor< 411c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Scalar, Index, 412c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::general_matrix_matrix_product< 413c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath Index, 414c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), 415c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), 416c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, 417c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; 418c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 419c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); 420c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 421c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); 422c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath } 423c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath}; 424c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 425c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath} // end namespace Eigen 426c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath 427c981c48f5bc9aefeffc0bcb0cc3934c2fae179ddNarayan Kamath#endif // EIGEN_GENERAL_MATRIX_MATRIX_H 428