17faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// This file is part of Eigen, a lightweight C++ template library 27faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// for linear algebra. 37faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// 47faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr> 57faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// 67faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// This Source Code Form is subject to the terms of the Mozilla 77faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// Public License v. 2.0. If a copy of the MPL was not distributed 87faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 97faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H 117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define EIGEN_SPARSELU_GEMM_KERNEL_H 127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeznamespace Eigen { 147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeznamespace internal { 167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez/** \internal 197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez * A general matrix-matrix product kernel optimized for the SparseLU factorization. 207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez * - A, B, and C must be column major 217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez * - lda and ldc must be multiples of the respective packet size 227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez * - C must have the same alignment as A 237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez */ 247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<typename Scalar,typename Index> 257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos HernandezEIGEN_DONT_INLINE 267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandezvoid sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc) 277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez{ 287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez using namespace Eigen::internal; 297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez typedef typename packet_traits<Scalar>::type Packet; 317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez enum { 327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, 337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez PacketSize = packet_traits<Scalar>::size, 347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez PM = 8, // peeling in M 357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez RN = 2, // register blocking 367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking 377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk 387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez SM = PM*PacketSize // step along M 397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez }; 407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking 417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once 427faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index i0 = internal::first_aligned(A,m); 437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 447faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m))); 457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // handle the non aligned rows of A and C without any optimization: 477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=0; i<i0; ++i) 487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index j=0; j<n; ++j) 507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Scalar c = C[i+j*ldc]; 527faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index k=0; k<d; ++k) 537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez c += B[k+j*ldb] * A[i+k*lda]; 547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C[i+j*ldc] = c; 557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 577faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // process the remaining rows per chunk of BM rows 587faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index ib=i0; ib<m; ib+=BM) 597faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 607faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index actual_b = std::min<Index>(BM, m-ib); // actual number of rows 617faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling 627faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization 637faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 647faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // Let's process two columns of B-C at once 657faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index j=0; j<n_end; j+=RN) 667faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 677faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* Bc0 = B+(j+0)*ldb; 687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* Bc1 = B+(j+1)*ldb; 697faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index k=0; k<d_end; k+=RK) 717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // load and expand a RN x RK block of B 747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet b00, b10, b20, b30, b01, b11, b21, b31; 757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez b00 = pset1<Packet>(Bc0[0]); 767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez b10 = pset1<Packet>(Bc0[1]); 777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) b20 = pset1<Packet>(Bc0[2]); 787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) b30 = pset1<Packet>(Bc0[3]); 797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez b01 = pset1<Packet>(Bc1[0]); 807faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez b11 = pset1<Packet>(Bc1[1]); 817faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) b21 = pset1<Packet>(Bc1[2]); 827faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) b31 = pset1<Packet>(Bc1[3]); 837faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 847faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet a0, a1, a2, a3, c0, c1, t0, t1; 857faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 867faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A0 = A+ib+(k+0)*lda; 877faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A1 = A+ib+(k+1)*lda; 887faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A2 = A+ib+(k+2)*lda; 897faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A3 = A+ib+(k+3)*lda; 907faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 917faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Scalar* C0 = C+ib+(j+0)*ldc; 927faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Scalar* C1 = C+ib+(j+1)*ldc; 937faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a0 = pload<Packet>(A0); 957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a1 = pload<Packet>(A1); 967faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) 977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 987faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a2 = pload<Packet>(A2); 997faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a3 = pload<Packet>(A3); 1007faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez else 1027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1037faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // workaround "may be used uninitialized in this function" warning 1047faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a2 = a3 = a0; 1057faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1067faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1077faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);} 1087faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define WORK(I) \ 1097faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez c0 = pload<Packet>(C0+i+(I)*PacketSize); \ 1107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez c1 = pload<Packet>(C1+i+(I)*PacketSize); \ 1117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez KMADD(c0, a0, b00, t0) \ 1127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez KMADD(c1, a0, b01, t1) \ 1137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \ 1147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez KMADD(c0, a1, b10, t0) \ 1157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez KMADD(c1, a1, b11, t1) \ 1167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \ 1177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) KMADD(c0, a2, b20, t0) \ 1187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) KMADD(c1, a2, b21, t1) \ 1197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \ 1207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) KMADD(c0, a3, b30, t0) \ 1217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) KMADD(c1, a3, b31, t1) \ 1227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \ 1237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez pstore(C0+i+(I)*PacketSize, c0); \ 1247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez pstore(C1+i+(I)*PacketSize, c1) 1257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // process rows of A' - C' with aggressive vectorization and peeling 1277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=0; i<actual_b_end1; i+=PacketSize*8) 1287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL1"); 1307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez prefetch((A0+i+(5)*PacketSize)); 1317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez prefetch((A1+i+(5)*PacketSize)); 1327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) prefetch((A2+i+(5)*PacketSize)); 1337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) prefetch((A3+i+(5)*PacketSize)); 1347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(0); 1357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(1); 1367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(2); 1377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(3); 1387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(4); 1397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(5); 1407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(6); 1417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(7); 1427faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // process the remaining rows with vectorization only 1447faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize) 1457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(0); 1477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#undef WORK 1497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // process the remaining rows without vectorization 1507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=actual_b_end2; i<actual_b; ++i) 1517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1527faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) 1537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3]; 1557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3]; 1567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1577faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez else 1587faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1597faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]; 1607faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]; 1617faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1627faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1637faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1647faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Bc0 += RK; 1657faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Bc1 += RK; 1667faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } // peeled loop on k 1677faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } // peeled loop on the columns j 1687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // process the last column (we now perform a matrux-vector product) 1697faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if((n-n_end)>0) 1707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* Bc0 = B+(n-1)*ldb; 1727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index k=0; k<d_end; k+=RK) 1747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // load and expand a 1 x RK block of B 1777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet b00, b10, b20, b30; 1787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez b00 = pset1<Packet>(Bc0[0]); 1797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez b10 = pset1<Packet>(Bc0[1]); 1807faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) b20 = pset1<Packet>(Bc0[2]); 1817faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) b30 = pset1<Packet>(Bc0[3]); 1827faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1837faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Packet a0, a1, a2, a3, c0, t0/*, t1*/; 1847faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1857faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A0 = A+ib+(k+0)*lda; 1867faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A1 = A+ib+(k+1)*lda; 1877faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A2 = A+ib+(k+2)*lda; 1887faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez const Scalar* A3 = A+ib+(k+3)*lda; 1897faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1907faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Scalar* C0 = C+ib+(n_end)*ldc; 1917faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 1927faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a0 = pload<Packet>(A0); 1937faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a1 = pload<Packet>(A1); 1947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) 1957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 1967faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a2 = pload<Packet>(A2); 1977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a3 = pload<Packet>(A3); 1987faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 1997faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez else 2007faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 2017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // workaround "may be used uninitialized in this function" warning 2027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a2 = a3 = a0; 2037faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2047faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2057faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define WORK(I) \ 2067faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez c0 = pload<Packet>(C0+i+(I)*PacketSize); \ 2077faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez KMADD(c0, a0, b00, t0) \ 2087faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \ 2097faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez KMADD(c0, a1, b10, t0) \ 2107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \ 2117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) KMADD(c0, a2, b20, t0) \ 2127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \ 2137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) KMADD(c0, a3, b30, t0) \ 2147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \ 2157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez pstore(C0+i+(I)*PacketSize, c0); 2167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // agressive vectorization and peeling 2187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=0; i<actual_b_end1; i+=PacketSize*8) 2197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 2207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2"); 2217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(0); 2227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(1); 2237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(2); 2247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(3); 2257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(4); 2267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(5); 2277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(6); 2287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(7); 2297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // vectorization only 2317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize) 2327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 2337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez WORK(0); 2347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // remaining scalars 2367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index i=actual_b_end2; i<actual_b; ++i) 2377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 2387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(RK==4) 2397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3]; 2407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez else 2417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]; 2427faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2447faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Bc0 += RK; 2457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#undef WORK 2467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez // process the last columns of A, corresponding to the last rows of B 2507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Index rd = d-d_end; 2517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(rd>0) 2527faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 2537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez for(Index j=0; j<n; ++j) 2547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez { 2557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez enum { 2567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez Alignment = PacketSize>1 ? Aligned : 0 2577faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez }; 2587faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez typedef Map<Matrix<Scalar,Dynamic,1>, Alignment > MapVector; 2597faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez typedef Map<const Matrix<Scalar,Dynamic,1>, Alignment > ConstMapVector; 2607faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez if(rd==1) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b); 2617faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2627faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) 2637faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b); 2647faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2657faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) 2667faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b) 2677faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b); 2687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2697faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } 2707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez } // blocking on the rows of A and C 2727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez} 2737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#undef KMADD 2747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez} // namespace internal 2767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez} // namespace Eigen 2787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez 2797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#endif // EIGEN_SPARSELU_GEMM_KERNEL_H 280