17faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// This file is part of Eigen, a lightweight C++ template library
27faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// for linear algebra.
37faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez//
47faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
57faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez//
67faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// This Source Code Form is subject to the terms of the Mozilla
77faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// Public License v. 2.0. If a copy of the MPL was not distributed
87faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
97faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H
117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define EIGEN_SPARSELU_GEMM_KERNEL_H
127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeznamespace Eigen {
147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeznamespace internal {
167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez/** \internal
197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  * A general matrix-matrix product kernel optimized for the SparseLU factorization.
207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  *  - A, B, and C must be column major
217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  *  - lda and ldc must be multiples of the respective packet size
227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  *  - C must have the same alignment as A
237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  */
247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandeztemplate<typename Scalar,typename Index>
257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos HernandezEIGEN_DONT_INLINE
267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandezvoid sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez{
287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  using namespace Eigen::internal;
297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  typedef typename packet_traits<Scalar>::type Packet;
317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  enum {
327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    PacketSize = packet_traits<Scalar>::size,
347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    PM = 8,                             // peeling in M
357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    RN = 2,                             // register blocking
367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking
377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    BM = 4096/sizeof(Scalar),           // number of rows of A-C per chunk
387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    SM = PM*PacketSize                  // step along M
397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  };
407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
427faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  Index i0 = internal::first_aligned(A,m);
437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
447faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  // handle the non aligned rows of A and C without any optimization:
477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  for(Index i=0; i<i0; ++i)
487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  {
497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    for(Index j=0; j<n; ++j)
507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    {
517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      Scalar c = C[i+j*ldc];
527faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      for(Index k=0; k<d; ++k)
537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        c += B[k+j*ldb] * A[i+k*lda];
547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      C[i+j*ldc] = c;
557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    }
567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  }
577faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  // process the remaining rows per chunk of BM rows
587faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  for(Index ib=i0; ib<m; ib+=BM)
597faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  {
607faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    Index actual_b = std::min<Index>(BM, m-ib);                 // actual number of rows
617faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    Index actual_b_end1 = (actual_b/SM)*SM;                   // actual number of rows suitable for peeling
627faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    Index actual_b_end2 = (actual_b/PacketSize)*PacketSize;   // actual number of rows suitable for vectorization
637faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
647faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    // Let's process two columns of B-C at once
657faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    for(Index j=0; j<n_end; j+=RN)
667faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    {
677faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      const Scalar* Bc0 = B+(j+0)*ldb;
687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      const Scalar* Bc1 = B+(j+1)*ldb;
697faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      for(Index k=0; k<d_end; k+=RK)
717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      {
727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // load and expand a RN x RK block of B
747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Packet b00, b10, b20, b30, b01, b11, b21, b31;
757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  b00 = pset1<Packet>(Bc0[0]);
767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  b10 = pset1<Packet>(Bc0[1]);
777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  b01 = pset1<Packet>(Bc1[0]);
807faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  b11 = pset1<Packet>(Bc1[1]);
817faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) b21 = pset1<Packet>(Bc1[2]);
827faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) b31 = pset1<Packet>(Bc1[3]);
837faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
847faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Packet a0, a1, a2, a3, c0, c1, t0, t1;
857faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
867faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A0 = A+ib+(k+0)*lda;
877faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A1 = A+ib+(k+1)*lda;
887faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A2 = A+ib+(k+2)*lda;
897faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A3 = A+ib+(k+3)*lda;
907faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
917faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Scalar* C0 = C+ib+(j+0)*ldc;
927faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Scalar* C1 = C+ib+(j+1)*ldc;
937faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  a0 = pload<Packet>(A0);
957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  a1 = pload<Packet>(A1);
967faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4)
977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
987faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          a2 = pload<Packet>(A2);
997faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          a3 = pload<Packet>(A3);
1007faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
1017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        else
1027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
1037faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          // workaround "may be used uninitialized in this function" warning
1047faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          a2 = a3 = a0;
1057faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
1067faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1077faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
1087faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define WORK(I)  \
1097faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
1107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    c1 = pload<Packet>(C1+i+(I)*PacketSize);   \
1117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    KMADD(c0, a0, b00, t0)      \
1127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    KMADD(c1, a0, b01, t1)      \
1137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
1147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    KMADD(c0, a1, b10, t0)      \
1157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    KMADD(c1, a1, b11, t1)       \
1167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
1177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) KMADD(c0, a2, b20, t0)       \
1187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) KMADD(c1, a2, b21, t1)       \
1197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
1207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) KMADD(c0, a3, b30, t0)       \
1217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) KMADD(c1, a3, b31, t1)       \
1227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
1237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    pstore(C0+i+(I)*PacketSize, c0);           \
1247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    pstore(C1+i+(I)*PacketSize, c1)
1257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // process rows of A' - C' with aggressive vectorization and peeling
1277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
1287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
1297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL1");
1307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    prefetch((A0+i+(5)*PacketSize));
1317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    prefetch((A1+i+(5)*PacketSize));
1327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) prefetch((A2+i+(5)*PacketSize));
1337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4) prefetch((A3+i+(5)*PacketSize));
1347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(0);
1357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(1);
1367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(2);
1377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(3);
1387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(4);
1397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(5);
1407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(6);
1417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                    WORK(7);
1427faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
1437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // process the remaining rows with vectorization only
1447faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
1457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
1467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(0);
1477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
1487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#undef WORK
1497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // process the remaining rows without vectorization
1507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        for(Index i=actual_b_end2; i<actual_b; ++i)
1517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
1527faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4)
1537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          {
1547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
1557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
1567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          }
1577faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          else
1587faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          {
1597faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
1607faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
1617faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          }
1627faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
1637faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1647faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Bc0 += RK;
1657faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Bc1 += RK;
1667faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      } // peeled loop on k
1677faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    } // peeled loop on the columns j
1687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    // process the last column (we now perform a matrux-vector product)
1697faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    if((n-n_end)>0)
1707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    {
1717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      const Scalar* Bc0 = B+(n-1)*ldb;
1727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      for(Index k=0; k<d_end; k+=RK)
1747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      {
1757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // load and expand a 1 x RK block of B
1777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Packet b00, b10, b20, b30;
1787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  b00 = pset1<Packet>(Bc0[0]);
1797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  b10 = pset1<Packet>(Bc0[1]);
1807faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
1817faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
1827faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1837faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Packet a0, a1, a2, a3, c0, t0/*, t1*/;
1847faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1857faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A0 = A+ib+(k+0)*lda;
1867faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A1 = A+ib+(k+1)*lda;
1877faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A2 = A+ib+(k+2)*lda;
1887faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        const Scalar* A3 = A+ib+(k+3)*lda;
1897faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1907faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Scalar* C0 = C+ib+(n_end)*ldc;
1917faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
1927faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  a0 = pload<Packet>(A0);
1937faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  a1 = pload<Packet>(A1);
1947faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4)
1957faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
1967faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          a2 = pload<Packet>(A2);
1977faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          a3 = pload<Packet>(A3);
1987faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
1997faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        else
2007faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
2017faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          // workaround "may be used uninitialized in this function" warning
2027faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          a2 = a3 = a0;
2037faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
2047faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2057faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#define WORK(I) \
2067faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
2077faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  KMADD(c0, a0, b00, t0)       \
2087faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
2097faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  KMADD(c0, a1, b10, t0)       \
2107faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
2117faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) KMADD(c0, a2, b20, t0)       \
2127faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
2137faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) KMADD(c0, a3, b30, t0)       \
2147faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
2157faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                  pstore(C0+i+(I)*PacketSize, c0);
2167faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2177faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // agressive vectorization and peeling
2187faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
2197faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
2207faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
2217faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(0);
2227faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(1);
2237faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(2);
2247faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(3);
2257faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(4);
2267faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(5);
2277faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(6);
2287faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(7);
2297faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
2307faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // vectorization only
2317faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
2327faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
2337faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          WORK(0);
2347faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
2357faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        // remaining scalars
2367faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        for(Index i=actual_b_end2; i<actual_b; ++i)
2377faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        {
2387faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          if(RK==4)
2397faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
2407faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          else
2417faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
2427faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        }
2437faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2447faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        Bc0 += RK;
2457faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#undef WORK
2467faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      }
2477faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    }
2487faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2497faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    // process the last columns of A, corresponding to the last rows of B
2507faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    Index rd = d-d_end;
2517faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    if(rd>0)
2527faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    {
2537faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      for(Index j=0; j<n; ++j)
2547faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      {
2557faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        enum {
2567faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez          Alignment = PacketSize>1 ? Aligned : 0
2577faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        };
2587faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        typedef Map<Matrix<Scalar,Dynamic,1>, Alignment > MapVector;
2597faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        typedef Map<const Matrix<Scalar,Dynamic,1>, Alignment > ConstMapVector;
2607faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        if(rd==1)       MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
2617faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2627faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        else if(rd==2)  MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
2637faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
2647faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2657faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez        else            MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
2667faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
2677faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez                                                        + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
2687faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez      }
2697faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez    }
2707faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2717faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez  } // blocking on the rows of A and C
2727faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez}
2737faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#undef KMADD
2747faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2757faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez} // namespace internal
2767faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2777faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez} // namespace Eigen
2787faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez
2797faaa9f3f0df9d23790277834d426c3d992ac3baCarlos Hernandez#endif // EIGEN_SPARSELU_GEMM_KERNEL_H
280