// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

/** \class TensorReduction
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor reduction class.
  *
  * Applies a reduction functor (e.g. a sum or a maximum) over a user-supplied
  * set of dimensions of the input expression, producing a tensor of lower rank.
  */
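
// Example (illustrative sketch only): reductions are normally built through the
// TensorBase API (sum(), maximum(), reduce(), ...), each of which returns a
// TensorReductionOp expression.
//
//   Eigen::Tensor<float, 3> input(2, 3, 4);
//   input.setRandom();
//   Eigen::array<int, 2> dims({1, 2});
//   // Reduce over dimensions 1 and 2: the result is a rank-1 tensor of size 2.
//   Eigen::Tensor<float, 1> sums = input.sum(dims);
//   // Equivalent form passing the reduction functor explicitly:
//   Eigen::Tensor<float, 1> sums2 = input.reduce(dims, Eigen::internal::SumReducer<float>());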

namespace internal {
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
    : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;

  template <class T> struct MakePointer {
    // Intermediate typedef to workaround MSVC issue.
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};


// DimInitializer splits the input dimensions into the preserved (output) dimensions
// and the reduced dimensions, according to the "reduced" bitmap.
template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

// Specialization for full reductions (rank-0 output): every input dimension is reduced.
template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};

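// are_inner_most_dims / preserve_inner_most_dims: compile-time detection of whether
// the statically-known reduced dimensions are exactly the inner-most dimensions of
// the tensor (are_inner_most_dims), or leave the inner-most dimension untouched
// (preserve_inner_most_dims). When the indices are not statically known, both
// default to false and the generic reduction path is used.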
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif

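// GenericDimReducer: recursively walks the reduced dimensions (one template
// recursion level per dimension) and feeds every addressed input coefficient
// to the reducer, accumulating into *accum.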
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};

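// InnerMostDimReducer: reduces numValuesToReduce contiguous coefficients starting
// at firstIndex. The specialization below uses packet (SIMD) loads when both the
// input evaluator and the reducer support packet access, and handles the remaining
// tail coefficients scalar by scalar.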
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};

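// InnerMostDimPreserver: packet-wise reduction used when the inner-most dimension is
// preserved: packets loaded along the preserved inner-most dimension are reduced
// against each other across the reduced dimensions.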
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};

#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                  typename Self::Index numValuesToReduce, Op& reducer,
                  typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;
  static const int PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif


// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);

#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif

template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

}  // end namespace internal


template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};


// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides.back() = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    else if (RunningOnSycl) {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
      if (!data) {
        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
        m_result = data;
      }
      Op reducer(m_reducer);
      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
      return (m_result != NULL);
    }

    // Attempt to use an optimized reduction.
    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
          TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
  /// Required by SYCL in order to extract the accessor.
  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  /// Added for SYCL in order to construct the buffer from the SYCL device.
  const Device& device() const { return m_device; }
  /// Added for SYCL in order to reconstruct the reduction evaluation on the device for the sub-kernel.
  const Dims& xprDims() const { return m_xpr_dims; }


  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
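  // (For example, with a col-major 4x3 input reduced over dimension 0, output index i
  // maps to input index i * 4, the stride of the single preserved dimension.)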
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
  static const bool RunningOnGPU = false;
#else
  static const bool RunningOnGPU = false;
  static const bool RunningOnSycl = false;
#endif
  typename MakePointer_<CoeffReturnType>::Type m_result;

  const Device& m_device;
  const Dims& m_xpr_dims;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H