111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// -*- C++ -*- 211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Copyright (C) 2007-2014 Free Software Foundation, Inc. 411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// 511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// This file is part of the GNU ISO C++ Library. This library is free 611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// software; you can redistribute it and/or modify it under the terms 711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// of the GNU General Public License as published by the Free Software 811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Foundation; either version 3, or (at your option) any later 911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// version. 1011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 1111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// This library is distributed in the hope that it will be useful, but 1211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// WITHOUT ANY WARRANTY; without even the implied warranty of 1311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// General Public License for more details. 1511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 1611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Under Section 7 of GPL version 3, you are granted additional 1711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// permissions described in the GCC Runtime Library Exception, version 1811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// 3.1, as published by the Free Software Foundation. 1911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 2011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// You should have received a copy of the GNU General Public License and 2111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// a copy of the GCC Runtime Library Exception along with this program; 2211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 2311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// <http://www.gnu.org/licenses/>. 2411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 2511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert/** @file parallel/partial_sum.h 2611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @brief Parallel implementation of std::partial_sum(), i.e. prefix 2711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert* sums. 2811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * This file is a GNU parallel extension to the Standard C++ Library. 2911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert */ 3011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert// Written by Johannes Singler. 3211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#ifndef _GLIBCXX_PARALLEL_PARTIAL_SUM_H 3411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#define _GLIBCXX_PARALLEL_PARTIAL_SUM_H 1 3511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 3611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <omp.h> 3711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <new> 3811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <bits/stl_algobase.h> 3911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <parallel/parallel.h> 4011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#include <parallel/numericfwd.h> 4111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 4211cd02dfb91661c65134cac258cf5924270e9d2Dan Albertnamespace __gnu_parallel 4311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert{ 4411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert // Problem: there is no 0-element given. 4511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 4611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert /** @brief Base case prefix sum routine. 4711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __begin Begin iterator of input sequence. 4811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __end End iterator of input sequence. 4911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __result Begin iterator of output sequence. 5011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __bin_op Associative binary function. 5111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __value Start value. Must be passed since the neutral 5211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * element is unknown in general. 5311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @return End iterator of output sequence. */ 5411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert template<typename _IIter, 5511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename _OutputIterator, 5611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename _BinaryOperation> 5711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator 5811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __parallel_partial_sum_basecase(_IIter __begin, _IIter __end, 5911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator __result, 6011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _BinaryOperation __bin_op, 6111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename std::iterator_traits <_IIter>::value_type __value) 6211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 6311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__begin == __end) 6411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __result; 6511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 6611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert while (__begin != __end) 6711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 6811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __value = __bin_op(__value, *__begin); 6911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *__result = __value; 7011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert ++__result; 7111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert ++__begin; 7211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 7311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __result; 7411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 7511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 7611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert /** @brief Parallel partial sum implementation, two-phase approach, 7711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert no recursion. 7811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __begin Begin iterator of input sequence. 7911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __end End iterator of input sequence. 8011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __result Begin iterator of output sequence. 8111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __bin_op Associative binary function. 8211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __n Length of sequence. 8311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @return End iterator of output sequence. 8411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert */ 8511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert template<typename _IIter, 8611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename _OutputIterator, 8711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename _BinaryOperation> 8811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator 8911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __parallel_partial_sum_linear(_IIter __begin, _IIter __end, 9011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator __result, 9111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _BinaryOperation __bin_op, 9211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename std::iterator_traits<_IIter>::difference_type __n) 9311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 9411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef std::iterator_traits<_IIter> _TraitsType; 9511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef typename _TraitsType::value_type _ValueType; 9611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef typename _TraitsType::difference_type _DifferenceType; 9711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 9811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__begin == __end) 9911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __result; 10011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 10111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _ThreadIndex __num_threads = 10211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert std::min<_DifferenceType>(__get_max_threads(), __n - 1); 10311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 10411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__num_threads < 2) 10511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 10611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *__result = *__begin; 10711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __parallel_partial_sum_basecase(__begin + 1, __end, 10811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __result + 1, __bin_op, 10911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *__begin); 11011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 11111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 11211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _DifferenceType* __borders; 11311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _ValueType* __sums; 11411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 11511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert const _Settings& __s = _Settings::get(); 11611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 11711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert# pragma omp parallel num_threads(__num_threads) 11811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 11911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert# pragma omp single 12011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 12111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __num_threads = omp_get_num_threads(); 12211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 12311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __borders = new _DifferenceType[__num_threads + 2]; 12411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 12511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__s.partial_sum_dilation == 1.0f) 12611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __equally_split(__n, __num_threads + 1, __borders); 12711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert else 12811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 12911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _DifferenceType __first_part_length = 13011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert std::max<_DifferenceType>(1, 13111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __n / (1.0f + __s.partial_sum_dilation * __num_threads)); 13211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _DifferenceType __chunk_length = 13311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert (__n - __first_part_length) / __num_threads; 13411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _DifferenceType __borderstart = 13511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __n - __num_threads * __chunk_length; 13611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __borders[0] = 0; 13711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert for (_ThreadIndex __i = 1; __i < (__num_threads + 1); ++__i) 13811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 13911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __borders[__i] = __borderstart; 14011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __borderstart += __chunk_length; 14111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 14211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __borders[__num_threads + 1] = __n; 14311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 14411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 14511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __sums = static_cast<_ValueType*>(::operator new(sizeof(_ValueType) 14611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * __num_threads)); 14711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator __target_end; 14811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } //single 14911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 15011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _ThreadIndex __iam = omp_get_thread_num(); 15111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert if (__iam == 0) 15211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 15311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *__result = *__begin; 15411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __parallel_partial_sum_basecase(__begin + 1, 15511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __begin + __borders[1], 15611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __result + 1, 15711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __bin_op, *__begin); 15811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert ::new(&(__sums[__iam])) _ValueType(*(__result + __borders[1] - 1)); 15911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 16011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert else 16111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 16211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert ::new(&(__sums[__iam])) 16311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _ValueType(__gnu_parallel::accumulate( 16411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __begin + __borders[__iam] + 1, 16511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __begin + __borders[__iam + 1], 16611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert *(__begin + __borders[__iam]), 16711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __bin_op, 16811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __gnu_parallel::sequential_tag())); 16911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 17011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 17111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert# pragma omp barrier 17211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 17311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert# pragma omp single 17411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __parallel_partial_sum_basecase(__sums + 1, __sums + __num_threads, 17511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __sums + 1, __bin_op, __sums[0]); 17611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 17711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert# pragma omp barrier 17811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 17911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert // Still same team. 18011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __parallel_partial_sum_basecase(__begin + __borders[__iam + 1], 18111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __begin + __borders[__iam + 2], 18211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __result + __borders[__iam + 1], 18311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __bin_op, __sums[__iam]); 18411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } //parallel 18511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 18611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert for (_ThreadIndex __i = 0; __i < __num_threads; ++__i) 18711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __sums[__i].~_ValueType(); 18811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert ::operator delete(__sums); 18911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 19011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert delete[] __borders; 19111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 19211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __result + __n; 19311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 19411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 19511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert /** @brief Parallel partial sum front-__end. 19611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __begin Begin iterator of input sequence. 19711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __end End iterator of input sequence. 19811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __result Begin iterator of output sequence. 19911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @param __bin_op Associative binary function. 20011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert * @return End iterator of output sequence. */ 20111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert template<typename _IIter, 20211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename _OutputIterator, 20311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typename _BinaryOperation> 20411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator 20511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __parallel_partial_sum(_IIter __begin, _IIter __end, 20611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _OutputIterator __result, _BinaryOperation __bin_op) 20711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 20811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _GLIBCXX_CALL(__begin - __end) 20911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef std::iterator_traits<_IIter> _TraitsType; 21111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef typename _TraitsType::value_type _ValueType; 21211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert typedef typename _TraitsType::difference_type _DifferenceType; 21311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _DifferenceType __n = __end - __begin; 21511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 21611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert switch (_Settings::get().partial_sum_algorithm) 21711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert { 21811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert case LINEAR: 21911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert // Need an initial offset. 22011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __parallel_partial_sum_linear(__begin, __end, __result, 22111cd02dfb91661c65134cac258cf5924270e9d2Dan Albert __bin_op, __n); 22211cd02dfb91661c65134cac258cf5924270e9d2Dan Albert default: 22311cd02dfb91661c65134cac258cf5924270e9d2Dan Albert // Partial_sum algorithm not implemented. 22411cd02dfb91661c65134cac258cf5924270e9d2Dan Albert _GLIBCXX_PARALLEL_ASSERT(0); 22511cd02dfb91661c65134cac258cf5924270e9d2Dan Albert return __result + __n; 22611cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 22711cd02dfb91661c65134cac258cf5924270e9d2Dan Albert } 22811cd02dfb91661c65134cac258cf5924270e9d2Dan Albert} 22911cd02dfb91661c65134cac258cf5924270e9d2Dan Albert 23011cd02dfb91661c65134cac258cf5924270e9d2Dan Albert#endif /* _GLIBCXX_PARALLEL_PARTIAL_SUM_H */ 231