1a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 3a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Licensed under the Apache License, Version 2.0 (the "License"); 4a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// you may not use this file except in compliance with the License. 5a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// You may obtain a copy of the License at 6a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 7a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// http://www.apache.org/licenses/LICENSE-2.0 8a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// 9a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Unless required by applicable law or agreed to in writing, software 10a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// distributed under the License is distributed on an "AS IS" BASIS, 11a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// See the License for the specific language governing permissions and 13a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// limitations under the License. 14a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 15a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 16a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 17a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 18a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include "multi_thread_common.h" 19a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include "single_thread_transform.h" 20a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 21a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace gemmlowp { 22a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace meta { 23a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace internal { 24a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 25a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangconst int kTransformTaskOverhead = 128000; 26a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangconst int kMinTransformTaskSize = 32000; 27a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 28a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename MultiThreadingContext, typename Params> 29a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline bool PrepareTransform1DTasks(MultiThreadingContext* context, 30a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const Params& params, int kernel_size, 31a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::vector<Params>* task_params) { 32a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef Transform1DUtil<typename Params::InType, typename Params::OutType, 33a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typename Params::Kernel> 34a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Util; 35a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 36a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int max_threads = ResolveMaxThreads(context->max_num_threads()); 37a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int task_size = Util::EstimateComputeCost(params.kernel); 38a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int max_tasks_by_size = 39a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang (task_size - kTransformTaskOverhead) / kMinTransformTaskSize; 40a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 41a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int real_tasks = std::max(1, std::min(max_threads, max_tasks_by_size)); 42a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 43a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (real_tasks == 1) { 44a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return false; 45a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 46a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 47a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int chunk = params.kernel.count / real_tasks; 48a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang for (int i = 0; i < real_tasks - 1; ++i) { 49a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task_params->push_back(params); 50a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Params& task = task_params->back(); 51a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task.kernel.count = chunk; 52a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task.input = Util::OffsetInput(params.kernel, params.input, i * chunk); 53a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task.output = Util::OffsetOutput(params.kernel, params.output, i * chunk); 54a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 55a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task_params->push_back(params); 56a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Params& task = task_params->back(); 57a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const int sum_chunk = (real_tasks - 1) * chunk; 58a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task.kernel.count = params.kernel.count - sum_chunk; 59a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task.input = Util::OffsetInput(params.kernel, params.input, sum_chunk); 60a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang task.output = Util::OffsetOutput(params.kernel, params.output, sum_chunk); 61a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return true; 62a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 63a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 64a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Params, int kernel_size> 65a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct Transform1DTaskRunner : gemmlowp::Task { 66a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Transform1DTaskRunner(const Params& params) : params(params) {} 67a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 68a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang void Run() override { Transform1D<Params, kernel_size>(params); } 69a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 70a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Params params; 71a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}; 72a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 73a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} // namespace internal 74a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 75a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename MultiThreadingContext, typename Params, int kernel_size> 76a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MultiThreadTransform1D(MultiThreadingContext* context, 77a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const Params& params) { 78a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang typedef internal::Transform1DTaskRunner<Params, kernel_size> TaskRunnerType; 79a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 80a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::vector<Params> task_params; 81a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang if (!internal::PrepareTransform1DTasks<MultiThreadingContext, Params>( 82a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang context, params, kernel_size, &task_params)) { 83a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang Transform1D<Params, kernel_size>(params); 84a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang return; 85a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang } 86a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 87a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang auto workers_pool = context->workers_pool(); 88a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::vector<Task*> tasks; 89a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::for_each(task_params.begin(), task_params.end(), [tasks](Params* param) { 90a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang tasks.push_back(new TaskRunnerType(param)); 91a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang }); 92a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang workers_pool->Execute(tasks); 93a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} 94a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 95a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} // namespace meta 96a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang} // namespace gemmlowp 97a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang 98a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif // GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 99