1a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
3a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Licensed under the Apache License, Version 2.0 (the "License");
4a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// you may not use this file except in compliance with the License.
5a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// You may obtain a copy of the License at
6a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
7a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//     http://www.apache.org/licenses/LICENSE-2.0
8a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang//
9a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// Unless required by applicable law or agreed to in writing, software
10a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// distributed under the License is distributed on an "AS IS" BASIS,
11a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// See the License for the specific language governing permissions and
13a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang// limitations under the License.
14a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
15a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#ifndef GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_
16a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#define GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_
17a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
18a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include "multi_thread_common.h"
19a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#include "single_thread_transform.h"
20a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
21a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace gemmlowp {
22a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace meta {
23a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangnamespace internal {
24a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
// Amount subtracted from a transform's estimated compute cost before the
// remaining cost is divided into tasks (see PrepareTransform1DTasks).
constexpr int kTransformTaskOverhead = 128000;

// Divisor applied to the remaining estimated cost to obtain the upper bound
// on the number of tasks (see PrepareTransform1DTasks).
constexpr int kMinTransformTaskSize = 32000;
27a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
28a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename MultiThreadingContext, typename Params>
29a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline bool PrepareTransform1DTasks(MultiThreadingContext* context,
30a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                    const Params& params, int kernel_size,
31a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                    std::vector<Params>* task_params) {
32a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef Transform1DUtil<typename Params::InType, typename Params::OutType,
33a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                          typename Params::Kernel>
34a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      Util;
35a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
36a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int max_threads = ResolveMaxThreads(context->max_num_threads());
37a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int task_size = Util::EstimateComputeCost(params.kernel);
38a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int max_tasks_by_size =
39a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang      (task_size - kTransformTaskOverhead) / kMinTransformTaskSize;
40a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
41a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int real_tasks = std::max(1, std::min(max_threads, max_tasks_by_size));
42a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
43a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  if (real_tasks == 1) {
44a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    return false;
45a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
46a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
47a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int chunk = params.kernel.count / real_tasks;
48a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  for (int i = 0; i < real_tasks - 1; ++i) {
49a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    task_params->push_back(params);
50a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    Params& task = task_params->back();
51a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    task.kernel.count = chunk;
52a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    task.input = Util::OffsetInput(params.kernel, params.input, i * chunk);
53a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    task.output = Util::OffsetOutput(params.kernel, params.output, i * chunk);
54a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
55a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  task_params->push_back(params);
56a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  Params& task = task_params->back();
57a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  const int sum_chunk = (real_tasks - 1) * chunk;
58a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  task.kernel.count = params.kernel.count - sum_chunk;
59a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  task.input = Util::OffsetInput(params.kernel, params.input, sum_chunk);
60a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  task.output = Util::OffsetOutput(params.kernel, params.output, sum_chunk);
61a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  return true;
62a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
63a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
64a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename Params, int kernel_size>
65a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangstruct Transform1DTaskRunner : gemmlowp::Task {
66a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  Transform1DTaskRunner(const Params& params) : params(params) {}
67a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
68a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  void Run() override { Transform1D<Params, kernel_size>(params); }
69a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
70a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  Params params;
71a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang};
72a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
73a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}  // namespace internal
74a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
75a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wangtemplate <typename MultiThreadingContext, typename Params, int kernel_size>
76a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wanginline void MultiThreadTransform1D(MultiThreadingContext* context,
77a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang                                   const Params& params) {
78a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  typedef internal::Transform1DTaskRunner<Params, kernel_size> TaskRunnerType;
79a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
80a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::vector<Params> task_params;
81a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  if (!internal::PrepareTransform1DTasks<MultiThreadingContext, Params>(
82a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang          context, params, kernel_size, &task_params)) {
83a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    Transform1D<Params, kernel_size>(params);
84a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    return;
85a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  }
86a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
87a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  auto workers_pool = context->workers_pool();
88a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::vector<Task*> tasks;
89a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  std::for_each(task_params.begin(), task_params.end(), [tasks](Params* param) {
90a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    tasks.push_back(new TaskRunnerType(param));
91a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  });
92a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang  workers_pool->Execute(tasks);
93a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}
94a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
95a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}  // namespace meta
96a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang}  // namespace gemmlowp
97a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang
98a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang#endif  // GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_
99