/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <limits>

#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/context.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
#include "tensorflow/contrib/lite/kernels/gemm_support.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include "tensorflow/contrib/lite/kernels/op_macros.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace fully_connected {

// This file has four implementations of FullyConnected.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
  kPie,  // Used by the PIE team
};

struct OpData {
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;
  // The range of the fused activation layer. For example, for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
};
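
// Illustrative example of the representation above (numbers are approximate
// and for explanation only): a real multiplier of 0.2 equals 0.8 * 2^-2, so
// QuantizeMultiplierSmallerThanOne() would produce an output_multiplier of
// roughly round(0.8 * 2^31) = 1717986918 together with a shift of 2, letting
// the kernel rescale accumulators with an integer multiply plus a rounding
// shift instead of a floating-point multiply.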

constexpr int kInputTensor = 0;
constexpr int kWeightsTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // This is a builtin op, so we don't use the contents in 'buffer', if any.
  // Instead, we allocate a new object to carry information from Prepare() to
  // Eval().
  gemm_support::IncrementUsageCounter(context);
  return new OpData;
}

void Free(TfLiteContext* context, void* buffer) {
  gemm_support::DecrementUsageCounter(context);
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  // Check we have all the inputs and outputs we need.
  TF_LITE_ENSURE_EQ(context, node->inputs->size, 3);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);

  TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  // Check that the tensor parameters are consistent with each other and with
  // the input configuration.
  int input_size = 1;
  for (int i = 0; i < input->dims->size; i++) {
    input_size *= input->dims->data[i];
  }

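  // The weights tensor is expected to have shape [num_units, input_size]; any
  // extra leading dimensions of the input are folded into the batch.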
  const int batch_size = input_size / filter->dims->data[1];
  const int num_units = filter->dims->data[0];

  TF_LITE_ASSERT_EQ(input_size, batch_size * filter->dims->data[1]);
  if (bias) {
    TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
  }

  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2);
  if (bias) {
    TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
  }

  // Note that quantized inference requires that all tensors have their
  // parameters set. This is usually done during quantized training.
  TfLiteType data_type = input->type;
  if (data_type != kTfLiteFloat32) {
    double real_multiplier = 0.0;
    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
        context, input, filter, bias, output, &real_multiplier));
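    // real_multiplier is input_scale * filter_scale / output_scale, i.e. the
    // factor needed to map accumulator values into the output's quantized
    // range. It is re-expressed below as an integer multiplier plus a shift
    // so no floating-point math is needed at inference time.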
    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
                                     &data->output_shift);
    CalculateActivationRangeUint8(params->activation, output,
                                  &data->output_activation_min,
                                  &data->output_activation_max);
  }

  // Resize output.
  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
  output_size_array->data[0] = batch_size;
  output_size_array->data[1] = num_units;
  TF_LITE_ENSURE_OK(context,
                    context->ResizeTensor(context, output, output_size_array));
  return kTfLiteOk;
}

TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
                     TfLiteFullyConnectedParams* params, OpData* data,
                     TfLiteTensor* input, TfLiteTensor* filter,
                     TfLiteTensor* bias, TfLiteTensor* output) {
  int total_input_size = 1;
  for (int i = 0; i < input->dims->size; i++) {
    total_input_size *= input->dims->data[i];
  }

  int input_size = filter->dims->data[1];
  const int batch_size = total_input_size / filter->dims->data[1];
  const int num_units = filter->dims->data[0];
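
  // Shapes at this point: input is treated as [batch_size, input_size],
  // filter as [num_units, input_size], and output as [batch_size, num_units],
  // so the steps below compute output = activation(input * filter^T + bias).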

  // Output = bias if bias tensor exists.
  if (bias) {
    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
                                          output->data.f);
  } else {
    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
  }

  // Compute output += weight * input
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      filter->data.f, num_units, input_size, input->data.f, batch_size,
      output->data.f, /*result_stride=*/1);

  // Apply activation function
  tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
                                        params->activation, output->data.f);

  return kTfLiteOk;
}

#define TF_LITE_MACRO_DISPATCH(macro_name, params, target_namespace) \
  if (params->activation == kTfLiteActNone) {                        \
    macro_name(target_namespace, kNone);                             \
  }                                                                  \
  if (params->activation == kTfLiteActRelu) {                        \
    macro_name(target_namespace, kRelu);                             \
  }                                                                  \
  if (params->activation == kTfLiteActRelu6) {                       \
    macro_name(target_namespace, kRelu6);                            \
  }

template <KernelType kernel_type>
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                           TfLiteFullyConnectedParams* params, OpData* data,
                           TfLiteTensor* input, TfLiteTensor* filter,
                           TfLiteTensor* bias, TfLiteTensor* output) {
  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);

  int32_t input_offset = -input->params.zero_point;
  int32_t filter_offset = -filter->params.zero_point;
  int32_t output_offset = output->params.zero_point;
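  // The negated input/filter zero points are passed as offsets so the kernel
  // effectively operates on (quantized_value - zero_point); the output zero
  // point is added back after rescaling.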
#define TF_LITE_FULLY_CONNECTED(type)                                       \
  type::FullyConnected(                                                     \
      GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,    \
      GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, \
      GetTensorData<int32_t>(bias), GetTensorDims(bias), output_offset,     \
      data->output_multiplier, data->output_shift,                          \
      data->output_activation_min, data->output_activation_max,             \
      GetTensorData<uint8_t>(output), GetTensorDims(output), gemm_context)
  if (kernel_type == kReference) {
    TF_LITE_FULLY_CONNECTED(reference_ops);
  } else if (kernel_type == kPie) {
    // TODO(ahentz): we don't have a quantized version of the PIE kernels, so
    // we just defer to the MINI ones.
    TF_LITE_FULLY_CONNECTED(optimized_ops);
  } else {
    TF_LITE_FULLY_CONNECTED(optimized_ops);
  }
#undef TF_LITE_FULLY_CONNECTED

  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
                       TfLiteFullyConnectedParams* params, OpData* data,
                       TfLiteTensor* input, TfLiteTensor* filter,
                       TfLiteTensor* bias, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                &output_activation_max);
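  // For example, kTfLiteActRelu gives [0, +inf), kTfLiteActRelu6 gives [0, 6],
  // and kTfLiteActNone leaves the range unbounded; the kernels clamp their
  // output to this range.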
#define TF_LITE_FULLY_CONNECTED(type)                                       \
  type::FullyConnected(GetTensorData<float>(input), GetTensorDims(input),   \
                       GetTensorData<float>(filter), GetTensorDims(filter), \
                       GetTensorData<float>(bias), GetTensorDims(bias),     \
                       output_activation_min, output_activation_max,        \
                       GetTensorData<float>(output), GetTensorDims(output))
  if (kernel_type == kReference) {
    TF_LITE_FULLY_CONNECTED(reference_ops);
  } else if (kernel_type == kPie) {
    return EvalPie(context, node, params, data, input, filter, bias, output);
  } else {
    TF_LITE_FULLY_CONNECTED(optimized_ops);
  }
#undef TF_LITE_FULLY_CONNECTED

  return kTfLiteOk;
}

#undef TF_LITE_MACRO_DISPATCH

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params =
      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* input = GetInput(context, node, kInputTensor);
  TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

  switch (input->type) {  // Already know in/out types are same.
    case kTfLiteFloat32:
      return EvalFloat<kernel_type>(context, node, params, data, input, filter,
                                    bias, output);
    case kTfLiteUInt8:
      return EvalQuantized<kernel_type>(context, node, params, data, input,
                                        filter, bias, output);
    default:
      context->ReportError(context, "Type not currently supported.");
      return kTfLiteError;
  }
  return kTfLiteOk;
}

}  // namespace fully_connected

TfLiteRegistration* Register_FULLY_CONNECTED_REF() {
  static TfLiteRegistration r = {
      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
      fully_connected::Eval<fully_connected::kReference>};
  return &r;
}

TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT() {
  static TfLiteRegistration r = {
      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
      fully_connected::Eval<fully_connected::kNeonOptimized>};
  return &r;
}

TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
  static TfLiteRegistration r = {
      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
      fully_connected::Eval<fully_connected::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_FULLY_CONNECTED_PIE() {
  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
                                 fully_connected::Prepare,
                                 fully_connected::Eval<fully_connected::kPie>};
  return &r;
}

TfLiteRegistration* Register_FULLY_CONNECTED() {
  // TODO(ahentz): We don't have a dedicated quantized version of the PIE
  // kernel. For now, the quantized version just defers to the corresponding
  // optimized MINI kernel. At some point we will allow different libraries to
  // be built with different kernels, but for now we have to pick one here.
  return Register_FULLY_CONNECTED_PIE();
#ifdef USE_NEON
  return Register_FULLY_CONNECTED_NEON_OPT();
#else
  return Register_FULLY_CONNECTED_GENERIC_OPT();
#endif
}
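
// A rough usage sketch (for illustration only; BuiltinOpResolver in
// kernels/register.cc already wires this registration up):
//
//   tflite::MutableOpResolver resolver;
//   resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED,
//                       tflite::ops::builtin::Register_FULLY_CONNECTED());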

}  // namespace builtin
}  // namespace ops
}  // namespace tflite