// depthwiseconv_float.h revision 4d90af18b92eb804bce2c334e718fddc691df28e
1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
16#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
17
18#include "public/gemmlowp.h"
19#include "tensorflow/contrib/lite/kernels/internal/common.h"
20#include "tensorflow/contrib/lite/kernels/internal/types.h"
21
22namespace tflite {
23namespace optimized_ops {
24
// Implementation of float DepthwiseConv

// Kernel dispatch point for the inner loop of float DepthwiseConv.
// Template parameters:
//  - kAllowStrided: whether the kernel supports a runtime input stride via
//    input_ptr_increment (when false, input pixels are read contiguously).
//  - kFixedInputDepth: the input depth this kernel is specialized for, or 0
//    when the kernel handles a runtime-variable input depth.
//  - kFixedDepthMultiplier: the depth multiplier this kernel is specialized
//    for, or 0 when it is handled at runtime.
// The primary template is intentionally empty: only the explicit
// specializations below provide a Run() method, so instantiating an
// unsupported parameter combination has no Run to call.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel {};
29
30#ifdef USE_NEON
31
32template <>
33struct FloatDepthwiseConvKernel<false, 8, 1> {
34  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
35                  const float* input_ptr, int input_ptr_increment,
36                  const float* filter_ptr, float* acc_buffer_ptr) {
37    // Load the filters
38    float32x4_t filter[2];
39    for (int i = 0; i < 2; i++) {
40      filter[i] = vld1q_f32(filter_ptr + 4 * i);
41    }
42    int outp = 0;
43    // Handle 2 output pixels at a time.
44    for (; outp <= num_output_pixels - 2; outp += 2) {
45      // Load the inputs
46      float32x4_t input[4];
47      for (int i = 0; i < 4; i++) {
48        input[i] = vld1q_f32(input_ptr + 4 * i);
49      }
50      input_ptr += 16;
51      // Load the accumulators from acc_buffer
52      float32x4_t acc[4];
53      for (int i = 0; i < 4; i++) {
54        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
55      }
56      // Multiply-accumulate
57      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
58      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
59      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
60      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
61      // Store the accumulators back to acc_buffer
62      for (int i = 0; i < 4; i++) {
63        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
64      }
65      acc_buffer_ptr += 16;
66    }
67    // Handle one output pixel at a time.
68    for (; outp < num_output_pixels; outp++) {
69      // Load the inputs
70      float32x4_t input[2];
71      for (int i = 0; i < 2; i++) {
72        input[i] = vld1q_f32(input_ptr + 4 * i);
73      }
74      input_ptr += 8;
75      // Load the accumulators from acc_buffer
76      float32x4_t acc[2];
77      for (int i = 0; i < 2; i++) {
78        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
79      }
80      // Multiply-accumulate
81      for (int i = 0; i < 2; i++) {
82        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
83      }
84      // Store the accumulators back to acc_buffer
85      for (int i = 0; i < 2; i++) {
86        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
87      }
88      acc_buffer_ptr += 8;
89    }
90  }
91};
92
// input_depth == 2, depth_multiplier == 1, contiguous (unstrided) input.
template <>
struct FloatDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    const float32x2_t filters = vld1_f32(filter_ptr);
    // Duplicate the 2-wide filter into all four lanes ([f0,f1,f0,f1]) so a
    // single q-register multiply-accumulate covers two output pixels.
    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 4; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      const float32x4_t input = vld1q_f32(input_ptr);
      input_ptr += 4;
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filters_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time: here only the original 2-lane filter
    // register is needed.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float32x2_t input = vld1_f32(input_ptr);
      input_ptr += 2;
      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmla_f32(acc, input, filters);
      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
175
// Runtime-variable input_depth, depth_multiplier == 1, strided input
// supported. Filters are reloaded per pixel since the depth is not fixed.
template <>
struct FloatDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters
        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
        local_filter_ptr += 16;
        // Load the inputs
        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
        local_input_ptr += 16;
        // Load the accumulators from acc_buffer
        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
        // Multiply-accumulate
        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x4_t filter;
        filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        float32x4_t input;
        input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x4_t acc;
        acc = vld1q_f32(acc_buffer_ptr);
        // Multiply-accumulate
        acc = vmlaq_f32(acc, input, filter);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr, acc);
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time (scalar leftover).
      for (; ic < input_depth; ic++) {
        const float input_val = *local_input_ptr++;
        const float filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += filter_val * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
246
// Runtime-variable input_depth, depth_multiplier == 8, strided input
// supported: every input channel fans out into 8 consecutive outputs,
// so the filter holds 8 consecutive coefficients per input channel.
template <>
struct FloatDepthwiseConvKernel<true, 0, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 2 input channels at a time (16 filter values / 16 outputs).
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters: filter[0..1] belong to the first channel,
        // filter[2..3] to the second.
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs: one scalar per channel, in the two lanes.
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate: lane 0 scales the first channel's 8 outputs,
        // lane 1 the second channel's.
        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time (8 outputs).
      for (; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
313
// Runtime-variable input_depth, depth_multiplier == 2, strided input
// supported: each input channel produces 2 consecutive outputs.
// Note: this implementation is very slow for input_depths < 8
// (e.g. comparable to the reference implementation); see the
// specializations for input_depth=3 below.
template <>
struct FloatDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time (16 outputs).
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs. vzipq with itself duplicates each input value
        // into two adjacent lanes, matching the 2 filter taps per channel:
        // [a,b,c,d] -> val[0]=[a,a,b,b], val[1]=[c,c,d,d].
        float32x4x2_t input_dup2[2];
        for (int i = 0; i < 2; i++) {
          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
          input_dup2[i] = vzipq_f32(input, input);
        }
        local_input_ptr += 8;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time (8 outputs, one d-register per
      // channel).
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x2_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float32x4_t input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x2_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate: input lane k scales channel k's filter pair.
        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle 2 input channels at a time (4 outputs).
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters: low half is channel 0's taps, high half
        // channel 1's.
        const float32x4_t filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time (scalar leftover, 2 outputs).
      for (; ic < input_depth; ic++) {
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
        }
        local_filter_ptr += 2;
        acc_buffer_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
422
// input_depth == 3, depth_multiplier == 2, strided input supported:
// 3 input channels, 6 outputs per pixel. The 6 filter taps (2 per channel)
// are hoisted out of the pixel loop.
template <>
struct FloatDepthwiseConvKernel<true, 3, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x2_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1_f32(filter_ptr + 2 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Read the 3 input values as a pair plus a lane-duplicated third.
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x2_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
      }
      // Multiply-accumulate: each input channel produces 2 outputs.
      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
      }
      acc_buffer_ptr += 6;
      input_ptr += input_ptr_increment;
    }
  }
};
455
// input_depth == 3, depth_multiplier == 4, strided input supported:
// 3 input channels, 12 outputs per pixel (4 per channel).
template <>
struct FloatDepthwiseConvKernel<true, 3, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters: one q-register of 4 taps per input channel.
    float32x4_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // NOTE: we only want 3 values, so we read it as two ops where
      // the second op just duplicates the lane
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x4_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate all outputs.
      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 12;
      input_ptr += input_ptr_increment;
    }
  }
};
490
491template <>
492struct FloatDepthwiseConvKernel<true, 1, 8> {
493  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
494                  const float* input_ptr, int input_ptr_increment,
495                  const float* filter_ptr, float* acc_buffer_ptr) {
496    // Load the filters
497    float32x4_t filter[2];
498    for (int i = 0; i < 2; i++) {
499      filter[i] = vld1q_f32(filter_ptr + 4 * i);
500    }
501    // Handle one output pixel at a time.
502    for (int outp = 0; outp < num_output_pixels; outp++) {
503      // Load the inputs
504      const float input_val = *input_ptr;
505      input_ptr += input_ptr_increment;
506      // Load the accumulators from acc_buffer
507      float32x4_t acc[2];
508      for (int i = 0; i < 2; i++) {
509        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
510      }
511      // Multiply-accumulate
512      for (int i = 0; i < 2; i++) {
513        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
514      }
515      // Store the accumulators back to acc_buffer
516      for (int i = 0; i < 2; i++) {
517        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
518      }
519      acc_buffer_ptr += 8;
520    }
521  }
522};
523
// input_depth == 1, depth_multiplier == 32: each output pixel scales the
// 32-wide filter (8 q-registers, loaded once) by one scalar input value.
// The body is written as fully-unrolled scalars rather than arrays/loops.
template <>
struct FloatDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};
575
// input_depth == 1, depth_multiplier == 20: each output pixel scales the
// 20-wide filter (5 q-registers, loaded once) by one scalar input value.
// Unrolled scalars, mirroring the depth_multiplier == 32 kernel above.
template <>
struct FloatDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};
615
// Runtime-variable input_depth, depth_multiplier == 16, strided input
// supported: each input channel fans out into 16 consecutive outputs, so
// filters cannot be hoisted and are reloaded per channel.
template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      for (int ic = 0; ic < input_depth; ic++) {
        // Load the filters: 16 taps for this input channel.
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 4; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
653
654template <>
655struct FloatDepthwiseConvKernel<true, 8, 1> {
656  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
657                  const float* input_ptr, int input_ptr_increment,
658                  const float* filter_ptr, float* acc_buffer_ptr) {
659    // Load the filters
660    float32x4_t filter[2];
661    for (int i = 0; i < 2; i++) {
662      filter[i] = vld1q_f32(filter_ptr + 4 * i);
663    }
664    // Handle one output pixel at a time.
665    for (int outp = 0; outp < num_output_pixels; outp++) {
666      // Load the inputs
667      float32x4_t input[2];
668      for (int i = 0; i < 2; i++) {
669        input[i] = vld1q_f32(input_ptr + 4 * i);
670      }
671      // Load the accumulators from acc_buffer
672      float32x4_t acc[2];
673      for (int i = 0; i < 2; i++) {
674        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
675      }
676      // Multiply-accumulate
677      for (int i = 0; i < 2; i++) {
678        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
679      }
680      // Store the accumulators back to acc_buffer
681      for (int i = 0; i < 2; i++) {
682        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
683      }
684      acc_buffer_ptr += 8;
685      input_ptr += input_ptr_increment;
686    }
687  }
688};
689
// input_depth == 2, depth_multiplier == 1, strided input supported.
template <>
struct FloatDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x2_t filter = vld1_f32(filter_ptr);
    // Duplicate the 2-wide filter into all four lanes so one q-register MLA
    // covers two output pixels.
    float32x4_t filter_x4 = vcombine_f32(filter, filter);
    int outp = 0;

    // Handle two output pixels at a time: gather both (strided) pixels into
    // one q-register.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x2_t input_1 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x2_t input_2 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x4_t input = vcombine_f32(input_1, input_2);

      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter_x4);

      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x2_t input = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;

      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmla_f32(acc, input, filter);

      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
736
737template <>
738struct FloatDepthwiseConvKernel<true, 4, 1> {
739  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
740                  const float* input_ptr, int input_ptr_increment,
741                  const float* filter_ptr, float* acc_buffer_ptr) {
742    float32x4_t filter = vld1q_f32(filter_ptr);
743
744    // Handle one output pixel at a time.
745    for (int outp = 0; outp < num_output_pixels; outp++) {
746      // Load the inputs
747      float32x4_t input = vld1q_f32(input_ptr);
748      // Load the accumulators from acc_buffer
749      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
750      // Multiply-accumulate
751      acc = vmlaq_f32(acc, input, filter);
752      // Store the accumulators back to acc_buffer
753      vst1q_f32(acc_buffer_ptr, acc);
754      acc_buffer_ptr += 4;
755      input_ptr += input_ptr_increment;
756    }
757  }
758};
759#endif
760
761// Accumulates the effect of one row of the filter, on a segment of one row
762// of the output, accessing the corresponding one row of the input.
763template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
764void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
765                                const float* input_data, int pad_width,
766                                int depth_multiplier, int filter_width,
767                                const float* filter_data,
768                                int out_x_buffer_start, int out_x_buffer_end,
769                                int output_depth, float* acc_buffer) {
770#ifdef GEMMLOWP_PROFILING
771  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
772#endif
773  // Sanity check parameters. This is important in particular to ensure
774  // that we keep the number of template instantiations minimal, so we don't
775  // increase binary size unnecessarily.
776  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
777  static_assert(kFixedInputDepth || kAllowStrided, "");
778  TFLITE_DCHECK(stride == 1 || kAllowStrided);
779  if (kFixedInputDepth) {
780    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
781  }
782  if (kFixedDepthMultiplier) {
783    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
784  }
785  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
786  const int input_ptr_increment = stride * input_depth;
787  const float* filter_base_ptr = filter_data;
788  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
789    // For the current (filter_x, filter_y) point in the filter,
790    // compute the boundaries of the corresponding output row segment.
791    int out_x_loop_start_unclampled = 0;
792    int out_x_loop_end_unclampled = 0;
793    if (kAllowStrided) {
794      if (stride == 2) {
795        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
796        out_x_loop_end_unclampled =
797            (pad_width + input_width - filter_x + 1) / 2;
798      } else if (stride == 4) {
799        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
800        out_x_loop_end_unclampled =
801            (pad_width + input_width - filter_x + 3) / 4;
802      } else {
803        out_x_loop_start_unclampled =
804            (pad_width - filter_x + stride - 1) / stride;
805        out_x_loop_end_unclampled =
806            (pad_width + input_width - filter_x + stride - 1) / stride;
807      }
808    } else {
809      out_x_loop_start_unclampled = pad_width - filter_x;
810      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
811    }
812    // The kernel will have to iterate on the segment of the
813    // output row that starts at out_x_loop_start and out_x_loop_end.
814    const int out_x_loop_start =
815        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
816    const int out_x_loop_end =
817        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
818
819    float* acc_buffer_ptr =
820        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
821    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
822    const float* input_ptr = input_data + in_x_origin * input_depth;
823    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
824    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
825                             kFixedDepthMultiplier>::Run(num_output_pixels,
826                                                         input_depth,
827                                                         depth_multiplier,
828                                                         input_ptr,
829                                                         input_ptr_increment,
830                                                         filter_base_ptr,
831                                                         acc_buffer_ptr);
832    filter_base_ptr += output_depth;
833  }
834}
835
836// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
837inline void FloatDepthwiseConvAccumRowGeneric(
838    int stride, int input_depth, int input_width, const float* input_data,
839    int pad_width, int depth_multiplier, int filter_width,
840    const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
841    int output_depth, float* acc_buffer) {
842  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
843#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
844#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
845  LOG(FATAL)
846      << "\n\n"
847      << "*****************************************************************\n"
848      << "* This tfmini inference code was about to use the slow generic\n"
849      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
850      << "* to be aware of that so that you will know why you get terrible\n"
851      << "* performance.\n"
852      << "*\n"
853      << "* If you would like to carry on with the slow code, compile\n"
854      << "* with this preprocessor token defined:\n"
855      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
856      << "*\n"
857      << "* The right thing to do, if you care about performance, is to add\n"
858      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
859      << "* The relevant parameters defining your case are:\n"
860      << "* stride = " << stride << "\n"
861      << "* input_depth = " << input_depth << "\n"
862      << "* depth_multiplier = " << depth_multiplier << "\n"
863      << "*\n"
864      << "* Please do not hesitate to contact benoitjacob@ with this\n"
865      << "* information.\n"
866      << "*****************************************************************\n";
867#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
868#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
869  const float* filter_base_ptr = filter_data;
870  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
871    const int out_x_loop_start = std::max(
872        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
873    const int out_x_loop_end =
874        std::min(out_x_buffer_end,
875                 (pad_width + input_width - filter_x + stride - 1) / stride);
876
877    float* acc_buffer_ptr =
878        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
879    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
880    const float* input_ptr = input_data + in_x_origin * input_depth;
881    const int input_ptr_increment = (stride - 1) * input_depth;
882    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
883      const float* filter_ptr = filter_base_ptr;
884      for (int ic = 0; ic < input_depth; ++ic) {
885        const float input_val = *input_ptr++;
886        for (int m = 0; m < depth_multiplier; m++) {
887          const float filter_val = *filter_ptr++;
888          *acc_buffer_ptr++ += filter_val * input_val;
889        }
890      }
891      input_ptr += input_ptr_increment;
892    }
893    filter_base_ptr += output_depth;
894  }
895}
896
// Initializes the accumulator buffer with bias values.
//
// Replicates the per-channel bias vector (output_depth floats) into the slot
// of each of the num_output_pixels pixels in acc_buffer, so the accumulation
// loops do not need a separate bias-add pass afterwards.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float* bias_data,
                                       float* acc_buffer) {
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  float* dst = acc_buffer;
  for (int i = 0; i < num_output_pixels; ++i, dst += output_depth) {
    memcpy(dst, bias_data, sizeof(acc_buffer[0]) * output_depth);
  }
}
909
// Float DepthwiseConv over a whole 4D tensor: accumulates filter taps into a
// per-segment bias-initialized buffer via a kernel selected at runtime, then
// clamps each value to [output_activation_min, output_activation_max] and
// writes it to output_data.
//
// Dimension convention, as read off the ArraySize calls below: Dims<4>
// index 0 is depth/channel, 1 is width, 2 is height, 3 is batch.
inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                          const float* filter_data, const Dims<4>& filter_dims,
                          const float* bias_data, const Dims<4>& bias_dims,
                          int stride_width, int stride_height, int pad_width,
                          int pad_height, int depth_multiplier,
                          float output_activation_min,
                          float output_activation_max, float* output_data,
                          const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);

  // Fixed-size stack buffer of accumulators, covering as many whole output
  // pixels as fit. NOTE(review): the output_depth <= kAccBufferMaxSize
  // constraint is only enforced by a DCHECK; in builds where DCHECKs are
  // compiled out, a larger output_depth would overflow this stack buffer —
  // confirm callers guarantee output_depth <= 2048.
  static const int kAccBufferMaxSize = 2048;
  float acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

  // Binds row_accum_func to the first (most preferred) kernel specialization
  // matching this op's stride / input_depth / depth_multiplier.
  // FIXED_INPUT_DEPTH == 0 means the kernel accepts any input depth.
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                   FIXED_DEPTH_MULTIPLIER>;               \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth,
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)

#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
  }

  // Now that we have determined row_accum_func, we can start work.
  float* output_ptr = output_data;
  for (int b = 0; b < batches; ++b) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      // Restrict the filter rows to those whose input row is in bounds;
      // rows that fall entirely in the padding contribute nothing and are
      // skipped (equivalent to zero padding).
      const int filter_y_start = std::max(0, -in_y_origin);
      const int filter_y_end =
          std::min(filter_height, input_height - in_y_origin);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activation that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + filter_y;
          row_accum_func(stride_width, input_depth, input_width,
                         input_data + in_y * input_dims.strides[2] +
                             b * input_dims.strides[3],
                         pad_width, depth_multiplier, filter_width,
                         filter_data + filter_y * filter_dims.strides[2],
                         out_x_buffer_start, out_x_buffer_end, output_depth,
                         acc_buffer);
        }
        // Finished accumulating. Now store to destination, clamping each
        // value to the fused activation range on the way out.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
// TODO(benoitjacob) optimized code goes here
#ifdef USE_NEON
        // Handle 16 values at a time
        for (; i <= num_output_values - 16; i += 16) {
          float32x4_t acc[4];
          for (int k = 0; k < 4; k++) {
            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
          }
          for (int k = 0; k < 4; k++) {
            acc[k] = vmaxq_f32(
                vdupq_n_f32(output_activation_min),
                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
          }
          for (int k = 0; k < 4; k++) {
            vst1q_f32(output_ptr + 4 * k, acc[k]);
          }
          output_ptr += 16;
        }
        // Handle 4 values at a time
        for (; i <= num_output_values - 4; i += 4) {
          float32x4_t acc = vld1q_f32(acc_buffer + i);

          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
                          vminq_f32(vdupq_n_f32(output_activation_max), acc));

          vst1q_f32(output_ptr, acc);
          output_ptr += 4;
        }
#endif
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          float acc = acc_buffer[i];
          acc = std::max(output_activation_min,
                         std::min(output_activation_max, acc));

          *output_ptr++ = acc;
        }
      }
    }
  }
}
1069
1070// legacy, for compatibility with old checked-in code
1071template <FusedActivationFunctionType Ac>
1072void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
1073                   const float* filter_data, const Dims<4>& filter_dims,
1074                   const float* bias_data, const Dims<4>& bias_dims,
1075                   int stride_width, int stride_height, int pad_width,
1076                   int pad_height, int depth_multiplier, float* output_data,
1077                   const Dims<4>& output_dims) {
1078  float output_activation_min, output_activation_max;
1079  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
1080  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
1081                bias_dims, stride_width, stride_height, pad_width, pad_height,
1082                depth_multiplier, output_activation_min, output_activation_max,
1083                output_data, output_dims);
1084}
1085
1086// legacy, for compatibility with old checked-in code
1087template <FusedActivationFunctionType Ac>
1088void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
1089                   const float* filter_data, const Dims<4>& filter_dims,
1090                   const float* bias_data, const Dims<4>& bias_dims, int stride,
1091                   int pad_width, int pad_height, int depth_multiplier,
1092                   float* output_data, const Dims<4>& output_dims) {
1093  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
1094                    bias_dims, stride, stride, pad_width, pad_height,
1095                    depth_multiplier, output_data, output_dims);
1096}
1097
1098}  // namespace optimized_ops
1099}  // namespace tflite
1100
1101#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
1102