/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
// NEON implementations of Image methods for compatible devices.  Control
// should never enter this compilation unit on incompatible devices.
18a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
19a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#ifdef __ARM_NEON
20a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
21a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#include <arm_neon.h>
22a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
23a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower#include <stdint.h>
24a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
25a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
26a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#include "tensorflow/examples/android/jni/object_tracking/image.h"
27a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#include "tensorflow/examples/android/jni/object_tracking/image_utils.h"
28a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#include "tensorflow/examples/android/jni/object_tracking/utils.h"
29a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
30a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harpnamespace tf_tracking {
31a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
32a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// This function does the bulk of the work.
33a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harptemplate <>
34a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlowervoid Image<uint8_t>::Downsample2x32ColumnsNeon(const uint8_t* const original,
35a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower                                               const int stride,
36a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower                                               const int orig_x) {
37a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Divide input x offset by 2 to find output offset.
38a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  const int new_x = orig_x >> 1;
39a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
40a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Initial offset into top row.
41a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower  const uint8_t* offset = original + orig_x;
42a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
43a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // This points to the leftmost pixel of our 8 horizontally arranged
44a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // pixels in the destination data.
45a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower  uint8_t* ptr_dst = (*this)[0] + new_x;
46a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
47a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Sum along vertical columns.
48a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Process 32x2 input pixels and 16x1 output pixels per iteration.
49a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  for (int new_y = 0; new_y < height_; ++new_y) {
50a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    uint16x8_t accum1 = vdupq_n_u16(0);
51a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    uint16x8_t accum2 = vdupq_n_u16(0);
52a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
53a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Go top to bottom across the four rows of input pixels that make up
54a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // this output row.
55a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    for (int row_num = 0; row_num < 2; ++row_num) {
56a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      // First 16 bytes.
57a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      {
58a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Load 16 bytes of data from current offset.
59a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        const uint8x16_t curr_data1 = vld1q_u8(offset);
60a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
61a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Pairwise add and accumulate into accum vectors (16 bit to account
62a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // for values above 255).
63a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        accum1 = vpadalq_u8(accum1, curr_data1);
64a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      }
65a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
66a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      // Second 16 bytes.
67a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      {
68a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Load 16 bytes of data from current offset.
69a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
70a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
71a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Pairwise add and accumulate into accum vectors (16 bit to account
72a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // for values above 255).
73a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        accum2 = vpadalq_u8(accum2, curr_data2);
74a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      }
75a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
76a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      // Move offset down one row.
77a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      offset += stride;
78a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    }
79a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
80a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Divide by 4 (number of input pixels per output
81a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // pixel) and narrow data from 16 bits per pixel to 8 bpp.
82a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    const uint8x8_t tmp_pix1 = vqshrn_n_u16(accum1, 2);
83a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    const uint8x8_t tmp_pix2 = vqshrn_n_u16(accum2, 2);
84a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
85a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Concatenate 8x1 pixel strips into 16x1 pixel strip.
86a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    const uint8x16_t allpixels = vcombine_u8(tmp_pix1, tmp_pix2);
87a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
88a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Copy all pixels from composite 16x1 vector into output strip.
89a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    vst1q_u8(ptr_dst, allpixels);
90a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
91a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    ptr_dst += stride_;
92a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
93a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp}
94a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
95a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// This function does the bulk of the work.
96a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harptemplate <>
97a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlowervoid Image<uint8_t>::Downsample4x32ColumnsNeon(const uint8_t* const original,
98a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower                                               const int stride,
99a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower                                               const int orig_x) {
100a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Divide input x offset by 4 to find output offset.
101a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  const int new_x = orig_x >> 2;
102a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
103a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Initial offset into top row.
104a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower  const uint8_t* offset = original + orig_x;
105a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
106a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // This points to the leftmost pixel of our 8 horizontally arranged
107a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // pixels in the destination data.
108a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower  uint8_t* ptr_dst = (*this)[0] + new_x;
109a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
110a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Sum along vertical columns.
111a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Process 32x4 input pixels and 8x1 output pixels per iteration.
112a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  for (int new_y = 0; new_y < height_; ++new_y) {
113a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    uint16x8_t accum1 = vdupq_n_u16(0);
114a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    uint16x8_t accum2 = vdupq_n_u16(0);
115a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
116a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Go top to bottom across the four rows of input pixels that make up
117a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // this output row.
118a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    for (int row_num = 0; row_num < 4; ++row_num) {
119a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      // First 16 bytes.
120a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      {
121a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Load 16 bytes of data from current offset.
122a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        const uint8x16_t curr_data1 = vld1q_u8(offset);
123a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
124a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Pairwise add and accumulate into accum vectors (16 bit to account
125a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // for values above 255).
126a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        accum1 = vpadalq_u8(accum1, curr_data1);
127a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      }
128a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
129a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      // Second 16 bytes.
130a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      {
131a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Load 16 bytes of data from current offset.
132a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
133a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
134a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // Pairwise add and accumulate into accum vectors (16 bit to account
135a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        // for values above 255).
136a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp        accum2 = vpadalq_u8(accum2, curr_data2);
137a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      }
138a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
139a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      // Move offset down one row.
140a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      offset += stride;
141a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    }
142a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
143a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Add and widen, then divide by 16 (number of input pixels per output
144a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // pixel) and narrow data from 32 bits per pixel to 16 bpp.
145a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
146a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);
147a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
148a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
149a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // 16 bits to 8 bits per pixel.
150a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));
151a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
152a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Copy all pixels from composite 8x1 vector into output strip.
153a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    vst1_u8(ptr_dst, allpixels);
154a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
155a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    ptr_dst += stride_;
156a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
157a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp}
158a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
159a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
160a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// Hardware accelerated downsampling method for supported devices.
161a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// Requires that image size be a multiple of 16 pixels in each dimension,
162a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// and that downsampling be by a factor of 2 or 4.
163a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harptemplate <>
164a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlowervoid Image<uint8_t>::DownsampleAveragedNeon(const uint8_t* const original,
165a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower                                            const int stride,
166a99712b6229a39d6c6a6e0bb60b8ff6934216017A. Unique TensorFlower                                            const int factor) {
167a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // TODO(andrewharp): stride is a bad approximation for the src image's width.
168a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Better to pass that in directly.
169a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  SCHECK(width_ * factor <= stride, "Uh oh!");
170a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  const int last_starting_index = width_ * factor - 32;
171a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
172a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // We process 32 input pixels lengthwise at a time.
173a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // The output per pass of this loop is an 8 wide by downsampled height tall
174a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // pixel strip.
175a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  int orig_x = 0;
176a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  for (; orig_x <= last_starting_index; orig_x += 32) {
177a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    if (factor == 2) {
178a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      Downsample2x32ColumnsNeon(original, stride, orig_x);
179a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    } else {
180a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      Downsample4x32ColumnsNeon(original, stride, orig_x);
181a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    }
182a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
183a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
184a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // If a last pass is required, push it to the left enough so that it never
185a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // goes out of bounds. This will result in some extra computation on devices
186a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // whose frame widths are multiples of 16 and not 32.
187a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  if (orig_x < last_starting_index + 32) {
188a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    if (factor == 2) {
189a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      Downsample2x32ColumnsNeon(original, stride, last_starting_index);
190a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    } else {
191a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp      Downsample4x32ColumnsNeon(original, stride, last_starting_index);
192a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    }
193a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
194a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp}
195a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
196a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
197a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// Puts the image gradient matrix about a pixel into the 2x2 float array G.
198a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// vals_x should be an array of the window x gradient values, whose indices
199a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// can be in any order but are parallel to the vals_y entries.
200a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
201a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harpvoid CalculateGNeon(const float* const vals_x, const float* const vals_y,
202a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp                    const int num_vals, float* const G) {
203a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  const float32_t* const arm_vals_x = (const float32_t*) vals_x;
204a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  const float32_t* const arm_vals_y = (const float32_t*) vals_y;
205a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
206a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Running sums.
207a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  float32x4_t xx = vdupq_n_f32(0.0f);
208a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  float32x4_t xy = vdupq_n_f32(0.0f);
209a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  float32x4_t yy = vdupq_n_f32(0.0f);
210a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
211a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Maximum index we can load 4 consecutive values from.
212a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // e.g. if there are 81 values, our last full pass can be from index 77:
213a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // 81-4=>77 (77, 78, 79, 80)
214a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  const int max_i = num_vals - 4;
215a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
216a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Defined here because we want to keep track of how many values were
217a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // processed by NEON, so that we can finish off the remainder the normal
218a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // way.
219a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  int i = 0;
220a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
221a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Process values 4 at a time, accumulating the sums of
222a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // the pixel-wise x*x, x*y, and y*y values.
223a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  for (; i <= max_i; i += 4) {
224a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Load xs
225a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    float32x4_t x = vld1q_f32(arm_vals_x + i);
226a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
227a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Multiply x*x and accumulate.
228a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    xx = vmlaq_f32(xx, x, x);
229a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
230a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Load ys
231a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    float32x4_t y = vld1q_f32(arm_vals_y + i);
232a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
233a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Multiply x*y and accumulate.
234a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    xy = vmlaq_f32(xy, x, y);
235a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
236a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    // Multiply y*y and accumulate.
237a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    yy = vmlaq_f32(yy, y, y);
238a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
239a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
240a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  static float32_t xx_vals[4];
241a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  static float32_t xy_vals[4];
242a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  static float32_t yy_vals[4];
243a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
244a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  vst1q_f32(xx_vals, xx);
245a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  vst1q_f32(xy_vals, xy);
246a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  vst1q_f32(yy_vals, yy);
247a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
248a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Accumulated values are store in sets of 4, we have to manually add
249a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // the last bits together.
250a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  for (int j = 0; j < 4; ++j) {
251a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    G[0] += xx_vals[j];
252a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    G[1] += xy_vals[j];
253a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    G[3] += yy_vals[j];
254a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
255a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
256a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // Finishes off last few values (< 4) from above.
257a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  for (; i < num_vals; ++i) {
258a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    G[0] += Square(vals_x[i]);
259a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    G[1] += vals_x[i] * vals_y[i];
260a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp    G[3] += Square(vals_y[i]);
261a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  }
262a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
263a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  // The matrix is symmetric, so this is a given.
264a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp  G[2] = G[1];
265a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp}
266a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
267a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp}  // namespace tf_tracking
268a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp
269a30b9926bd7d5276d6ff35af9428dee3e77b7dcbAndrew Harp#endif
270