/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// NEON implementations of Image methods for compatible devices. Control
// should never enter this compilation unit on incompatible devices.
#ifdef __ARM_NEON

#include <arm_neon.h>

#include <stdint.h>

#include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
#include "tensorflow/examples/android/jni/object_tracking/image.h"
#include "tensorflow/examples/android/jni/object_tracking/image_utils.h"
#include "tensorflow/examples/android/jni/object_tracking/utils.h"

namespace tf_tracking {

namespace {

// Number of source columns consumed by one call to either of the
// Downsample*x32ColumnsNeon kernels (two 16-byte loads per source row).
const int kSourceColumnsPerPass = 32;

// Scalar box-filter used when the source is too narrow for even a single
// 32-column NEON pass. Each destination pixel is the truncated mean of the
// corresponding factor x factor block of source pixels, which is the same
// value the NEON kernels produce for factors of 2 and 4.
//
//   src         top-left source pixel; source rows are src_stride bytes apart.
//   dst         top-left destination pixel; rows are dst_stride bytes apart.
//   dst_width   number of destination pixels per row.
//   dst_height  number of destination rows.
//   factor      downsampling factor applied in both dimensions.
void DownsampleAveragedScalar(const uint8_t* const src, const int src_stride,
                              uint8_t* const dst, const int dst_stride,
                              const int dst_width, const int dst_height,
                              const int factor) {
  const int pixels_per_block = factor * factor;

  for (int y = 0; y < dst_height; ++y) {
    const uint8_t* const block_rows = src + y * factor * src_stride;
    uint8_t* const dst_row = dst + y * dst_stride;

    for (int x = 0; x < dst_width; ++x) {
      int sum = 0;
      for (int r = 0; r < factor; ++r) {
        const uint8_t* const src_pixels =
            block_rows + r * src_stride + x * factor;
        for (int c = 0; c < factor; ++c) {
          sum += src_pixels[c];
        }
      }
      dst_row[x] = static_cast<uint8_t>(sum / pixels_per_block);
    }
  }
}

}  // namespace

// Downsamples a 32-column wide vertical strip of the source by a factor of 2,
// writing a 16-column wide strip of this image. Runs over all height_
// destination rows.
//
//   original  top-left pixel of the source image.
//   stride    distance in bytes between consecutive source rows.
//   orig_x    source column at which the strip starts.
template <>
void Image<uint8_t>::Downsample2x32ColumnsNeon(const uint8_t* const original,
                                               const int stride,
                                               const int orig_x) {
  // Divide input x offset by 2 to find output offset.
  const int new_x = orig_x >> 1;

  // Walks down the source rows of this strip, starting at the top row.
  const uint8_t* offset = original + orig_x;

  // Leftmost of the 16 horizontally adjacent destination pixels written per
  // output row.
  uint8_t* ptr_dst = (*this)[0] + new_x;

  // Sum along vertical columns.
  // Process 32x2 input pixels and 16x1 output pixels per iteration.
  for (int new_y = 0; new_y < height_; ++new_y) {
    uint16x8_t accum1 = vdupq_n_u16(0);
    uint16x8_t accum2 = vdupq_n_u16(0);

    // Go top to bottom across the two rows of input pixels that make up
    // this output row.
    for (int row_num = 0; row_num < 2; ++row_num) {
      // Load each 16-byte half of the row, then pairwise add and accumulate
      // into the 16-bit lanes (16 bit to account for values above 255).
      accum1 = vpadalq_u8(accum1, vld1q_u8(offset));
      accum2 = vpadalq_u8(accum2, vld1q_u8(offset + 16));

      // Move offset down one row.
      offset += stride;
    }

    // Divide by 4 (number of input pixels per output pixel) and narrow data
    // from 16 bits per pixel to 8 bpp.
    const uint8x8_t tmp_pix1 = vqshrn_n_u16(accum1, 2);
    const uint8x8_t tmp_pix2 = vqshrn_n_u16(accum2, 2);

    // Concatenate the two 8x1 pixel strips and store the 16x1 result.
    vst1q_u8(ptr_dst, vcombine_u8(tmp_pix1, tmp_pix2));

    ptr_dst += stride_;
  }
}

// Downsamples a 32-column wide vertical strip of the source by a factor of 4,
// writing an 8-column wide strip of this image. Runs over all height_
// destination rows.
//
//   original  top-left pixel of the source image.
//   stride    distance in bytes between consecutive source rows.
//   orig_x    source column at which the strip starts.
template <>
void Image<uint8_t>::Downsample4x32ColumnsNeon(const uint8_t* const original,
                                               const int stride,
                                               const int orig_x) {
  // Divide input x offset by 4 to find output offset.
  const int new_x = orig_x >> 2;

  // Walks down the source rows of this strip, starting at the top row.
  const uint8_t* offset = original + orig_x;

  // Leftmost of the 8 horizontally adjacent destination pixels written per
  // output row.
  uint8_t* ptr_dst = (*this)[0] + new_x;

  // Sum along vertical columns.
  // Process 32x4 input pixels and 8x1 output pixels per iteration.
  for (int new_y = 0; new_y < height_; ++new_y) {
    uint16x8_t accum1 = vdupq_n_u16(0);
    uint16x8_t accum2 = vdupq_n_u16(0);

    // Go top to bottom across the four rows of input pixels that make up
    // this output row.
    for (int row_num = 0; row_num < 4; ++row_num) {
      // Load each 16-byte half of the row, then pairwise add and accumulate
      // into the 16-bit lanes (16 bit to account for values above 255).
      accum1 = vpadalq_u8(accum1, vld1q_u8(offset));
      accum2 = vpadalq_u8(accum2, vld1q_u8(offset + 16));

      // Move offset down one row.
      offset += stride;
    }

    // Add and widen, then divide by 16 (number of input pixels per output
    // pixel) and narrow data from 32 bits per pixel to 16 bpp.
    const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
    const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);

    // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
    // 16 bits to 8 bits per pixel.
    const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));

    // Copy all pixels from composite 8x1 vector into output strip.
    vst1_u8(ptr_dst, allpixels);

    ptr_dst += stride_;
  }
}

// Hardware accelerated downsampling method for supported devices.
// Requires that image size be a multiple of 16 pixels in each dimension,
// and that downsampling be by a factor of 2 or 4.
//
//   original  top-left pixel of the source image.
//   stride    distance in bytes between consecutive source rows.
//   factor    downsampling factor (2 or 4) applied in both dimensions.
//
// The source region read is width_ * factor columns by height_ * factor rows.
// Sources narrower than 32 columns cannot be covered by a NEON pass and are
// handled by a scalar fallback instead.
template <>
void Image<uint8_t>::DownsampleAveragedNeon(const uint8_t* const original,
                                            const int stride,
                                            const int factor) {
  // Any factor other than 2 is dispatched to the 4x kernel below, so make the
  // documented precondition explicit.
  SCHECK(factor == 2 || factor == 4, "Unsupported NEON downsample factor!");

  // TODO(andrewharp): stride is a bad approximation for the src image's width.
  // Better to pass that in directly.
  SCHECK(width_ * factor <= stride, "Uh oh!");

  // Nothing to produce for an empty destination.
  if (width_ <= 0 || height_ <= 0) {
    return;
  }

  const int src_width = width_ * factor;

  // A single pass consumes 32 source columns. For narrower sources the
  // "pushed left" final pass below would start at a negative column and
  // access memory before both buffers, so average those in scalar code.
  if (src_width < kSourceColumnsPerPass) {
    DownsampleAveragedScalar(original, stride, (*this)[0], stride_, width_,
                             height_, factor);
    return;
  }

  const int last_starting_index = src_width - kSourceColumnsPerPass;

  // We process 32 input pixels lengthwise at a time.
  // The output per pass of this loop is a 16 (factor 2) or 8 (factor 4) wide
  // by downsampled height tall pixel strip.
  int orig_x = 0;
  for (; orig_x <= last_starting_index; orig_x += kSourceColumnsPerPass) {
    if (factor == 2) {
      Downsample2x32ColumnsNeon(original, stride, orig_x);
    } else {
      Downsample4x32ColumnsNeon(original, stride, orig_x);
    }
  }

  // If a last pass is required, push it to the left enough so that it never
  // goes out of bounds. This will result in some extra computation on devices
  // whose frame widths are multiples of 16 and not 32.
  if (orig_x < src_width) {
    if (factor == 2) {
      Downsample2x32ColumnsNeon(original, stride, last_starting_index);
    } else {
      Downsample4x32ColumnsNeon(original, stride, last_starting_index);
    }
  }
}

// Puts the image gradient matrix about a pixel into the 2x2 float array G.
// vals_x should be an array of the window x gradient values, whose indices
// can be in any order but are parallel to the vals_y entries.
// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
//
//   vals_x    num_vals x-gradient values.
//   vals_y    num_vals y-gradient values, parallel to vals_x.
//   num_vals  number of entries in each array.
//   G         4 floats, laid out as {xx, xy, xy, yy}.
//
// NOTE: the sums are added onto whatever G[0], G[1] and G[3] already hold, so
// the caller is responsible for their initial values. G[2] is overwritten.
void CalculateGNeon(const float* const vals_x, const float* const vals_y,
                    const int num_vals, float* const G) {
  // Running sums, four lanes each.
  float32x4_t xx = vdupq_n_f32(0.0f);
  float32x4_t xy = vdupq_n_f32(0.0f);
  float32x4_t yy = vdupq_n_f32(0.0f);

  // Maximum index we can load 4 consecutive values from.
  // e.g. if there are 81 values, our last full pass can be from index 77:
  // 81-4=>77 (77, 78, 79, 80)
  const int max_i = num_vals - 4;

  // Defined here because we want to keep track of how many values were
  // processed by NEON, so that we can finish off the remainder the normal
  // way.
  int i = 0;

  // Process values 4 at a time, accumulating the sums of
  // the pixel-wise x*x, x*y, and y*y values.
  for (; i <= max_i; i += 4) {
    const float32x4_t x = vld1q_f32(vals_x + i);
    const float32x4_t y = vld1q_f32(vals_y + i);

    xx = vmlaq_f32(xx, x, x);
    xy = vmlaq_f32(xy, x, y);
    yy = vmlaq_f32(yy, y, y);
  }

  // Scratch space for spilling the vector lanes. These are deliberately
  // automatic (not static) so that concurrent calls do not share them.
  float32_t xx_vals[4];
  float32_t xy_vals[4];
  float32_t yy_vals[4];

  vst1q_f32(xx_vals, xx);
  vst1q_f32(xy_vals, xy);
  vst1q_f32(yy_vals, yy);

  // Accumulated values are stored in sets of 4, we have to manually add
  // the last bits together.
  for (int j = 0; j < 4; ++j) {
    G[0] += xx_vals[j];
    G[1] += xy_vals[j];
    G[3] += yy_vals[j];
  }

  // Finishes off last few values (< 4) from above.
  for (; i < num_vals; ++i) {
    G[0] += Square(vals_x[i]);
    G[1] += vals_x[i] * vals_y[i];
    G[3] += Square(vals_y[i]);
  }

  // The matrix is symmetric, so this is a given.
  G[2] = G[1];
}

}  // namespace tf_tracking

#endif  // __ARM_NEON