1793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler/*M/////////////////////////////////////////////////////////////////////////////////////// 2793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 3793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 5793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// By downloading, copying, installing or using the software you agree to this license. 6793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// If you do not agree to this license, do not download, install, 7793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// copy or use the software. 8793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 9793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 10793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// License Agreement 11793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// For Open Source Computer Vision Library 12793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 13793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Third party copyrights are property of their respective owners. 16793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 17793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Redistribution and use in source and binary forms, with or without modification, 18793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// are permitted provided that the following conditions are met: 19793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 20793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// * Redistribution's of source code must retain the above copyright notice, 21793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// this list of conditions and the following disclaimer. 22793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 23793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// * Redistribution's in binary form must reproduce the above copyright notice, 24793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// this list of conditions and the following disclaimer in the documentation 25793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// and/or other materials provided with the distribution. 26793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 27793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// * The name of the copyright holders may not be used to endorse or promote products 28793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// derived from this software without specific prior written permission. 29793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 30793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// This software is provided by the copyright holders and contributors "as is" and 31793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// any express or implied warranties, including, but not limited to, the implied 32793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// warranties of merchantability and fitness for a particular purpose are disclaimed. 33793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// In no event shall the Intel Corporation or contributors be liable for any direct, 34793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// indirect, incidental, special, exemplary, or consequential damages 35793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// (including, but not limited to, procurement of substitute goods or services; 36793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// loss of use, data, or profits; or business interruption) however caused 37793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// and on any theory of liability, whether in contract, strict liability, 38793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// or tort (including negligence or otherwise) arising in any way out of 39793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// the use of this software, even if advised of the possibility of such damage. 40793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 41793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//M*/ 42793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 43793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if !defined CUDA_DISABLER 44793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 45793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/core/cuda/common.hpp" 46793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/core/cuda/vec_math.hpp" 47793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 48793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslernamespace cv { namespace cuda { namespace device 49793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{ 50793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler namespace match_template 51793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 52793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float sum(float v) { return v; } 53793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; } 54793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; } 55793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } 56793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 57793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float first(float v) { return v; } 58793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float first(float2 v) { return v.x; } 59793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float first(float3 v) { return v.x; } 60793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float first(float4 v) { return v.x; } 61793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 62793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float mul(float a, float b) { return a * b; } 63793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } 64793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } 65793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } 66793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 67793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; } 68793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); } 69793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } 70793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } 71793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 72793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float sub(float a, float b) { return a - b; } 73793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } 74793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } 75793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } 76793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 77793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; } 78793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); } 79793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } 80793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } 81793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 82793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 83793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Naive_CCORR 84793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 85793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <typename T, int cn> 86793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result) 87793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 88793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef typename TypeVec<T, cn>::vec_type Type; 89793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef typename TypeVec<float, cn>::vec_type Typef; 90793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 91793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int x = blockDim.x * blockIdx.x + threadIdx.x; 92793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int y = blockDim.y * blockIdx.y + threadIdx.y; 93793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 94793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 95793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 96793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Typef res = VecTraits<Typef>::all(0); 97793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 98793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (int i = 0; i < h; ++i) 99793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 100793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const Type* image_ptr = (const Type*)image.ptr(y + i); 101793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const Type* templ_ptr = (const Type*)templ.ptr(i); 102793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (int j = 0; j < w; ++j) 103793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler res = res + mul(image_ptr[x + j], templ_ptr[j]); 104793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 105793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 106793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = sum(res); 107793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 108793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 109793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 110793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <typename T, int cn> 111793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream) 112793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 113793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 threads(32, 8); 114793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 115793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 116793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result); 117793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 118793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 119793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 120793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 121793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 122793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 123793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) 124793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 125793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); 126793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 127793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler static const caller_t callers[] = 128793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 129793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4> 130793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler }; 131793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 132793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler callers[cn](image, templ, result, stream); 133793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 134793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 135793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 136793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) 137793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 138793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); 139793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 140793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler static const caller_t callers[] = 141793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 142793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4> 143793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler }; 144793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 145793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler callers[cn](image, templ, result, stream); 146793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 147793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 148793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 149793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Naive_SQDIFF 150793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 151793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <typename T, int cn> 152793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result) 153793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 154793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef typename TypeVec<T, cn>::vec_type Type; 155793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef typename TypeVec<float, cn>::vec_type Typef; 156793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 157793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int x = blockDim.x * blockIdx.x + threadIdx.x; 158793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int y = blockDim.y * blockIdx.y + threadIdx.y; 159793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 160793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 161793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 162793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Typef res = VecTraits<Typef>::all(0); 163793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Typef delta; 164793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 165793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (int i = 0; i < h; ++i) 166793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 167793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const Type* image_ptr = (const Type*)image.ptr(y + i); 168793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const Type* templ_ptr = (const Type*)templ.ptr(i); 169793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (int j = 0; j < w; ++j) 170793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 171793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler delta = sub(image_ptr[x + j], templ_ptr[j]); 172793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler res = res + delta * delta; 173793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 174793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 175793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 176793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = sum(res); 177793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 178793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 179793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 180793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <typename T, int cn> 181793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream) 182793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 183793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 threads(32, 8); 184793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 185793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 186793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result); 187793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 188793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 189793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 190793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 191793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 192793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 193793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) 194793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 195793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); 196793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 197793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler static const caller_t callers[] = 198793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 199793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4> 200793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler }; 201793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 202793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler callers[cn](image, templ, result, stream); 203793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 204793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 205793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) 206793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 207793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); 208793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 209793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler static const caller_t callers[] = 210793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 211793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4> 212793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler }; 213793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 214793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler callers[cn](image, templ, result, stream); 215793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 216793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 217793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 218793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Prepared_SQDIFF 219793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 220793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <int cn> 221793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result) 222793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 223793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 224793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 225793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 226793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 227793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 228793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_ = (float)( 229793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) - 230793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn])); 231793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 232793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum; 233793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 234793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 235793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 236793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <int cn> 237793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream) 238793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 239793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 threads(32, 8); 240793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 241793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 242793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); 243793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 244793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 245793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 246793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 247793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 248793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 249793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn, 250793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaStream_t stream) 251793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 252793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream); 253793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 254793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler static const caller_t callers[] = 255793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 256793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4> 257793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler }; 258793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 259793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler callers[cn](w, h, image_sqsum, templ_sqsum, result, stream); 260793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 261793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 262793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 263793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Prepared_SQDIFF_NORMED 264793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 265793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // normAcc* are accurate normalization routines which make CUDA matchTemplate 266793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // consistent with CPU one 267793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 268793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ float normAcc(float num, float denum) 269793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 270793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (::fabs(num) < denum) 271793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return num / denum; 272793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (::fabs(num) < denum * 1.125f) 273793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return num > 0 ? 1 : -1; 274793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return 0; 275793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 276793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 277793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 278793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __device__ float normAcc_SQDIFF(float num, float denum) 279793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 280793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (::fabs(num) < denum) 281793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return num / denum; 282793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (::fabs(num) < denum * 1.125f) 283793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return num > 0 ? 1 : -1; 284793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return 1; 285793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 286793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 287793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 288793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <int cn> 289793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( 290793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, const PtrStep<double> image_sqsum, 291793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler double templ_sqsum, PtrStepSzf result) 292793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 293793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 294793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 295793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 296793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 297793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 298793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_ = (float)( 299793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) - 300793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn])); 301793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 302793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum, 303793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler sqrtf(image_sqsum_ * templ_sqsum)); 304793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 305793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 306793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 307793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <int cn> 308793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, 309793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 310793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 311793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 threads(32, 8); 312793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 313793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 314793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); 315793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 316793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 317793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 318793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 319793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 320793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 321793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 322793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, 323793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, int cn, cudaStream_t stream) 324793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 325793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream); 326793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler static const caller_t callers[] = 327793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 328793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4> 329793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler }; 330793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 331793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler callers[cn](w, h, image_sqsum, templ_sqsum, result, stream); 332793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 333793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 334793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 335793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Prepared_CCOFF 336793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 337793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result) 338793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 339793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 340793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 341793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 342793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 343793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 344793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_ = (float)( 345793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) - 346793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x])); 347793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 348793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale; 349793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 350793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 351793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 352793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream) 353793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 354793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 355793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 356793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 357793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result); 358793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 359793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 360793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 361793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 362793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 363793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 364793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 365793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 366793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_8UC2( 367793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, float templ_sum_scale_r, float templ_sum_scale_g, 368793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_r, 369793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_g, 370793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 371793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 372793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 373793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 374793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 375793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 376793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 377793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_r_ = (float)( 378793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) - 379793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x])); 380793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_g_ = (float)( 381793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) - 382793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x])); 383793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 384793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r 385793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_g_ * templ_sum_scale_g; 386793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 387793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 388793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 389793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_8UC2( 390793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 391793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_r, 392793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_g, 393793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_r, int templ_sum_g, 394793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 395793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 396793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 397793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 398793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 399793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>( 400793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h), 401793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_r, image_sum_g, result); 402793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 403793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 404793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 405793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 406793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 407793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 408793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 409793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 410793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_8UC3( 411793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 412793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r, 413793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_g, 414793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_b, 415793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_r, 416793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_g, 417793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_b, 418793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 419793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 420793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 421793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 422793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 423793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 424793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 425793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_r_ = (float)( 426793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) - 427793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x])); 428793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_g_ = (float)( 429793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) - 430793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x])); 431793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_b_ = (float)( 432793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) - 433793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x])); 434793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 435793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r 436793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_g_ * templ_sum_scale_g 437793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_b_ * templ_sum_scale_b; 438793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 439793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 440793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 441793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_8UC3( 442793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 443793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_r, 444793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_g, 445793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_b, 446793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_r, 447793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_g, 448793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_b, 449793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 450793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 451793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 452793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 453793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 454793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>( 455793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, 456793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_r / (w * h), 457793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_g / (w * h), 458793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_b / (w * h), 459793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_r, image_sum_g, image_sum_b, result); 460793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 461793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 462793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 463793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 464793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 465793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 466793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 467793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 468793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_8UC4( 469793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 470793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r, 471793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_g, 472793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_b, 473793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_a, 474793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_r, 475793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_g, 476793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_b, 477793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_a, 478793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 479793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 480793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 481793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 482793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 483793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 484793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 485793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_r_ = (float)( 486793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) - 487793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x])); 488793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_g_ = (float)( 489793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) - 490793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x])); 491793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_b_ = (float)( 492793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) - 493793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x])); 494793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_a_ = (float)( 495793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) - 496793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x])); 497793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 498793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r 499793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_g_ * templ_sum_scale_g 500793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_b_ * templ_sum_scale_b 501793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_a_ * templ_sum_scale_a; 502793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 503793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 504793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 505793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_8UC4( 506793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 507793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_r, 508793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_g, 509793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_b, 510793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_a, 511793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_r, 512793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_g, 513793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_b, 514793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_a, 515793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 516793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 517793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 518793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 519793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 520793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>( 521793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, 522793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_r / (w * h), 523793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_g / (w * h), 524793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_b / (w * h), 525793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (float)templ_sum_a / (w * h), 526793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_r, image_sum_g, image_sum_b, image_sum_a, 527793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result); 528793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 529793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 530793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 531793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 532793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 533793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 534793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 535793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Prepared_CCOFF_NORMED 536793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 537793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( 538793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, float weight, 539793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale, float templ_sqsum_scale, 540793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum, 541793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<double> image_sqsum, 542793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 543793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 544793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 545793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 546793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 547793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 548793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 549793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float ccorr = result.ptr(y)[x]; 550793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_ = (float)( 551793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) - 552793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x])); 553793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_ = (float)( 554793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) - 555793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x])); 556793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale, 557793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_))); 558793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 559793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 560793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 561793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_NORMED_8U( 562793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, const PtrStepSz<int> image_sum, 563793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<double> image_sqsum, 564793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum, double templ_sqsum, 565793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 566793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 567793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 568793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 569793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 570793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float weight = 1.f / (w * h); 571793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale = templ_sum * weight; 572793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum; 573793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 574793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>( 575793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, weight, templ_sum_scale, templ_sqsum_scale, 576793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum, image_sqsum, result); 577793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 578793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 579793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 580793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 581793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 582793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 583793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 584793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 585793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2( 586793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, float weight, 587793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r, float templ_sum_scale_g, 588793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sqsum_scale, 589793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r, 590793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g, 591793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 592793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 593793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 594793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 595793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 596793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 597793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 598793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_r_ = (float)( 599793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) - 600793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x])); 601793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_r_ = (float)( 602793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) - 603793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x])); 604793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_g_ = (float)( 605793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) - 606793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x])); 607793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_g_ = (float)( 608793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) - 609793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x])); 610793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 611793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r 612793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_g_ * templ_sum_scale_g; 613793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_ 614793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_)); 615793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = normAcc(num, denum); 616793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 617793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 618793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 619793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_NORMED_8UC2( 620793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 621793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r, 622793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g, 623793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_r, double templ_sqsum_r, 624793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_g, double templ_sqsum_g, 625793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 626793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 627793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 628793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 629793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 630793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float weight = 1.f / (w * h); 631793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r = templ_sum_r * weight; 632793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_g = templ_sum_g * weight; 633793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r 634793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + templ_sqsum_g - weight * templ_sum_g * templ_sum_g; 635793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 636793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>( 637793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, weight, 638793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler templ_sum_scale_r, templ_sum_scale_g, 639793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler templ_sqsum_scale, 640793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_r, image_sqsum_r, 641793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_g, image_sqsum_g, 642793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result); 643793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 644793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 645793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 646793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 647793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 648793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 649793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 650793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 651793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3( 652793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, float weight, 653793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, 654793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sqsum_scale, 655793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r, 656793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g, 657793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b, 658793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 659793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 660793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 661793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 662793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 663793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 664793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 665793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_r_ = (float)( 666793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) - 667793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x])); 668793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_r_ = (float)( 669793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) - 670793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x])); 671793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_g_ = (float)( 672793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) - 673793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x])); 674793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_g_ = (float)( 675793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) - 676793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x])); 677793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_b_ = (float)( 678793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) - 679793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x])); 680793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_b_ = (float)( 681793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) - 682793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x])); 683793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 684793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r 685793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_g_ * templ_sum_scale_g 686793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_b_ * templ_sum_scale_b; 687793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_ 688793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_ 689793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_)); 690793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = normAcc(num, denum); 691793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 692793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 693793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 694793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_NORMED_8UC3( 695793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 696793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r, 697793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g, 698793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b, 699793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_r, double templ_sqsum_r, 700793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_g, double templ_sqsum_g, 701793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_b, double templ_sqsum_b, 702793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 703793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 704793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 705793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 706793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 707793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float weight = 1.f / (w * h); 708793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r = templ_sum_r * weight; 709793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_g = templ_sum_g * weight; 710793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_b = templ_sum_b * weight; 711793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r 712793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + templ_sqsum_g - weight * templ_sum_g * templ_sum_g 713793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + templ_sqsum_b - weight * templ_sum_b * templ_sum_b; 714793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 715793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>( 716793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, weight, 717793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, 718793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler templ_sqsum_scale, 719793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_r, image_sqsum_r, 720793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_g, image_sqsum_g, 721793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_b, image_sqsum_b, 722793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result); 723793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 724793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 725793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 726793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 727793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 728793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 729793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 730793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 731793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4( 732793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, float weight, 733793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, 734793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_a, float templ_sqsum_scale, 735793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r, 736793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g, 737793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b, 738793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a, 739793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result) 740793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 741793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 742793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 743793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 744793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 745793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 746793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_r_ = (float)( 747793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) - 748793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x])); 749793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_r_ = (float)( 750793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) - 751793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x])); 752793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_g_ = (float)( 753793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) - 754793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x])); 755793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_g_ = (float)( 756793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) - 757793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x])); 758793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_b_ = (float)( 759793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) - 760793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x])); 761793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_b_ = (float)( 762793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) - 763793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x])); 764793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sum_a_ = (float)( 765793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) - 766793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x])); 767793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_a_ = (float)( 768793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) - 769793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x])); 770793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 771793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g 772793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a; 773793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_ 774793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_ 775793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_ 776793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_)); 777793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = normAcc(num, denum); 778793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 779793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 780793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 781793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void matchTemplatePrepared_CCOFF_NORMED_8UC4( 782793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, 783793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r, 784793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g, 785793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b, 786793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a, 787793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_r, double templ_sqsum_r, 788793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_g, double templ_sqsum_g, 789793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_b, double templ_sqsum_b, 790793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int templ_sum_a, double templ_sqsum_a, 791793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler PtrStepSzf result, cudaStream_t stream) 792793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 793793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 794793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 795793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 796793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float weight = 1.f / (w * h); 797793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_r = templ_sum_r * weight; 798793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_g = templ_sum_g * weight; 799793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_b = templ_sum_b * weight; 800793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sum_scale_a = templ_sum_a * weight; 801793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r 802793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + templ_sqsum_g - weight * templ_sum_g * templ_sum_g 803793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + templ_sqsum_b - weight * templ_sum_b * templ_sum_b 804793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler + templ_sqsum_a - weight * templ_sum_a * templ_sum_a; 805793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 806793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>( 807793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler w, h, weight, 808793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a, 809793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler templ_sqsum_scale, 810793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_r, image_sqsum_r, 811793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_g, image_sqsum_g, 812793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_b, image_sqsum_b, 813793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler image_sum_a, image_sqsum_a, 814793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result); 815793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 816793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 817793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 818793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 819793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 820793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 821793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 822793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // normalize 823793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 824793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <int cn> 825793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void normalizeKernel_8U( 826793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int w, int h, const PtrStep<double> image_sqsum, 827793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler double templ_sqsum, PtrStepSzf result) 828793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 829793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int x = blockIdx.x * blockDim.x + threadIdx.x; 830793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int y = blockIdx.y * blockDim.y + threadIdx.y; 831793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 832793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 833793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 834793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler float image_sqsum_ = (float)( 835793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) - 836793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn])); 837793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum)); 838793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 839793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 840793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 841793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum, 842793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream) 843793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 844793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 845793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 846793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 847793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler switch (cn) 848793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 849793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 1: 850793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); 851793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 852793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 2: 853793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); 854793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 855793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 3: 856793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); 857793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 858793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 4: 859793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); 860793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 861793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 862793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 863793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 864793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 865793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 866793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 867793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 868793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 869793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler ////////////////////////////////////////////////////////////////////// 870793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // extractFirstChannel 871793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 872793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler template <int cn> 873793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result) 874793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 875793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler typedef typename TypeVec<float, cn>::vec_type Typef; 876793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 877793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int x = blockDim.x * blockIdx.x + threadIdx.x; 878793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int y = blockDim.y * blockIdx.y + threadIdx.y; 879793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 880793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (x < result.cols && y < result.rows) 881793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 882793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Typef val = ((const Typef*)image.ptr(y))[x]; 883793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler result.ptr(y)[x] = first(val); 884793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 885793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 886793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 887793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream) 888793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 889793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 threads(32, 8); 890793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); 891793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 892793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler switch (cn) 893793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 894793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 1: 895793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result); 896793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 897793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 2: 898793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result); 899793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 900793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 3: 901793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result); 902793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 903793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler case 4: 904793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result); 905793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler break; 906793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 907793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaGetLastError() ); 908793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 909793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (stream == 0) 910793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cudaSafeCall( cudaDeviceSynchronize() ); 911793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 912793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } //namespace match_template 913793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler}}} // namespace cv { namespace cuda { namespace cudev 914793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 915793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 916793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif /* CUDA_DISABLER */ 917