1793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler/*M///////////////////////////////////////////////////////////////////////////////////////
2793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
3793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
5793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//  By downloading, copying, installing or using the software you agree to this license.
6793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//  If you do not agree to this license, do not download, install,
7793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//  copy or use the software.
8793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
9793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
10793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//                           License Agreement
11793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//                For Open Source Computer Vision Library
12793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
13793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Third party copyrights are property of their respective owners.
16793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
17793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Redistribution and use in source and binary forms, with or without modification,
18793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// are permitted provided that the following conditions are met:
19793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
20793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//   * Redistribution's of source code must retain the above copyright notice,
21793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//     this list of conditions and the following disclaimer.
22793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
23793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//   * Redistribution's in binary form must reproduce the above copyright notice,
24793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//     this list of conditions and the following disclaimer in the documentation
25793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//     and/or other materials provided with the distribution.
26793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
27793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//   * The name of the copyright holders may not be used to endorse or promote products
28793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//     derived from this software without specific prior written permission.
29793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
30793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// This software is provided by the copyright holders and contributors "as is" and
31793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// any express or implied warranties, including, but not limited to, the implied
32793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// warranties of merchantability and fitness for a particular purpose are disclaimed.
33793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// In no event shall the Intel Corporation or contributors be liable for any direct,
34793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// indirect, incidental, special, exemplary, or consequential damages
35793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// (including, but not limited to, procurement of substitute goods or services;
36793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// loss of use, data, or profits; or business interruption) however caused
37793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// and on any theory of liability, whether in contract, strict liability,
38793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// or tort (including negligence or otherwise) arising in any way out of
39793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// the use of this software, even if advised of the possibility of such damage.
40793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//
41793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//M*/
42793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
43793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if !defined CUDA_DISABLER
44793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
45793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/core/cuda/common.hpp"
46793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/core/cuda/vec_math.hpp"
47793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
48793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslernamespace cv { namespace cuda { namespace device
49793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{
50793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    namespace match_template
51793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    {
52793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float sum(float v) { return v; }
53793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
54793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
55793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
56793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
57793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float first(float v) { return v; }
58793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float first(float2 v) { return v.x; }
59793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float first(float3 v) { return v.x; }
60793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float first(float4 v) { return v.x; }
61793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
62793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
63793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
64793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
65793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
66793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
67793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
68793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
69793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
70793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
71793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
72793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
73793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
74793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
75793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
76793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
77793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
78793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
79793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
80793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
81793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
82793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
83793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // Naive_CCORR
84793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
85793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <typename T, int cn>
86793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
87793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
88793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef typename TypeVec<T, cn>::vec_type Type;
89793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef typename TypeVec<float, cn>::vec_type Typef;
90793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
91793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            int x = blockDim.x * blockIdx.x + threadIdx.x;
92793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            int y = blockDim.y * blockIdx.y + threadIdx.y;
93793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
94793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
95793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
96793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                Typef res = VecTraits<Typef>::all(0);
97793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
98793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                for (int i = 0; i < h; ++i)
99793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                {
100793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const Type* image_ptr = (const Type*)image.ptr(y + i);
101793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const Type* templ_ptr = (const Type*)templ.ptr(i);
102793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    for (int j = 0; j < w; ++j)
103793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
104793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                }
105793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
106793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = sum(res);
107793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
108793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
109793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
110793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <typename T, int cn>
111793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
112793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
113793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 threads(32, 8);
114793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
115793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
116793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
117793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
118793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
119793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
120793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
121793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
122793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
123793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
124793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
125793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
126793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
127793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            static const caller_t callers[] =
128793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
129793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
130793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            };
131793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
132793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            callers[cn](image, templ, result, stream);
133793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
134793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
135793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
136793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
137793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
138793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
139793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
140793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            static const caller_t callers[] =
141793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
142793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
143793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            };
144793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
145793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            callers[cn](image, templ, result, stream);
146793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
147793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
148793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
149793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // Naive_SQDIFF
150793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
151793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <typename T, int cn>
152793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
153793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
154793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef typename TypeVec<T, cn>::vec_type Type;
155793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef typename TypeVec<float, cn>::vec_type Typef;
156793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
157793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            int x = blockDim.x * blockIdx.x + threadIdx.x;
158793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            int y = blockDim.y * blockIdx.y + threadIdx.y;
159793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
160793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
161793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
162793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                Typef res = VecTraits<Typef>::all(0);
163793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                Typef delta;
164793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
165793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                for (int i = 0; i < h; ++i)
166793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                {
167793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const Type* image_ptr = (const Type*)image.ptr(y + i);
168793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const Type* templ_ptr = (const Type*)templ.ptr(i);
169793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    for (int j = 0; j < w; ++j)
170793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    {
171793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        delta = sub(image_ptr[x + j], templ_ptr[j]);
172793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        res = res + delta * delta;
173793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    }
174793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                }
175793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
176793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = sum(res);
177793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
178793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
179793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
180793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <typename T, int cn>
181793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
182793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
183793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 threads(32, 8);
184793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
185793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
186793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
187793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
188793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
189793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
190793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
191793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
192793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
193793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
194793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
195793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
196793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
197793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            static const caller_t callers[] =
198793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
199793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
200793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            };
201793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
202793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            callers[cn](image, templ, result, stream);
203793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
204793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
205793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
206793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
207793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
208793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
209793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            static const caller_t callers[] =
210793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
211793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
212793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            };
213793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
214793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            callers[cn](image, templ, result, stream);
215793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
216793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
217793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
218793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // Prepared_SQDIFF
219793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
220793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <int cn>
221793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result)
222793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
223793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
224793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
225793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
226793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
227793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
228793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_ = (float)(
229793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
230793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
231793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
232793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
233793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
234793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
235793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
236793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <int cn>
237793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream)
238793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
239793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 threads(32, 8);
240793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
241793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
242793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
243793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
244793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
245793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
246793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
247793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
248793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
249793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn,
250793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                             cudaStream_t stream)
251793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
252793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
253793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
254793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            static const caller_t callers[] =
255793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
256793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
257793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            };
258793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
259793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
260793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
261793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
262793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
263793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // Prepared_SQDIFF_NORMED
264793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
265793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // normAcc* are accurate normalization routines which make CUDA matchTemplate
266793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // consistent with CPU one
267793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
268793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ float normAcc(float num, float denum)
269793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
270793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (::fabs(num) < denum)
271793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                return num / denum;
272793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (::fabs(num) < denum * 1.125f)
273793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                return num > 0 ? 1 : -1;
274793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            return 0;
275793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
276793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
277793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
278793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __device__ float normAcc_SQDIFF(float num, float denum)
279793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
280793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (::fabs(num) < denum)
281793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                return num / denum;
282793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (::fabs(num) < denum * 1.125f)
283793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                return num > 0 ? 1 : -1;
284793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            return 1;
285793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
286793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
287793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
288793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <int cn>
289793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
290793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, const PtrStep<double> image_sqsum,
291793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                double templ_sqsum, PtrStepSzf result)
292793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
293793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
294793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
295793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
296793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
297793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
298793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_ = (float)(
299793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
300793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
301793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
302793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
303793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                  sqrtf(image_sqsum_ * templ_sqsum));
304793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
305793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
306793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
307793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <int cn>
308793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
309793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                    PtrStepSzf result, cudaStream_t stream)
310793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
311793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 threads(32, 8);
312793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
313793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
314793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
315793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
316793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
317793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
318793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
319793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
320793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
321793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
322793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
323793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                    PtrStepSzf result, int cn, cudaStream_t stream)
324793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
325793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
326793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            static const caller_t callers[] =
327793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
328793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
329793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            };
330793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
331793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
332793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
333793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
334793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
335793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // Prepared_CCOFF
336793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
337793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result)
338793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
339793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
340793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
341793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
342793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
343793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
344793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_ = (float)(
345793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
346793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
347793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
348793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
349793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
350793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
351793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
352793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream)
353793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
354793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
355793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
356793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
357793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
358793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
359793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
360793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
361793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
362793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
363793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
364793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
365793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
366793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
367793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
368793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_r,
369793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_g,
370793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
371793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
372793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
373793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
374793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
375793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
376793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
377793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_r_ = (float)(
378793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
379793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
380793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_g_ = (float)(
381793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
382793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
383793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
384793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
385793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                         - image_sum_g_ * templ_sum_scale_g;
386793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
387793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
388793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
389793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_8UC2(
390793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h,
391793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_r,
392793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_g,
393793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_r, int templ_sum_g,
394793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result, cudaStream_t stream)
395793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
396793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
397793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
398793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
399793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
400793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
401793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_r, image_sum_g, result);
402793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
403793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
404793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
405793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
406793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
407793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
408793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
409793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
410793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
411793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h,
412793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_r,
413793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_g,
414793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_b,
415793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_r,
416793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_g,
417793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_b,
418793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
419793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
420793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
421793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
422793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
423793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
424793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
425793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_r_ = (float)(
426793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
427793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
428793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_g_ = (float)(
429793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
430793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
431793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_b_ = (float)(
432793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
433793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
434793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
435793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
436793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                         - image_sum_g_ * templ_sum_scale_g
437793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                         - image_sum_b_ * templ_sum_scale_b;
438793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
439793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
440793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
441793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_8UC3(
442793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h,
443793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_r,
444793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_g,
445793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_b,
446793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_r,
447793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_g,
448793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_b,
449793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result, cudaStream_t stream)
450793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
451793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
452793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
453793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
454793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
455793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h,
456793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_r / (w * h),
457793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_g / (w * h),
458793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_b / (w * h),
459793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_r, image_sum_g, image_sum_b, result);
460793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
461793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
462793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
463793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
464793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
465793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
466793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
467793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
468793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
469793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h,
470793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_r,
471793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_g,
472793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_b,
473793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_a,
474793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_r,
475793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_g,
476793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_b,
477793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_a,
478793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
479793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
480793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
481793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
482793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
483793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
484793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
485793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_r_ = (float)(
486793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
487793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
488793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_g_ = (float)(
489793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
490793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
491793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_b_ = (float)(
492793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
493793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
494793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_a_ = (float)(
495793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
496793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
497793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
498793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
499793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                         - image_sum_g_ * templ_sum_scale_g
500793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                         - image_sum_b_ * templ_sum_scale_b
501793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                         - image_sum_a_ * templ_sum_scale_a;
502793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
503793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
504793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
505793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_8UC4(
506793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h,
507793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_r,
508793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_g,
509793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_b,
510793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStepSz<int> image_sum_a,
511793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_r,
512793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_g,
513793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_b,
514793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int templ_sum_a,
515793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result, cudaStream_t stream)
516793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
517793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
518793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
519793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
520793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
521793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h,
522793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_r / (w * h),
523793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_g / (w * h),
524793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_b / (w * h),
525793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    (float)templ_sum_a / (w * h),
526793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
527793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    result);
528793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
529793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
530793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
531793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
532793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
533793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
534793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
535793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // Prepared_CCOFF_NORMED
536793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
537793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
538793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, float weight,
539793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale, float templ_sqsum_scale,
540793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum,
541793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<double> image_sqsum,
542793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
543793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
544793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
545793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
546793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
547793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
548793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
549793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float ccorr = result.ptr(y)[x];
550793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_ = (float)(
551793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
552793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
553793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_ = (float)(
554793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
555793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
556793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
557793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
558793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
559793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
560793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
561793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_NORMED_8U(
562793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int w, int h, const PtrStepSz<int> image_sum,
563793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<double> image_sqsum,
564793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum, double templ_sqsum,
565793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    PtrStepSzf result, cudaStream_t stream)
566793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
567793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
568793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
569793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
570793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float weight = 1.f / (w * h);
571793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale = templ_sum * weight;
572793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
573793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
574793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
575793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
576793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum, image_sqsum, result);
577793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
578793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
579793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
580793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
581793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
582793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
583793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
584793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
585793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
586793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, float weight,
587793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_r, float templ_sum_scale_g,
588793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sqsum_scale,
589793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
590793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
591793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
592793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
593793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
594793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
595793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
596793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
597793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
598793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_r_ = (float)(
599793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
600793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
601793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_r_ = (float)(
602793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
603793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
604793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_g_ = (float)(
605793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
606793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
607793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_g_ = (float)(
608793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
609793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
610793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
611793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
612793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                             - image_sum_g_ * templ_sum_scale_g;
613793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
614793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
615793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = normAcc(num, denum);
616793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
617793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
618793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
619793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
620793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int w, int h,
621793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
622793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
623793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_r, double templ_sqsum_r,
624793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_g, double templ_sqsum_g,
625793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    PtrStepSzf result, cudaStream_t stream)
626793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
627793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
628793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
629793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
630793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float weight = 1.f / (w * h);
631793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_r = templ_sum_r * weight;
632793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_g = templ_sum_g * weight;
633793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
634793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
635793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
636793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
637793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h, weight,
638793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    templ_sum_scale_r, templ_sum_scale_g,
639793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    templ_sqsum_scale,
640793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_r, image_sqsum_r,
641793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_g, image_sqsum_g,
642793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    result);
643793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
644793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
645793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
646793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
647793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
648793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
649793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
650793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
651793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
652793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, float weight,
653793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
654793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sqsum_scale,
655793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
656793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
657793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
658793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
659793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
660793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
661793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
662793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
663793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
664793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
665793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_r_ = (float)(
666793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
667793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
668793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_r_ = (float)(
669793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
670793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
671793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_g_ = (float)(
672793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
673793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
674793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_g_ = (float)(
675793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
676793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
677793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_b_ = (float)(
678793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
679793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
680793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_b_ = (float)(
681793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
682793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
683793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
684793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
685793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                             - image_sum_g_ * templ_sum_scale_g
686793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                             - image_sum_b_ * templ_sum_scale_b;
687793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
688793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
689793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
690793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = normAcc(num, denum);
691793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
692793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
693793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
694793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
695793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int w, int h,
696793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
697793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
698793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
699793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_r, double templ_sqsum_r,
700793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_g, double templ_sqsum_g,
701793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_b, double templ_sqsum_b,
702793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    PtrStepSzf result, cudaStream_t stream)
703793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
704793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
705793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
706793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
707793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float weight = 1.f / (w * h);
708793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_r = templ_sum_r * weight;
709793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_g = templ_sum_g * weight;
710793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_b = templ_sum_b * weight;
711793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
712793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
713793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
714793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
715793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
716793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h, weight,
717793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
718793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    templ_sqsum_scale,
719793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_r, image_sqsum_r,
720793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_g, image_sqsum_g,
721793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_b, image_sqsum_b,
722793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    result);
723793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
724793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
725793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
726793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
727793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
728793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
729793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
730793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
731793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
732793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, float weight,
733793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
734793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float templ_sum_scale_a, float templ_sqsum_scale,
735793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
736793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
737793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
738793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a,
739793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                PtrStepSzf result)
740793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
741793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
742793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
743793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
744793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
745793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
746793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_r_ = (float)(
747793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
748793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
749793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_r_ = (float)(
750793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
751793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
752793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_g_ = (float)(
753793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
754793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
755793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_g_ = (float)(
756793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
757793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
758793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_b_ = (float)(
759793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
760793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
761793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_b_ = (float)(
762793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
763793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
764793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sum_a_ = (float)(
765793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
766793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
767793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_a_ = (float)(
768793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
769793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
770793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
771793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
772793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
773793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
774793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
775793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
776793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
777793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = normAcc(num, denum);
778793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
779793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
780793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
781793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
782793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int w, int h,
783793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
784793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
785793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
786793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
787793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_r, double templ_sqsum_r,
788793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_g, double templ_sqsum_g,
789793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_b, double templ_sqsum_b,
790793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    int templ_sum_a, double templ_sqsum_a,
791793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    PtrStepSzf result, cudaStream_t stream)
792793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
793793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
794793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
795793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
796793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float weight = 1.f / (w * h);
797793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_r = templ_sum_r * weight;
798793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_g = templ_sum_g * weight;
799793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_b = templ_sum_b * weight;
800793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sum_scale_a = templ_sum_a * weight;
801793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
802793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
803793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
804793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
805793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
806793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
807793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    w, h, weight,
808793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
809793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    templ_sqsum_scale,
810793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_r, image_sqsum_r,
811793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_g, image_sqsum_g,
812793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_b, image_sqsum_b,
813793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    image_sum_a, image_sqsum_a,
814793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                    result);
815793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
816793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
817793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
818793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
819793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
820793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
821793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
822793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // normalize
823793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
824793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <int cn>
825793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void normalizeKernel_8U(
826793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                int w, int h, const PtrStep<double> image_sqsum,
827793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                double templ_sqsum, PtrStepSzf result)
828793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
829793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int x = blockIdx.x * blockDim.x + threadIdx.x;
830793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            const int y = blockIdx.y * blockDim.y + threadIdx.y;
831793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
832793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
833793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
834793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                float image_sqsum_ = (float)(
835793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
836793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
837793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
838793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
839793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
840793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
841793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
842793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                          double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
843793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
844793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
845793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
846793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
847793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            switch (cn)
848793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
849793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 1:
850793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
851793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
852793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 2:
853793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
854793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
855793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 3:
856793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
857793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
858793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 4:
859793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
860793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
861793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
862793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
863793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
864793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
865793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
866793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
867793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
868793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
869793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        //////////////////////////////////////////////////////////////////////
870793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        // extractFirstChannel
871793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
872793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        template <int cn>
873793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
874793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
875793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            typedef typename TypeVec<float, cn>::vec_type Typef;
876793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
877793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            int x = blockDim.x * blockIdx.x + threadIdx.x;
878793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            int y = blockDim.y * blockIdx.y + threadIdx.y;
879793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
880793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (x < result.cols && y < result.rows)
881793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
882793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                Typef val = ((const Typef*)image.ptr(y))[x];
883793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                result.ptr(y)[x] = first(val);
884793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
885793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
886793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
887793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
888793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
889793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 threads(32, 8);
890793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
891793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
892793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            switch (cn)
893793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            {
894793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 1:
895793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
896793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
897793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 2:
898793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
899793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
900793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 3:
901793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
902793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
903793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            case 4:
904793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
905793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                break;
906793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            }
907793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            cudaSafeCall( cudaGetLastError() );
908793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
909793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            if (stream == 0)
910793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                cudaSafeCall( cudaDeviceSynchronize() );
911793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
912793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    } //namespace match_template
913793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler}}} // namespace cv { namespace cuda { namespace cudev
914793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
915793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
916793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif /* CUDA_DISABLER */
917