1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                        Intel License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2000, Intel Corporation, all rights reserved.
14// Third party copyrights are property of their respective owners.
15//
16// Redistribution and use in source and binary forms, with or without modification,
17// are permitted provided that the following conditions are met:
18//
19//   * Redistribution's of source code must retain the above copyright notice,
20//     this list of conditions and the following disclaimer.
21//
22//   * Redistribution's in binary form must reproduce the above copyright notice,
23//     this list of conditions and the following disclaimer in the documentation
24//     and/or other materials provided with the distribution.
25//
26//   * The name of Intel Corporation may not be used to endorse or promote products
27//     derived from this software without specific prior written permission.
28//
29// This software is provided by the copyright holders and contributors "as is" and
30// any express or implied warranties, including, but not limited to, the implied
31// warranties of merchantability and fitness for a particular purpose are disclaimed.
32// In no event shall the Intel Corporation or contributors be liable for any direct,
33// indirect, incidental, special, exemplary, or consequential damages
34// (including, but not limited to, procurement of substitute goods or services;
35// loss of use, data, or profits; or business interruption) however caused
36// and on any theory of liability, whether in contract, strict liability,
37// or tort (including negligence or otherwise) arising in any way out of
38// the use of this software, even if advised of the possibility of such damage.
39//
40//M*/
41
42#include "_cv.h"
43
44
45/****************************************************************************************\
46                                    Base Image Filter
47\****************************************************************************************/
48
49static void default_x_filter_func( const uchar*, uchar*, void* )
50{
51}
52
53static void default_y_filter_func( uchar**, uchar*, int, int, void* )
54{
55}
56
57CvBaseImageFilter::CvBaseImageFilter()
58{
59    min_depth = CV_8U;
60    buffer = 0;
61    rows = 0;
62    max_width = 0;
63    x_func = default_x_filter_func;
64    y_func = default_y_filter_func;
65}
66
67
68CvBaseImageFilter::CvBaseImageFilter( int _max_width, int _src_type, int _dst_type,
69                                      bool _is_separable, CvSize _ksize, CvPoint _anchor,
70                                      int _border_mode, CvScalar _border_value )
71{
72    min_depth = CV_8U;
73    buffer = 0;
74    rows = 0;
75    max_width = 0;
76    x_func = default_x_filter_func;
77    y_func = default_y_filter_func;
78
79    init( _max_width, _src_type, _dst_type, _is_separable,
80          _ksize, _anchor, _border_mode, _border_value );
81}
82
83
84void CvBaseImageFilter::clear()
85{
86    cvFree( &buffer );
87    rows = 0;
88}
89
90
91CvBaseImageFilter::~CvBaseImageFilter()
92{
93    clear();
94}
95
96
97void CvBaseImageFilter::get_work_params()
98{
99    int min_rows = max_ky*2 + 3, rows = MAX(min_rows,10), row_sz;
100    int width = max_width, trow_sz = 0;
101
102    if( is_separable )
103    {
104        int max_depth = MAX(CV_MAT_DEPTH(src_type), CV_MAT_DEPTH(dst_type));
105        int max_cn = MAX(CV_MAT_CN(src_type), CV_MAT_CN(dst_type));
106        max_depth = MAX( max_depth, min_depth );
107        work_type = CV_MAKETYPE( max_depth, max_cn );
108        trow_sz = cvAlign( (max_width + ksize.width - 1)*CV_ELEM_SIZE(src_type), ALIGN );
109    }
110    else
111    {
112        work_type = src_type;
113        width += ksize.width - 1;
114    }
115    row_sz = cvAlign( width*CV_ELEM_SIZE(work_type), ALIGN );
116    buf_size = rows*row_sz;
117    buf_size = MIN( buf_size, 1 << 16 );
118    buf_size = MAX( buf_size, min_rows*row_sz );
119    max_rows = (buf_size/row_sz)*3 + max_ky*2 + 8;
120    buf_size += trow_sz;
121}
122
123
124void CvBaseImageFilter::init( int _max_width, int _src_type, int _dst_type,
125                              bool _is_separable, CvSize _ksize, CvPoint _anchor,
126                              int _border_mode, CvScalar _border_value )
127{
128    CV_FUNCNAME( "CvBaseImageFilter::init" );
129
130    __BEGIN__;
131
132    int total_buf_sz, src_pix_sz, row_tab_sz, bsz;
133    uchar* ptr;
134
135    if( !(buffer && _max_width <= max_width && _src_type == src_type &&
136        _dst_type == dst_type && _is_separable == is_separable &&
137        _ksize.width == ksize.width && _ksize.height == ksize.height &&
138        _anchor.x == anchor.x && _anchor.y == anchor.y) )
139        clear();
140
141    is_separable = _is_separable != 0;
142    max_width = _max_width; //MAX(_max_width,_ksize.width);
143    src_type = CV_MAT_TYPE(_src_type);
144    dst_type = CV_MAT_TYPE(_dst_type);
145    ksize = _ksize;
146    anchor = _anchor;
147
148    if( anchor.x == -1 )
149        anchor.x = ksize.width / 2;
150    if( anchor.y == -1 )
151        anchor.y = ksize.height / 2;
152
153    max_ky = MAX( anchor.y, ksize.height - anchor.y - 1 );
154    border_mode = _border_mode;
155    border_value = _border_value;
156
157    if( ksize.width <= 0 || ksize.height <= 0 ||
158        (unsigned)anchor.x >= (unsigned)ksize.width ||
159        (unsigned)anchor.y >= (unsigned)ksize.height )
160        CV_ERROR( CV_StsOutOfRange, "invalid kernel size and/or anchor position" );
161
162    if( border_mode != IPL_BORDER_CONSTANT && border_mode != IPL_BORDER_REPLICATE &&
163        border_mode != IPL_BORDER_REFLECT && border_mode != IPL_BORDER_REFLECT_101 )
164        CV_ERROR( CV_StsBadArg, "Invalid/unsupported border mode" );
165
166    get_work_params();
167
168    prev_width = 0;
169    prev_x_range = cvSlice(0,0);
170
171    buf_size = cvAlign( buf_size, ALIGN );
172
173    src_pix_sz = CV_ELEM_SIZE(src_type);
174    border_tab_sz1 = anchor.x*src_pix_sz;
175    border_tab_sz = (ksize.width-1)*src_pix_sz;
176    bsz = cvAlign( border_tab_sz*sizeof(int), ALIGN );
177
178    assert( max_rows > max_ky*2 );
179    row_tab_sz = cvAlign( max_rows*sizeof(uchar*), ALIGN );
180    total_buf_sz = buf_size + row_tab_sz + bsz;
181
182    CV_CALL( ptr = buffer = (uchar*)cvAlloc( total_buf_sz ));
183
184    rows = (uchar**)ptr;
185    ptr += row_tab_sz;
186    border_tab = (int*)ptr;
187    ptr += bsz;
188
189    buf_start = ptr;
190    const_row = 0;
191
192    if( border_mode == IPL_BORDER_CONSTANT )
193        cvScalarToRawData( &border_value, border_tab, src_type, 0 );
194
195    __END__;
196}
197
198
199void CvBaseImageFilter::start_process( CvSlice x_range, int width )
200{
201    int mode = border_mode;
202    int pix_sz = CV_ELEM_SIZE(src_type), work_pix_sz = CV_ELEM_SIZE(work_type);
203    int bsz = buf_size, bw = x_range.end_index - x_range.start_index, bw1 = bw + ksize.width - 1;
204    int tr_step = cvAlign(bw1*pix_sz, ALIGN );
205    int i, j, k, ofs;
206
207    if( x_range.start_index == prev_x_range.start_index &&
208        x_range.end_index == prev_x_range.end_index &&
209        width == prev_width )
210        return;
211
212    prev_x_range = x_range;
213    prev_width = width;
214
215    if( !is_separable )
216        bw = bw1;
217    else
218        bsz -= tr_step;
219
220    buf_step = cvAlign(bw*work_pix_sz, ALIGN);
221
222    if( mode == IPL_BORDER_CONSTANT )
223        bsz -= buf_step;
224    buf_max_count = bsz/buf_step;
225    buf_max_count = MIN( buf_max_count, max_rows - max_ky*2 );
226    buf_end = buf_start + buf_max_count*buf_step;
227
228    if( mode == IPL_BORDER_CONSTANT )
229    {
230        int i, tab_len = ksize.width*pix_sz;
231        uchar* bt = (uchar*)border_tab;
232        uchar* trow = buf_end;
233        const_row = buf_end + (is_separable ? 1 : 0)*tr_step;
234
235        for( i = pix_sz; i < tab_len; i++ )
236            bt[i] = bt[i - pix_sz];
237        for( i = 0; i < pix_sz; i++ )
238            trow[i] = bt[i];
239        for( i = pix_sz; i < tr_step; i++ )
240            trow[i] = trow[i - pix_sz];
241        if( is_separable )
242            x_func( trow, const_row, this );
243        return;
244    }
245
246    if( x_range.end_index - x_range.start_index <= 1 )
247        mode = IPL_BORDER_REPLICATE;
248
249    width = (width - 1)*pix_sz;
250    ofs = (anchor.x-x_range.start_index)*pix_sz;
251
252    for( k = 0; k < 2; k++ )
253    {
254        int idx, delta;
255        int i1, i2, di;
256
257        if( k == 0 )
258        {
259            idx = (x_range.start_index - 1)*pix_sz;
260            delta = di = -pix_sz;
261            i1 = border_tab_sz1 - pix_sz;
262            i2 = -pix_sz;
263        }
264        else
265        {
266            idx = x_range.end_index*pix_sz;
267            delta = di = pix_sz;
268            i1 = border_tab_sz1;
269            i2 = border_tab_sz;
270        }
271
272        if( (unsigned)idx > (unsigned)width )
273        {
274            int shift = mode == IPL_BORDER_REFLECT_101 ? pix_sz : 0;
275            idx = k == 0 ? shift : width - shift;
276            delta = -delta;
277        }
278
279        for( i = i1; i != i2; i += di )
280        {
281            for( j = 0; j < pix_sz; j++ )
282                border_tab[i + j] = idx + ofs + j;
283            if( mode != IPL_BORDER_REPLICATE )
284            {
285                if( (delta > 0 && idx == width) ||
286                    (delta < 0 && idx == 0) )
287                {
288                    if( mode == IPL_BORDER_REFLECT_101 )
289                        idx -= delta*2;
290                    delta = -delta;
291                }
292                else
293                    idx += delta;
294            }
295        }
296    }
297}
298
299
300void CvBaseImageFilter::make_y_border( int row_count, int top_rows, int bottom_rows )
301{
302    int i;
303
304    if( border_mode == IPL_BORDER_CONSTANT ||
305        border_mode == IPL_BORDER_REPLICATE )
306    {
307        uchar* row1 = border_mode == IPL_BORDER_CONSTANT ? const_row : rows[max_ky];
308
309        for( i = 0; i < top_rows && rows[i] == 0; i++ )
310            rows[i] = row1;
311
312        row1 = border_mode == IPL_BORDER_CONSTANT ? const_row : rows[row_count-1];
313        for( i = 0; i < bottom_rows; i++ )
314            rows[i + row_count] = row1;
315    }
316    else
317    {
318        int j, dj = 1, shift = border_mode == IPL_BORDER_REFLECT_101;
319
320        for( i = top_rows-1, j = top_rows+shift; i >= 0; i-- )
321        {
322            if( rows[i] == 0 )
323                rows[i] = rows[j];
324            j += dj;
325            if( dj > 0 && j >= row_count )
326            {
327                if( !bottom_rows )
328                    break;
329                j -= 1 + shift;
330                dj = -dj;
331            }
332        }
333
334        for( i = 0, j = row_count-1-shift; i < bottom_rows; i++, j-- )
335            rows[i + row_count] = rows[j];
336    }
337}
338
339
340int CvBaseImageFilter::fill_cyclic_buffer( const uchar* src, int src_step,
341                                           int y0, int y1, int y2 )
342{
343    int i, y = y0, bsz1 = border_tab_sz1, bsz = border_tab_sz;
344    int pix_size = CV_ELEM_SIZE(src_type);
345    int width = prev_x_range.end_index - prev_x_range.start_index, width_n = width*pix_size;
346    bool can_use_src_as_trow = false; //is_separable && width >= ksize.width;
347
348    // fill the cyclic buffer
349    for( ; buf_count < buf_max_count && y < y2; buf_count++, y++, src += src_step )
350    {
351        uchar* trow = is_separable ? buf_end : buf_tail;
352        uchar* bptr = can_use_src_as_trow && y1 < y && y+1 < y2 ? (uchar*)(src - bsz1) : trow;
353
354        if( bptr != trow )
355        {
356            for( i = 0; i < bsz1; i++ )
357                trow[i] = bptr[i];
358            for( ; i < bsz; i++ )
359                trow[i] = bptr[i + width_n];
360        }
361        else if( !(((size_t)(bptr + bsz1)|(size_t)src|width_n) & (sizeof(int)-1)) )
362            for( i = 0; i < width_n; i += sizeof(int) )
363                *(int*)(bptr + i + bsz1) = *(int*)(src + i);
364        else
365            for( i = 0; i < width_n; i++ )
366                bptr[i + bsz1] = src[i];
367
368        if( border_mode != IPL_BORDER_CONSTANT )
369        {
370            for( i = 0; i < bsz1; i++ )
371            {
372                int j = border_tab[i];
373                bptr[i] = bptr[j];
374            }
375            for( ; i < bsz; i++ )
376            {
377                int j = border_tab[i];
378                bptr[i + width_n] = bptr[j];
379            }
380        }
381        else
382        {
383            const uchar *bt = (uchar*)border_tab;
384            for( i = 0; i < bsz1; i++ )
385                bptr[i] = bt[i];
386
387            for( ; i < bsz; i++ )
388                bptr[i + width_n] = bt[i];
389        }
390
391        if( is_separable )
392        {
393            x_func( bptr, buf_tail, this );
394            if( bptr != trow )
395            {
396                for( i = 0; i < bsz1; i++ )
397                    bptr[i] = trow[i];
398                for( ; i < bsz; i++ )
399                    bptr[i + width_n] = trow[i];
400            }
401        }
402
403        buf_tail += buf_step;
404        if( buf_tail >= buf_end )
405            buf_tail = buf_start;
406    }
407
408    return y - y0;
409}
410
411int CvBaseImageFilter::process( const CvMat* src, CvMat* dst,
412                                CvRect src_roi, CvPoint dst_origin, int flags )
413{
414    int rows_processed = 0;
415
416    /*
417        check_parameters
418        initialize_horizontal_border_reloc_tab_if_not_initialized_yet
419
420        for_each_source_row: src starts from src_roi.y, buf starts with the first available row
421            1) if separable,
422                   1a.1) copy source row to temporary buffer, form a border using border reloc tab.
423                   1a.2) apply row-wise filter (symmetric, asymmetric or generic)
424               else
425                   1b.1) copy source row to the buffer, form a border
426            2) if the buffer is full, or it is the last source row:
427                 2.1) if stage != middle, form the pointers to other "virtual" rows.
428                 if separable
429                    2a.2) apply column-wise filter, store the results.
430                 else
431                    2b.2) form a sparse (offset,weight) tab
432                    2b.3) apply generic non-separable filter, store the results
433            3) update row pointers etc.
434    */
435
436    CV_FUNCNAME( "CvBaseImageFilter::process" );
437
438    __BEGIN__;
439
440    int i, width, _src_y1, _src_y2;
441    int src_x, src_y, src_y1, src_y2, dst_y;
442    int pix_size = CV_ELEM_SIZE(src_type);
443    uchar *sptr = 0, *dptr;
444    int phase = flags & (CV_START|CV_END|CV_MIDDLE);
445    bool isolated_roi = (flags & CV_ISOLATED_ROI) != 0;
446
447    if( !CV_IS_MAT(src) )
448        CV_ERROR( CV_StsBadArg, "" );
449
450    if( CV_MAT_TYPE(src->type) != src_type )
451        CV_ERROR( CV_StsUnmatchedFormats, "" );
452
453    width = src->cols;
454
455    if( src_roi.width == -1 && src_roi.x == 0 )
456        src_roi.width = width;
457
458    if( src_roi.height == -1 && src_roi.y == 0 )
459    {
460        src_roi.y = 0;
461        src_roi.height = src->rows;
462    }
463
464    if( src_roi.width > max_width ||
465        src_roi.x < 0 || src_roi.width < 0 ||
466        src_roi.y < 0 || src_roi.height < 0 ||
467        src_roi.x + src_roi.width > width ||
468        src_roi.y + src_roi.height > src->rows )
469        CV_ERROR( CV_StsOutOfRange, "Too large source image or its ROI" );
470
471    src_x = src_roi.x;
472    _src_y1 = 0;
473    _src_y2 = src->rows;
474
475    if( isolated_roi )
476    {
477        src_roi.x = 0;
478        width = src_roi.width;
479        _src_y1 = src_roi.y;
480        _src_y2 = src_roi.y + src_roi.height;
481    }
482
483    if( !CV_IS_MAT(dst) )
484        CV_ERROR( CV_StsBadArg, "" );
485
486    if( CV_MAT_TYPE(dst->type) != dst_type )
487        CV_ERROR( CV_StsUnmatchedFormats, "" );
488
489    if( dst_origin.x < 0 || dst_origin.y < 0 )
490        CV_ERROR( CV_StsOutOfRange, "Incorrect destination ROI origin" );
491
492    if( phase == CV_WHOLE )
493        phase = CV_START | CV_END;
494    phase &= CV_START | CV_END | CV_MIDDLE;
495
496    // initialize horizontal border relocation tab if it is not initialized yet
497    if( phase & CV_START )
498        start_process( cvSlice(src_roi.x, src_roi.x + src_roi.width), width );
499    else if( prev_width != width || prev_x_range.start_index != src_roi.x ||
500        prev_x_range.end_index != src_roi.x + src_roi.width )
501        CV_ERROR( CV_StsBadArg,
502            "In a middle or at the end the horizontal placement of the stripe can not be changed" );
503
504    dst_y = dst_origin.y;
505    src_y1 = src_roi.y;
506    src_y2 = src_roi.y + src_roi.height;
507
508    if( phase & CV_START )
509    {
510        for( i = 0; i <= max_ky*2; i++ )
511            rows[i] = 0;
512
513        src_y1 -= max_ky;
514        top_rows = bottom_rows = 0;
515
516        if( src_y1 < _src_y1 )
517        {
518            top_rows = _src_y1 - src_y1;
519            src_y1 = _src_y1;
520        }
521
522        buf_head = buf_tail = buf_start;
523        buf_count = 0;
524    }
525
526    if( phase & CV_END )
527    {
528        src_y2 += max_ky;
529
530        if( src_y2 > _src_y2 )
531        {
532            bottom_rows = src_y2 - _src_y2;
533            src_y2 = _src_y2;
534        }
535    }
536
537    dptr = dst->data.ptr + dst_origin.y*dst->step + dst_origin.x*CV_ELEM_SIZE(dst_type);
538    sptr = src->data.ptr + src_y1*src->step + src_x*pix_size;
539
540    for( src_y = src_y1; src_y < src_y2; )
541    {
542        uchar* bptr;
543        int row_count, delta;
544
545        delta = fill_cyclic_buffer( sptr, src->step, src_y, src_y1, src_y2 );
546
547        src_y += delta;
548        sptr += src->step*delta;
549
550        // initialize the cyclic buffer row pointers
551        bptr = buf_head;
552        for( i = 0; i < buf_count; i++ )
553        {
554            rows[i+top_rows] = bptr;
555            bptr += buf_step;
556            if( bptr >= buf_end )
557                bptr = buf_start;
558        }
559
560        row_count = top_rows + buf_count;
561
562        if( !rows[0] || ((phase & CV_END) && src_y == src_y2) )
563        {
564            int br = (phase & CV_END) && src_y == src_y2 ? bottom_rows : 0;
565            make_y_border( row_count, top_rows, br );
566            row_count += br;
567        }
568
569        if( rows[0] && row_count > max_ky*2 )
570        {
571            int count = row_count - max_ky*2;
572            if( dst_y + count > dst->rows )
573                CV_ERROR( CV_StsOutOfRange, "The destination image can not fit the result" );
574
575            assert( count >= 0 );
576            y_func( rows + max_ky - anchor.y, dptr, dst->step, count, this );
577            row_count -= count;
578            dst_y += count;
579            dptr += dst->step*count;
580            for( bptr = row_count > 0 ?rows[count] : 0; buf_head != bptr && buf_count > 0; buf_count-- )
581            {
582                buf_head += buf_step;
583                if( buf_head >= buf_end )
584                    buf_head = buf_start;
585            }
586            rows_processed += count;
587            top_rows = MAX(top_rows - count, 0);
588        }
589    }
590
591    __END__;
592
593    return rows_processed;
594}
595
596
597/****************************************************************************************\
598                                    Separable Linear Filter
599\****************************************************************************************/
600
601static void icvFilterRowSymm_8u32s( const uchar* src, int* dst, void* params );
602static void icvFilterColSymm_32s8u( const int** src, uchar* dst, int dst_step,
603                                    int count, void* params );
604static void icvFilterColSymm_32s16s( const int** src, short* dst, int dst_step,
605                                     int count, void* params );
606static void icvFilterRowSymm_8u32f( const uchar* src, float* dst, void* params );
607static void icvFilterRow_8u32f( const uchar* src, float* dst, void* params );
608static void icvFilterRowSymm_16s32f( const short* src, float* dst, void* params );
609static void icvFilterRow_16s32f( const short* src, float* dst, void* params );
610static void icvFilterRowSymm_16u32f( const ushort* src, float* dst, void* params );
611static void icvFilterRow_16u32f( const ushort* src, float* dst, void* params );
612static void icvFilterRowSymm_32f( const float* src, float* dst, void* params );
613static void icvFilterRow_32f( const float* src, float* dst, void* params );
614
615static void icvFilterColSymm_32f8u( const float** src, uchar* dst, int dst_step,
616                                    int count, void* params );
617static void icvFilterCol_32f8u( const float** src, uchar* dst, int dst_step,
618                                int count, void* params );
619static void icvFilterColSymm_32f16s( const float** src, short* dst, int dst_step,
620                                     int count, void* params );
621static void icvFilterCol_32f16s( const float** src, short* dst, int dst_step,
622                                 int count, void* params );
623static void icvFilterColSymm_32f16u( const float** src, ushort* dst, int dst_step,
624                                     int count, void* params );
625static void icvFilterCol_32f16u( const float** src, ushort* dst, int dst_step,
626                                 int count, void* params );
627static void icvFilterColSymm_32f( const float** src, float* dst, int dst_step,
628                                  int count, void* params );
629static void icvFilterCol_32f( const float** src, float* dst, int dst_step,
630                              int count, void* params );
631
632CvSepFilter::CvSepFilter()
633{
634    min_depth = CV_32F;
635    kx = ky = 0;
636    kx_flags = ky_flags = 0;
637}
638
639
640CvSepFilter::CvSepFilter( int _max_width, int _src_type, int _dst_type,
641                          const CvMat* _kx, const CvMat* _ky,
642                          CvPoint _anchor, int _border_mode,
643                          CvScalar _border_value )
644{
645    min_depth = CV_32F;
646    kx = ky = 0;
647    init( _max_width, _src_type, _dst_type, _kx, _ky, _anchor, _border_mode, _border_value );
648}
649
650
651void CvSepFilter::clear()
652{
653    cvReleaseMat( &kx );
654    cvReleaseMat( &ky );
655    CvBaseImageFilter::clear();
656}
657
658
659CvSepFilter::~CvSepFilter()
660{
661    clear();
662}
663
664
665#undef FILTER_BITS
666#define FILTER_BITS 8
667
668void CvSepFilter::init( int _max_width, int _src_type, int _dst_type,
669                        const CvMat* _kx, const CvMat* _ky,
670                        CvPoint _anchor, int _border_mode,
671                        CvScalar _border_value )
672{
673    CV_FUNCNAME( "CvSepFilter::init" );
674
675    __BEGIN__;
676
677    CvSize _ksize;
678    int filter_type;
679    int i, xsz, ysz;
680    int convert_filters = 0;
681    double xsum = 0, ysum = 0;
682    const float eps = FLT_EPSILON*100.f;
683
684    if( !CV_IS_MAT(_kx) || !CV_IS_MAT(_ky) ||
685        (_kx->cols != 1 && _kx->rows != 1) ||
686        (_ky->cols != 1 && _ky->rows != 1) ||
687        CV_MAT_CN(_kx->type) != 1 || CV_MAT_CN(_ky->type) != 1 ||
688        !CV_ARE_TYPES_EQ(_kx,_ky) )
689        CV_ERROR( CV_StsBadArg,
690        "Both kernels must be valid 1d single-channel vectors of the same types" );
691
692    if( CV_MAT_CN(_src_type) != CV_MAT_CN(_dst_type) )
693        CV_ERROR( CV_StsUnmatchedFormats, "Input and output must have the same number of channels" );
694
695    filter_type = MAX( CV_32F, CV_MAT_DEPTH(_kx->type) );
696
697    _ksize.width = _kx->rows + _kx->cols - 1;
698    _ksize.height = _ky->rows + _ky->cols - 1;
699
700    CV_CALL( CvBaseImageFilter::init( _max_width, _src_type, _dst_type, 1, _ksize,
701                                      _anchor, _border_mode, _border_value ));
702
703    if( !(kx && CV_ARE_SIZES_EQ(kx,_kx)) )
704    {
705        cvReleaseMat( &kx );
706        CV_CALL( kx = cvCreateMat( _kx->rows, _kx->cols, filter_type ));
707    }
708
709    if( !(ky && CV_ARE_SIZES_EQ(ky,_ky)) )
710    {
711        cvReleaseMat( &ky );
712        CV_CALL( ky = cvCreateMat( _ky->rows, _ky->cols, filter_type ));
713    }
714
715    CV_CALL( cvConvert( _kx, kx ));
716    CV_CALL( cvConvert( _ky, ky ));
717
718    xsz = kx->rows + kx->cols - 1;
719    ysz = ky->rows + ky->cols - 1;
720    kx_flags = ky_flags = ASYMMETRICAL + SYMMETRICAL + POSITIVE + SUM_TO_1 + INTEGER;
721
722    if( !(xsz & 1) )
723        kx_flags &= ~(ASYMMETRICAL + SYMMETRICAL);
724    if( !(ysz & 1) )
725        ky_flags &= ~(ASYMMETRICAL + SYMMETRICAL);
726
727    for( i = 0; i < xsz; i++ )
728    {
729        float v = kx->data.fl[i];
730        xsum += v;
731        if( v < 0 )
732            kx_flags &= ~POSITIVE;
733        if( fabs(v - cvRound(v)) > eps )
734            kx_flags &= ~INTEGER;
735        if( fabs(v - kx->data.fl[xsz - i - 1]) > eps )
736            kx_flags &= ~SYMMETRICAL;
737        if( fabs(v + kx->data.fl[xsz - i - 1]) > eps )
738            kx_flags &= ~ASYMMETRICAL;
739    }
740
741    if( fabs(xsum - 1.) > eps )
742        kx_flags &= ~SUM_TO_1;
743
744    for( i = 0; i < ysz; i++ )
745    {
746        float v = ky->data.fl[i];
747        ysum += v;
748        if( v < 0 )
749            ky_flags &= ~POSITIVE;
750        if( fabs(v - cvRound(v)) > eps )
751            ky_flags &= ~INTEGER;
752        if( fabs(v - ky->data.fl[ysz - i - 1]) > eps )
753            ky_flags &= ~SYMMETRICAL;
754        if( fabs(v + ky->data.fl[ysz - i - 1]) > eps )
755            ky_flags &= ~ASYMMETRICAL;
756    }
757
758    if( fabs(ysum - 1.) > eps )
759        ky_flags &= ~SUM_TO_1;
760
761    x_func = 0;
762    y_func = 0;
763
764    if( CV_MAT_DEPTH(src_type) == CV_8U )
765    {
766        if( CV_MAT_DEPTH(dst_type) == CV_8U &&
767            ((kx_flags&ky_flags) & (SYMMETRICAL + POSITIVE + SUM_TO_1)) == SYMMETRICAL + POSITIVE + SUM_TO_1 )
768        {
769            x_func = (CvRowFilterFunc)icvFilterRowSymm_8u32s;
770            y_func = (CvColumnFilterFunc)icvFilterColSymm_32s8u;
771            kx_flags &= ~INTEGER;
772            ky_flags &= ~INTEGER;
773            convert_filters = 1;
774        }
775        else if( CV_MAT_DEPTH(dst_type) == CV_16S &&
776            (kx_flags & (SYMMETRICAL + ASYMMETRICAL)) && (kx_flags & INTEGER) &&
777            (ky_flags & (SYMMETRICAL + ASYMMETRICAL)) && (ky_flags & INTEGER) )
778        {
779            x_func = (CvRowFilterFunc)icvFilterRowSymm_8u32s;
780            y_func = (CvColumnFilterFunc)icvFilterColSymm_32s16s;
781            convert_filters = 1;
782        }
783        else
784        {
785            if( CV_MAT_DEPTH(dst_type) > CV_32F )
786                CV_ERROR( CV_StsUnsupportedFormat, "8u->64f separable filtering is not supported" );
787
788            if( kx_flags & (SYMMETRICAL + ASYMMETRICAL) )
789                x_func = (CvRowFilterFunc)icvFilterRowSymm_8u32f;
790            else
791                x_func = (CvRowFilterFunc)icvFilterRow_8u32f;
792        }
793    }
794    else if( CV_MAT_DEPTH(src_type) == CV_16U )
795    {
796        if( CV_MAT_DEPTH(dst_type) > CV_32F )
797            CV_ERROR( CV_StsUnsupportedFormat, "16u->64f separable filtering is not supported" );
798
799        if( kx_flags & (SYMMETRICAL + ASYMMETRICAL) )
800            x_func = (CvRowFilterFunc)icvFilterRowSymm_16u32f;
801        else
802            x_func = (CvRowFilterFunc)icvFilterRow_16u32f;
803    }
804    else if( CV_MAT_DEPTH(src_type) == CV_16S )
805    {
806        if( CV_MAT_DEPTH(dst_type) > CV_32F )
807            CV_ERROR( CV_StsUnsupportedFormat, "16s->64f separable filtering is not supported" );
808
809        if( kx_flags & (SYMMETRICAL + ASYMMETRICAL) )
810            x_func = (CvRowFilterFunc)icvFilterRowSymm_16s32f;
811        else
812            x_func = (CvRowFilterFunc)icvFilterRow_16s32f;
813    }
814    else if( CV_MAT_DEPTH(src_type) == CV_32F )
815    {
816        if( CV_MAT_DEPTH(dst_type) != CV_32F )
817            CV_ERROR( CV_StsUnsupportedFormat, "When the input has 32f data type, the output must also have 32f type" );
818
819        if( kx_flags & (SYMMETRICAL + ASYMMETRICAL) )
820            x_func = (CvRowFilterFunc)icvFilterRowSymm_32f;
821        else
822            x_func = (CvRowFilterFunc)icvFilterRow_32f;
823    }
824    else
825        CV_ERROR( CV_StsUnsupportedFormat, "Unknown or unsupported input data type" );
826
827    if( !y_func )
828    {
829        if( CV_MAT_DEPTH(dst_type) == CV_8U )
830        {
831            if( ky_flags & (SYMMETRICAL + ASYMMETRICAL) )
832                y_func = (CvColumnFilterFunc)icvFilterColSymm_32f8u;
833            else
834                y_func = (CvColumnFilterFunc)icvFilterCol_32f8u;
835        }
836        else if( CV_MAT_DEPTH(dst_type) == CV_16U )
837        {
838            if( ky_flags & (SYMMETRICAL + ASYMMETRICAL) )
839                y_func = (CvColumnFilterFunc)icvFilterColSymm_32f16u;
840            else
841                y_func = (CvColumnFilterFunc)icvFilterCol_32f16u;
842        }
843        else if( CV_MAT_DEPTH(dst_type) == CV_16S )
844        {
845            if( ky_flags & (SYMMETRICAL + ASYMMETRICAL) )
846                y_func = (CvColumnFilterFunc)icvFilterColSymm_32f16s;
847            else
848                y_func = (CvColumnFilterFunc)icvFilterCol_32f16s;
849        }
850        else if( CV_MAT_DEPTH(dst_type) == CV_32F )
851        {
852            if( ky_flags & (SYMMETRICAL + ASYMMETRICAL) )
853                y_func = (CvColumnFilterFunc)icvFilterColSymm_32f;
854            else
855                y_func = (CvColumnFilterFunc)icvFilterCol_32f;
856        }
857        else
858            CV_ERROR( CV_StsUnsupportedFormat, "Unknown or unsupported input data type" );
859    }
860
861    if( convert_filters )
862    {
863        int scale = kx_flags & ky_flags & INTEGER ? 1 : (1 << FILTER_BITS);
864        int sum;
865
866        for( i = sum = 0; i < xsz; i++ )
867        {
868            int t = cvRound(kx->data.fl[i]*scale);
869            kx->data.i[i] = t;
870            sum += t;
871        }
872        if( scale > 1 )
873            kx->data.i[xsz/2] += scale - sum;
874
875        for( i = sum = 0; i < ysz; i++ )
876        {
877            int t = cvRound(ky->data.fl[i]*scale);
878            ky->data.i[i] = t;
879            sum += t;
880        }
881        if( scale > 1 )
882            ky->data.i[ysz/2] += scale - sum;
883        kx->type = (kx->type & ~CV_MAT_DEPTH_MASK) | CV_32S;
884        ky->type = (ky->type & ~CV_MAT_DEPTH_MASK) | CV_32S;
885    }
886
887    __END__;
888}
889
890
891void CvSepFilter::init( int _max_width, int _src_type, int _dst_type,
892                        bool _is_separable, CvSize _ksize,
893                        CvPoint _anchor, int _border_mode,
894                        CvScalar _border_value )
895{
896    CvBaseImageFilter::init( _max_width, _src_type, _dst_type, _is_separable,
897                             _ksize, _anchor, _border_mode, _border_value );
898}
899
900
901static void
902icvFilterRowSymm_8u32s( const uchar* src, int* dst, void* params )
903{
904    const CvSepFilter* state = (const CvSepFilter*)params;
905    const CvMat* _kx = state->get_x_kernel();
906    const int* kx = _kx->data.i;
907    int ksize = _kx->cols + _kx->rows - 1;
908    int i = 0, j, k, width = state->get_width();
909    int cn = CV_MAT_CN(state->get_src_type());
910    int ksize2 = ksize/2, ksize2n = ksize2*cn;
911    int is_symm = state->get_x_kernel_flags() & CvSepFilter::SYMMETRICAL;
912    const uchar* s = src + ksize2n;
913
914    kx += ksize2;
915    width *= cn;
916
917    if( is_symm )
918    {
919        if( ksize == 1 && kx[0] == 1 )
920        {
921            for( i = 0; i <= width - 2; i += 2 )
922            {
923                int s0 = s[i], s1 = s[i+1];
924                dst[i] = s0; dst[i+1] = s1;
925            }
926            s += i;
927        }
928        else if( ksize == 3 )
929        {
930            if( kx[0] == 2 && kx[1] == 1 )
931                for( ; i <= width - 2; i += 2, s += 2 )
932                {
933                    int s0 = s[-cn] + s[0]*2 + s[cn], s1 = s[1-cn] + s[1]*2 + s[1+cn];
934                    dst[i] = s0; dst[i+1] = s1;
935                }
936            else if( kx[0] == 10 && kx[1] == 3 )
937                for( ; i <= width - 2; i += 2, s += 2 )
938                {
939                    int s0 = s[0]*10 + (s[-cn] + s[cn])*3, s1 = s[1]*10 + (s[1-cn] + s[1+cn])*3;
940                    dst[i] = s0; dst[i+1] = s1;
941                }
942            else if( kx[0] == 2*64 && kx[1] == 1*64 )
943                for( ; i <= width - 2; i += 2, s += 2 )
944                {
945                    int s0 = (s[0]*2 + s[-cn] + s[cn]) << 6;
946                    int s1 = (s[1]*2 + s[1-cn] + s[1+cn]) << 6;
947                    dst[i] = s0; dst[i+1] = s1;
948                }
949            else
950            {
951                int k0 = kx[0], k1 = kx[1];
952                for( ; i <= width - 2; i += 2, s += 2 )
953                {
954                    int s0 = s[0]*k0 + (s[-cn] + s[cn])*k1, s1 = s[1]*k0 + (s[1-cn] + s[1+cn])*k1;
955                    dst[i] = s0; dst[i+1] = s1;
956                }
957            }
958        }
959        else if( ksize == 5 )
960        {
961            int k0 = kx[0], k1 = kx[1], k2 = kx[2];
962            if( k0 == 6*16 && k1 == 4*16 && k2 == 1*16 )
963                for( ; i <= width - 2; i += 2, s += 2 )
964                {
965                    int s0 = (s[0]*6 + (s[-cn] + s[cn])*4 + (s[-cn*2] + s[cn*2])*1) << 4;
966                    int s1 = (s[1]*6 + (s[1-cn] + s[1+cn])*4 + (s[1-cn*2] + s[1+cn*2])*1) << 4;
967                    dst[i] = s0; dst[i+1] = s1;
968                }
969            else
970                for( ; i <= width - 2; i += 2, s += 2 )
971                {
972                    int s0 = s[0]*k0 + (s[-cn] + s[cn])*k1 + (s[-cn*2] + s[cn*2])*k2;
973                    int s1 = s[1]*k0 + (s[1-cn] + s[1+cn])*k1 + (s[1-cn*2] + s[1+cn*2])*k2;
974                    dst[i] = s0; dst[i+1] = s1;
975                }
976        }
977        else
978            for( ; i <= width - 4; i += 4, s += 4 )
979            {
980                int f = kx[0];
981                int s0 = f*s[0], s1 = f*s[1], s2 = f*s[2], s3 = f*s[3];
982                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
983                {
984                    f = kx[k];
985                    s0 += f*(s[j] + s[-j]); s1 += f*(s[j+1] + s[-j+1]);
986                    s2 += f*(s[j+2] + s[-j+2]); s3 += f*(s[j+3] + s[-j+3]);
987                }
988
989                dst[i] = s0; dst[i+1] = s1;
990                dst[i+2] = s2; dst[i+3] = s3;
991            }
992
993        for( ; i < width; i++, s++ )
994        {
995            int s0 = kx[0]*s[0];
996            for( k = 1, j = cn; k <= ksize2; k++, j += cn )
997                s0 += kx[k]*(s[j] + s[-j]);
998            dst[i] = s0;
999        }
1000    }
1001    else
1002    {
1003        if( ksize == 3 && kx[0] == 0 && kx[1] == 1 )
1004            for( ; i <= width - 2; i += 2, s += 2 )
1005            {
1006                int s0 = s[cn] - s[-cn], s1 = s[1+cn] - s[1-cn];
1007                dst[i] = s0; dst[i+1] = s1;
1008            }
1009        else
1010            for( ; i <= width - 4; i += 4, s += 4 )
1011            {
1012                int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
1013                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
1014                {
1015                    int f = kx[k];
1016                    s0 += f*(s[j] - s[-j]); s1 += f*(s[j+1] - s[-j+1]);
1017                    s2 += f*(s[j+2] - s[-j+2]); s3 += f*(s[j+3] - s[-j+3]);
1018                }
1019
1020                dst[i] = s0; dst[i+1] = s1;
1021                dst[i+2] = s2; dst[i+3] = s3;
1022            }
1023
1024        for( ; i < width; i++, s++ )
1025        {
1026            int s0 = kx[0]*s[0];
1027            for( k = 1, j = cn; k <= ksize2; k++, j += cn )
1028                s0 += kx[k]*(s[j] - s[-j]);
1029            dst[i] = s0;
1030        }
1031    }
1032}
1033
1034
1035#define ICV_FILTER_ROW( flavor, srctype, dsttype, load_macro )      \
1036static void                                                         \
1037icvFilterRow_##flavor(const srctype* src, dsttype* dst, void*params)\
1038{                                                                   \
1039    const CvSepFilter* state = (const CvSepFilter*)params;          \
1040    const CvMat* _kx = state->get_x_kernel();                       \
1041    const dsttype* kx = (const dsttype*)(_kx->data.ptr);            \
1042    int ksize = _kx->cols + _kx->rows - 1;                          \
1043    int i = 0, k, width = state->get_width();                       \
1044    int cn = CV_MAT_CN(state->get_src_type());                      \
1045    const srctype* s;                                               \
1046                                                                    \
1047    width *= cn;                                                    \
1048                                                                    \
1049    for( ; i <= width - 4; i += 4 )                                 \
1050    {                                                               \
1051        double f = kx[0];                                           \
1052        double s0=f*load_macro(src[i]), s1=f*load_macro(src[i+1]),  \
1053                s2=f*load_macro(src[i+2]), s3=f*load_macro(src[i+3]);\
1054        for( k = 1, s = src + i + cn; k < ksize; k++, s += cn )     \
1055        {                                                           \
1056            f = kx[k];                                              \
1057            s0 += f*load_macro(s[0]);                               \
1058            s1 += f*load_macro(s[1]);                               \
1059            s2 += f*load_macro(s[2]);                               \
1060            s3 += f*load_macro(s[3]);                               \
1061        }                                                           \
1062        dst[i] = (dsttype)s0; dst[i+1] = (dsttype)s1;               \
1063        dst[i+2] = (dsttype)s2; dst[i+3] = (dsttype)s3;             \
1064    }                                                               \
1065                                                                    \
1066    for( ; i < width; i++ )                                         \
1067    {                                                               \
1068        double s0 = (double)kx[0]*load_macro(src[i]);               \
1069        for( k = 1, s = src + i + cn; k < ksize; k++, s += cn )     \
1070            s0 += (double)kx[k]*load_macro(s[0]);                   \
1071        dst[i] = (dsttype)s0;                                       \
1072    }                                                               \
1073}
1074
1075
1076ICV_FILTER_ROW( 8u32f, uchar, float, CV_8TO32F )
1077ICV_FILTER_ROW( 16s32f, short, float, CV_NOP )
1078ICV_FILTER_ROW( 16u32f, ushort, float, CV_NOP )
1079ICV_FILTER_ROW( 32f, float, float, CV_NOP )
1080
1081
1082#define ICV_FILTER_ROW_SYMM( flavor, srctype, dsttype, load_macro ) \
1083static void                                                         \
1084icvFilterRowSymm_##flavor( const srctype* src,                      \
1085                           dsttype* dst, void* params )             \
1086{                                                                   \
1087    const CvSepFilter* state = (const CvSepFilter*)params;          \
1088    const CvMat* _kx = state->get_x_kernel();                       \
1089    const dsttype* kx = (const dsttype*)(_kx->data.ptr);            \
1090    int ksize = _kx->cols + _kx->rows - 1;                          \
1091    int i = 0, j, k, width = state->get_width();                    \
1092    int cn = CV_MAT_CN(state->get_src_type());                      \
1093    int is_symm=state->get_x_kernel_flags()&CvSepFilter::SYMMETRICAL;\
1094    int ksize2 = ksize/2, ksize2n = ksize2*cn;                      \
1095    const srctype* s = src + ksize2n;                               \
1096                                                                    \
1097    kx += ksize2;                                                   \
1098    width *= cn;                                                    \
1099                                                                    \
1100    if( is_symm )                                                   \
1101    {                                                               \
1102        for( ; i <= width - 4; i += 4, s += 4 )                     \
1103        {                                                           \
1104            double f = kx[0];                                       \
1105            double s0=f*load_macro(s[0]), s1=f*load_macro(s[1]),    \
1106                   s2=f*load_macro(s[2]), s3=f*load_macro(s[3]);    \
1107            for( k = 1, j = cn; k <= ksize2; k++, j += cn )         \
1108            {                                                       \
1109                f = kx[k];                                          \
1110                s0 += f*load_macro(s[j] + s[-j]);                   \
1111                s1 += f*load_macro(s[j+1] + s[-j+1]);               \
1112                s2 += f*load_macro(s[j+2] + s[-j+2]);               \
1113                s3 += f*load_macro(s[j+3] + s[-j+3]);               \
1114            }                                                       \
1115                                                                    \
1116            dst[i] = (dsttype)s0; dst[i+1] = (dsttype)s1;           \
1117            dst[i+2] = (dsttype)s2; dst[i+3] = (dsttype)s3;         \
1118        }                                                           \
1119                                                                    \
1120        for( ; i < width; i++, s++ )                                \
1121        {                                                           \
1122            double s0 = (double)kx[0]*load_macro(s[0]);             \
1123            for( k = 1, j = cn; k <= ksize2; k++, j += cn )         \
1124                s0 += (double)kx[k]*load_macro(s[j] + s[-j]);       \
1125            dst[i] = (dsttype)s0;                                   \
1126        }                                                           \
1127    }                                                               \
1128    else                                                            \
1129    {                                                               \
1130        for( ; i <= width - 4; i += 4, s += 4 )                     \
1131        {                                                           \
1132            double s0 = 0, s1 = 0, s2 = 0, s3 = 0;                  \
1133            for( k = 1, j = cn; k <= ksize2; k++, j += cn )         \
1134            {                                                       \
1135                double f = kx[k];                                   \
1136                s0 += f*load_macro(s[j] - s[-j]);                   \
1137                s1 += f*load_macro(s[j+1] - s[-j+1]);               \
1138                s2 += f*load_macro(s[j+2] - s[-j+2]);               \
1139                s3 += f*load_macro(s[j+3] - s[-j+3]);               \
1140            }                                                       \
1141                                                                    \
1142            dst[i] = (dsttype)s0; dst[i+1] = (dsttype)s1;           \
1143            dst[i+2] = (dsttype)s2; dst[i+3] = (dsttype)s3;         \
1144        }                                                           \
1145                                                                    \
1146        for( ; i < width; i++, s++ )                                \
1147        {                                                           \
1148            double s0 = 0;                                          \
1149            for( k = 1, j = cn; k <= ksize2; k++, j += cn )         \
1150                s0 += (double)kx[k]*load_macro(s[j] - s[-j]);       \
1151            dst[i] = (dsttype)s0;                                   \
1152        }                                                           \
1153    }                                                               \
1154}
1155
1156
1157ICV_FILTER_ROW_SYMM( 8u32f, uchar, float, CV_8TO32F )
1158ICV_FILTER_ROW_SYMM( 16s32f, short, float, CV_NOP )
1159ICV_FILTER_ROW_SYMM( 16u32f, ushort, float, CV_NOP )
1160
1161static void
1162icvFilterRowSymm_32f( const float* src, float* dst, void* params )
1163{
1164    const CvSepFilter* state = (const CvSepFilter*)params;
1165    const CvMat* _kx = state->get_x_kernel();
1166    const float* kx = _kx->data.fl;
1167    int ksize = _kx->cols + _kx->rows - 1;
1168    int i = 0, j, k, width = state->get_width();
1169    int cn = CV_MAT_CN(state->get_src_type());
1170    int ksize2 = ksize/2, ksize2n = ksize2*cn;
1171    int is_symm = state->get_x_kernel_flags() & CvSepFilter::SYMMETRICAL;
1172    const float* s = src + ksize2n;
1173
1174    kx += ksize2;
1175    width *= cn;
1176
1177    if( is_symm )
1178    {
1179        if( ksize == 3 && fabs(kx[0]-2.) <= FLT_EPSILON && fabs(kx[1]-1.) <= FLT_EPSILON )
1180            for( ; i <= width - 2; i += 2, s += 2 )
1181            {
1182                float s0 = s[-cn] + s[0]*2 + s[cn], s1 = s[1-cn] + s[1]*2 + s[1+cn];
1183                dst[i] = s0; dst[i+1] = s1;
1184            }
1185        else if( ksize == 3 && fabs(kx[0]-10.) <= FLT_EPSILON && fabs(kx[1]-3.) <= FLT_EPSILON )
1186            for( ; i <= width - 2; i += 2, s += 2 )
1187            {
1188                float s0 = s[0]*10 + (s[-cn] + s[cn])*3, s1 = s[1]*10 + (s[1-cn] + s[1+cn])*3;
1189                dst[i] = s0; dst[i+1] = s1;
1190            }
1191        else
1192            for( ; i <= width - 4; i += 4, s += 4 )
1193            {
1194                double f = kx[0];
1195                double s0 = f*s[0], s1 = f*s[1], s2 = f*s[2], s3 = f*s[3];
1196                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
1197                {
1198                    f = kx[k];
1199                    s0 += f*(s[j] + s[-j]); s1 += f*(s[j+1] + s[-j+1]);
1200                    s2 += f*(s[j+2] + s[-j+2]); s3 += f*(s[j+3] + s[-j+3]);
1201                }
1202
1203                dst[i] = (float)s0; dst[i+1] = (float)s1;
1204                dst[i+2] = (float)s2; dst[i+3] = (float)s3;
1205            }
1206
1207        for( ; i < width; i++, s++ )
1208        {
1209            double s0 = (double)kx[0]*s[0];
1210            for( k = 1, j = cn; k <= ksize2; k++, j += cn )
1211                s0 += (double)kx[k]*(s[j] + s[-j]);
1212            dst[i] = (float)s0;
1213        }
1214    }
1215    else
1216    {
1217        if( ksize == 3 && fabs(kx[0]) <= FLT_EPSILON && fabs(kx[1]-1.) <= FLT_EPSILON )
1218            for( ; i <= width - 2; i += 2, s += 2 )
1219            {
1220                float s0 = s[cn] - s[-cn], s1 = s[1+cn] - s[1-cn];
1221                dst[i] = s0; dst[i+1] = s1;
1222            }
1223        else
1224            for( ; i <= width - 4; i += 4, s += 4 )
1225            {
1226                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
1227                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
1228                {
1229                    double f = kx[k];
1230                    s0 += f*(s[j] - s[-j]); s1 += f*(s[j+1] - s[-j+1]);
1231                    s2 += f*(s[j+2] - s[-j+2]); s3 += f*(s[j+3] - s[-j+3]);
1232                }
1233
1234                dst[i] = (float)s0; dst[i+1] = (float)s1;
1235                dst[i+2] = (float)s2; dst[i+3] = (float)s3;
1236            }
1237
1238        for( ; i < width; i++, s++ )
1239        {
1240            double s0 = (double)kx[0]*s[0];
1241            for( k = 1, j = cn; k <= ksize2; k++, j += cn )
1242                s0 += (double)kx[k]*(s[j] - s[-j]);
1243            dst[i] = (float)s0;
1244        }
1245    }
1246}
1247
1248
1249static void
1250icvFilterColSymm_32s8u( const int** src, uchar* dst, int dst_step, int count, void* params )
1251{
1252    const CvSepFilter* state = (const CvSepFilter*)params;
1253    const CvMat* _ky = state->get_y_kernel();
1254    const int* ky = _ky->data.i;
1255    int ksize = _ky->cols + _ky->rows - 1, ksize2 = ksize/2;
1256    int i, k, width = state->get_width();
1257    int cn = CV_MAT_CN(state->get_src_type());
1258
1259    width *= cn;
1260    src += ksize2;
1261    ky += ksize2;
1262
1263    for( ; count--; dst += dst_step, src++ )
1264    {
1265        if( ksize == 3 )
1266        {
1267            const int* sptr0 = src[-1], *sptr1 = src[0], *sptr2 = src[1];
1268            int k0 = ky[0], k1 = ky[1];
1269            for( i = 0; i <= width - 2; i += 2 )
1270            {
1271                int s0 = sptr1[i]*k0 + (sptr0[i] + sptr2[i])*k1;
1272                int s1 = sptr1[i+1]*k0 + (sptr0[i+1] + sptr2[i+1])*k1;
1273                s0 = CV_DESCALE(s0, FILTER_BITS*2);
1274                s1 = CV_DESCALE(s1, FILTER_BITS*2);
1275                dst[i] = (uchar)s0; dst[i+1] = (uchar)s1;
1276            }
1277        }
1278        else if( ksize == 5 )
1279        {
1280            const int* sptr0 = src[-2], *sptr1 = src[-1],
1281                *sptr2 = src[0], *sptr3 = src[1], *sptr4 = src[2];
1282            int k0 = ky[0], k1 = ky[1], k2 = ky[2];
1283            for( i = 0; i <= width - 2; i += 2 )
1284            {
1285                int s0 = sptr2[i]*k0 + (sptr1[i] + sptr3[i])*k1 + (sptr0[i] + sptr4[i])*k2;
1286                int s1 = sptr2[i+1]*k0 + (sptr1[i+1] + sptr3[i+1])*k1 + (sptr0[i+1] + sptr4[i+1])*k2;
1287                s0 = CV_DESCALE(s0, FILTER_BITS*2);
1288                s1 = CV_DESCALE(s1, FILTER_BITS*2);
1289                dst[i] = (uchar)s0; dst[i+1] = (uchar)s1;
1290            }
1291        }
1292        else
1293            for( i = 0; i <= width - 4; i += 4 )
1294            {
1295                int f = ky[0];
1296                const int* sptr = src[0] + i, *sptr2;
1297                int s0 = f*sptr[0], s1 = f*sptr[1], s2 = f*sptr[2], s3 = f*sptr[3];
1298                for( k = 1; k <= ksize2; k++ )
1299                {
1300                    sptr = src[k] + i;
1301                    sptr2 = src[-k] + i;
1302                    f = ky[k];
1303                    s0 += f*(sptr[0] + sptr2[0]);
1304                    s1 += f*(sptr[1] + sptr2[1]);
1305                    s2 += f*(sptr[2] + sptr2[2]);
1306                    s3 += f*(sptr[3] + sptr2[3]);
1307                }
1308
1309                s0 = CV_DESCALE(s0, FILTER_BITS*2);
1310                s1 = CV_DESCALE(s1, FILTER_BITS*2);
1311                dst[i] = (uchar)s0; dst[i+1] = (uchar)s1;
1312                s2 = CV_DESCALE(s2, FILTER_BITS*2);
1313                s3 = CV_DESCALE(s3, FILTER_BITS*2);
1314                dst[i+2] = (uchar)s2; dst[i+3] = (uchar)s3;
1315            }
1316
1317        for( ; i < width; i++ )
1318        {
1319            int s0 = ky[0]*src[0][i];
1320            for( k = 1; k <= ksize2; k++ )
1321                s0 += ky[k]*(src[k][i] + src[-k][i]);
1322
1323            s0 = CV_DESCALE(s0, FILTER_BITS*2);
1324            dst[i] = (uchar)s0;
1325        }
1326    }
1327}
1328
1329
1330static void
1331icvFilterColSymm_32s16s( const int** src, short* dst,
1332                         int dst_step, int count, void* params )
1333{
1334    const CvSepFilter* state = (const CvSepFilter*)params;
1335    const CvMat* _ky = state->get_y_kernel();
1336    const int* ky = (const int*)_ky->data.ptr;
1337    int ksize = _ky->cols + _ky->rows - 1, ksize2 = ksize/2;
1338    int i = 0, k, width = state->get_width();
1339    int cn = CV_MAT_CN(state->get_src_type());
1340    int is_symm = state->get_y_kernel_flags() & CvSepFilter::SYMMETRICAL;
1341    int is_1_2_1 = is_symm && ksize == 3 && ky[1] == 2 && ky[2] == 1;
1342    int is_3_10_3 = is_symm && ksize == 3 && ky[1] == 10 && ky[2] == 3;
1343    int is_m1_0_1 = !is_symm && ksize == 3 && ky[1] == 0 &&
1344        ky[2]*ky[2] == 1 ? (ky[2] > 0 ? 1 : -1) : 0;
1345
1346    width *= cn;
1347    src += ksize2;
1348    ky += ksize2;
1349    dst_step /= sizeof(dst[0]);
1350
1351    if( is_symm )
1352    {
1353        for( ; count--; dst += dst_step, src++ )
1354        {
1355            if( is_1_2_1 )
1356            {
1357                const int *src0 = src[-1], *src1 = src[0], *src2 = src[1];
1358
1359                for( i = 0; i <= width - 2; i += 2 )
1360                {
1361                    int s0 = src0[i] + src1[i]*2 + src2[i],
1362                        s1 = src0[i+1] + src1[i+1]*2 + src2[i+1];
1363
1364                    dst[i] = (short)s0; dst[i+1] = (short)s1;
1365                }
1366            }
1367            else if( is_3_10_3 )
1368            {
1369                const int *src0 = src[-1], *src1 = src[0], *src2 = src[1];
1370
1371                for( i = 0; i <= width - 2; i += 2 )
1372                {
1373                    int s0 = src1[i]*10 + (src0[i] + src2[i])*3,
1374                        s1 = src1[i+1]*10 + (src0[i+1] + src2[i+1])*3;
1375
1376                    dst[i] = (short)s0; dst[i+1] = (short)s1;
1377                }
1378            }
1379            else
1380                for( i = 0; i <= width - 4; i += 4 )
1381                {
1382                    int f = ky[0];
1383                    const int* sptr = src[0] + i, *sptr2;
1384                    int s0 = f*sptr[0], s1 = f*sptr[1],
1385                        s2 = f*sptr[2], s3 = f*sptr[3];
1386                    for( k = 1; k <= ksize2; k++ )
1387                    {
1388                        sptr = src[k] + i; sptr2 = src[-k] + i; f = ky[k];
1389                        s0 += f*(sptr[0] + sptr2[0]); s1 += f*(sptr[1] + sptr2[1]);
1390                        s2 += f*(sptr[2] + sptr2[2]); s3 += f*(sptr[3] + sptr2[3]);
1391                    }
1392
1393                    dst[i] = CV_CAST_16S(s0); dst[i+1] = CV_CAST_16S(s1);
1394                    dst[i+2] = CV_CAST_16S(s2); dst[i+3] = CV_CAST_16S(s3);
1395                }
1396
1397            for( ; i < width; i++ )
1398            {
1399                int s0 = ky[0]*src[0][i];
1400                for( k = 1; k <= ksize2; k++ )
1401                    s0 += ky[k]*(src[k][i] + src[-k][i]);
1402                dst[i] = CV_CAST_16S(s0);
1403            }
1404        }
1405    }
1406    else
1407    {
1408        for( ; count--; dst += dst_step, src++ )
1409        {
1410            if( is_m1_0_1 )
1411            {
1412                const int *src0 = src[-is_m1_0_1], *src2 = src[is_m1_0_1];
1413
1414                for( i = 0; i <= width - 2; i += 2 )
1415                {
1416                    int s0 = src2[i] - src0[i], s1 = src2[i+1] - src0[i+1];
1417                    dst[i] = (short)s0; dst[i+1] = (short)s1;
1418                }
1419            }
1420            else
1421                for( i = 0; i <= width - 4; i += 4 )
1422                {
1423                    int f = ky[0];
1424                    const int* sptr = src[0] + i, *sptr2;
1425                    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
1426                    for( k = 1; k <= ksize2; k++ )
1427                    {
1428                        sptr = src[k] + i; sptr2 = src[-k] + i; f = ky[k];
1429                        s0 += f*(sptr[0] - sptr2[0]); s1 += f*(sptr[1] - sptr2[1]);
1430                        s2 += f*(sptr[2] - sptr2[2]); s3 += f*(sptr[3] - sptr2[3]);
1431                    }
1432
1433                    dst[i] = CV_CAST_16S(s0); dst[i+1] = CV_CAST_16S(s1);
1434                    dst[i+2] = CV_CAST_16S(s2); dst[i+3] = CV_CAST_16S(s3);
1435                }
1436
1437            for( ; i < width; i++ )
1438            {
1439                int s0 = ky[0]*src[0][i];
1440                for( k = 1; k <= ksize2; k++ )
1441                    s0 += ky[k]*(src[k][i] - src[-k][i]);
1442                dst[i] = CV_CAST_16S(s0);
1443            }
1444        }
1445    }
1446}
1447
1448
1449#define ICV_FILTER_COL( flavor, srctype, dsttype, worktype,     \
1450                        cast_macro1, cast_macro2 )              \
1451static void                                                     \
1452icvFilterCol_##flavor( const srctype** src, dsttype* dst,       \
1453                       int dst_step, int count, void* params )  \
1454{                                                               \
1455    const CvSepFilter* state = (const CvSepFilter*)params;      \
1456    const CvMat* _ky = state->get_y_kernel();                   \
1457    const srctype* ky = (const srctype*)_ky->data.ptr;          \
1458    int ksize = _ky->cols + _ky->rows - 1;                      \
1459    int i, k, width = state->get_width();                       \
1460    int cn = CV_MAT_CN(state->get_src_type());                  \
1461                                                                \
1462    width *= cn;                                                \
1463    dst_step /= sizeof(dst[0]);                                 \
1464                                                                \
1465    for( ; count--; dst += dst_step, src++ )                    \
1466    {                                                           \
1467        for( i = 0; i <= width - 4; i += 4 )                    \
1468        {                                                       \
1469            double f = ky[0];                                   \
1470            const srctype* sptr = src[0] + i;                   \
1471            double s0 = f*sptr[0], s1 = f*sptr[1],              \
1472                   s2 = f*sptr[2], s3 = f*sptr[3];              \
1473            worktype t0, t1;                                    \
1474            for( k = 1; k < ksize; k++ )                        \
1475            {                                                   \
1476                sptr = src[k] + i; f = ky[k];                   \
1477                s0 += f*sptr[0]; s1 += f*sptr[1];               \
1478                s2 += f*sptr[2]; s3 += f*sptr[3];               \
1479            }                                                   \
1480                                                                \
1481            t0 = cast_macro1(s0); t1 = cast_macro1(s1);         \
1482            dst[i]=cast_macro2(t0); dst[i+1]=cast_macro2(t1);   \
1483            t0 = cast_macro1(s2); t1 = cast_macro1(s3);         \
1484            dst[i+2]=cast_macro2(t0); dst[i+3]=cast_macro2(t1); \
1485        }                                                       \
1486                                                                \
1487        for( ; i < width; i++ )                                 \
1488        {                                                       \
1489            double s0 = (double)ky[0]*src[0][i];                \
1490            worktype t0;                                        \
1491            for( k = 1; k < ksize; k++ )                        \
1492                s0 += (double)ky[k]*src[k][i];                  \
1493            t0 = cast_macro1(s0);                               \
1494            dst[i] = cast_macro2(t0);                           \
1495        }                                                       \
1496    }                                                           \
1497}
1498
1499
1500ICV_FILTER_COL( 32f8u, float, uchar, int, cvRound, CV_CAST_8U )
1501ICV_FILTER_COL( 32f16s, float, short, int, cvRound, CV_CAST_16S )
1502ICV_FILTER_COL( 32f16u, float, ushort, int, cvRound, CV_CAST_16U )
1503
1504#define ICV_FILTER_COL_SYMM( flavor, srctype, dsttype, worktype,    \
1505                             cast_macro1, cast_macro2 )             \
1506static void                                                         \
1507icvFilterColSymm_##flavor( const srctype** src, dsttype* dst,       \
1508                           int dst_step, int count, void* params )  \
1509{                                                                   \
1510    const CvSepFilter* state = (const CvSepFilter*)params;          \
1511    const CvMat* _ky = state->get_y_kernel();                       \
1512    const srctype* ky = (const srctype*)_ky->data.ptr;              \
1513    int ksize = _ky->cols + _ky->rows - 1, ksize2 = ksize/2;        \
1514    int i, k, width = state->get_width();                           \
1515    int cn = CV_MAT_CN(state->get_src_type());                      \
1516    int is_symm = state->get_y_kernel_flags() & CvSepFilter::SYMMETRICAL;\
1517                                                                    \
1518    width *= cn;                                                    \
1519    src += ksize2;                                                  \
1520    ky += ksize2;                                                   \
1521    dst_step /= sizeof(dst[0]);                                     \
1522                                                                    \
1523    if( is_symm )                                                   \
1524    {                                                               \
1525        for( ; count--; dst += dst_step, src++ )                    \
1526        {                                                           \
1527            for( i = 0; i <= width - 4; i += 4 )                    \
1528            {                                                       \
1529                double f = ky[0];                                   \
1530                const srctype* sptr = src[0] + i, *sptr2;           \
1531                double s0 = f*sptr[0], s1 = f*sptr[1],              \
1532                       s2 = f*sptr[2], s3 = f*sptr[3];              \
1533                worktype t0, t1;                                    \
1534                for( k = 1; k <= ksize2; k++ )                      \
1535                {                                                   \
1536                    sptr = src[k] + i;                              \
1537                    sptr2 = src[-k] + i;                            \
1538                    f = ky[k];                                      \
1539                    s0 += f*(sptr[0] + sptr2[0]);                   \
1540                    s1 += f*(sptr[1] + sptr2[1]);                   \
1541                    s2 += f*(sptr[2] + sptr2[2]);                   \
1542                    s3 += f*(sptr[3] + sptr2[3]);                   \
1543                }                                                   \
1544                                                                    \
1545                t0 = cast_macro1(s0); t1 = cast_macro1(s1);         \
1546                dst[i]=cast_macro2(t0); dst[i+1]=cast_macro2(t1);   \
1547                t0 = cast_macro1(s2); t1 = cast_macro1(s3);         \
1548                dst[i+2]=cast_macro2(t0); dst[i+3]=cast_macro2(t1); \
1549            }                                                       \
1550                                                                    \
1551            for( ; i < width; i++ )                                 \
1552            {                                                       \
1553                double s0 = (double)ky[0]*src[0][i];                \
1554                worktype t0;                                        \
1555                for( k = 1; k <= ksize2; k++ )                      \
1556                    s0 += (double)ky[k]*(src[k][i] + src[-k][i]);   \
1557                t0 = cast_macro1(s0);                               \
1558                dst[i] = cast_macro2(t0);                           \
1559            }                                                       \
1560        }                                                           \
1561    }                                                               \
1562    else                                                            \
1563    {                                                               \
1564        for( ; count--; dst += dst_step, src++ )                    \
1565        {                                                           \
1566            for( i = 0; i <= width - 4; i += 4 )                    \
1567            {                                                       \
1568                double f = ky[0];                                   \
1569                const srctype* sptr = src[0] + i, *sptr2;           \
1570                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;              \
1571                worktype t0, t1;                                    \
1572                for( k = 1; k <= ksize2; k++ )                      \
1573                {                                                   \
1574                    sptr = src[k] + i;                              \
1575                    sptr2 = src[-k] + i;                            \
1576                    f = ky[k];                                      \
1577                    s0 += f*(sptr[0] - sptr2[0]);                   \
1578                    s1 += f*(sptr[1] - sptr2[1]);                   \
1579                    s2 += f*(sptr[2] - sptr2[2]);                   \
1580                    s3 += f*(sptr[3] - sptr2[3]);                   \
1581                }                                                   \
1582                                                                    \
1583                t0 = cast_macro1(s0); t1 = cast_macro1(s1);         \
1584                dst[i]=cast_macro2(t0); dst[i+1]=cast_macro2(t1);   \
1585                t0 = cast_macro1(s2); t1 = cast_macro1(s3);         \
1586                dst[i+2]=cast_macro2(t0); dst[i+3]=cast_macro2(t1); \
1587            }                                                       \
1588                                                                    \
1589            for( ; i < width; i++ )                                 \
1590            {                                                       \
1591                double s0 = (double)ky[0]*src[0][i];                \
1592                worktype t0;                                        \
1593                for( k = 1; k <= ksize2; k++ )                      \
1594                    s0 += (double)ky[k]*(src[k][i] - src[-k][i]);   \
1595                t0 = cast_macro1(s0);                               \
1596                dst[i] = cast_macro2(t0);                           \
1597            }                                                       \
1598        }                                                           \
1599    }                                                               \
1600}
1601
1602
1603ICV_FILTER_COL_SYMM( 32f8u, float, uchar, int, cvRound, CV_CAST_8U )
1604ICV_FILTER_COL_SYMM( 32f16s, float, short, int, cvRound, CV_CAST_16S )
1605ICV_FILTER_COL_SYMM( 32f16u, float, ushort, int, cvRound, CV_CAST_16U )
1606
1607
1608static void
1609icvFilterCol_32f( const float** src, float* dst,
1610                  int dst_step, int count, void* params )
1611{
1612    const CvSepFilter* state = (const CvSepFilter*)params;
1613    const CvMat* _ky = state->get_y_kernel();
1614    const float* ky = (const float*)_ky->data.ptr;
1615    int ksize = _ky->cols + _ky->rows - 1;
1616    int i, k, width = state->get_width();
1617    int cn = CV_MAT_CN(state->get_src_type());
1618
1619    width *= cn;
1620    dst_step /= sizeof(dst[0]);
1621
1622    for( ; count--; dst += dst_step, src++ )
1623    {
1624        for( i = 0; i <= width - 4; i += 4 )
1625        {
1626            double f = ky[0];
1627            const float* sptr = src[0] + i;
1628            double s0 = f*sptr[0], s1 = f*sptr[1],
1629                   s2 = f*sptr[2], s3 = f*sptr[3];
1630            for( k = 1; k < ksize; k++ )
1631            {
1632                sptr = src[k] + i; f = ky[k];
1633                s0 += f*sptr[0]; s1 += f*sptr[1];
1634                s2 += f*sptr[2]; s3 += f*sptr[3];
1635            }
1636
1637            dst[i] = (float)s0; dst[i+1] = (float)s1;
1638            dst[i+2] = (float)s2; dst[i+3] = (float)s3;
1639        }
1640
1641        for( ; i < width; i++ )
1642        {
1643            double s0 = (double)ky[0]*src[0][i];
1644            for( k = 1; k < ksize; k++ )
1645                s0 += (double)ky[k]*src[k][i];
1646            dst[i] = (float)s0;
1647        }
1648    }
1649}
1650
1651
1652static void
1653icvFilterColSymm_32f( const float** src, float* dst,
1654                      int dst_step, int count, void* params )
1655{
1656    const CvSepFilter* state = (const CvSepFilter*)params;
1657    const CvMat* _ky = state->get_y_kernel();
1658    const float* ky = (const float*)_ky->data.ptr;
1659    int ksize = _ky->cols + _ky->rows - 1, ksize2 = ksize/2;
1660    int i = 0, k, width = state->get_width();
1661    int cn = CV_MAT_CN(state->get_src_type());
1662    int is_symm = state->get_y_kernel_flags() & CvSepFilter::SYMMETRICAL;
1663    int is_1_2_1 = is_symm && ksize == 3 &&
1664        fabs(ky[1] - 2.) <= FLT_EPSILON && fabs(ky[2] - 1.) <= FLT_EPSILON;
1665    int is_3_10_3 = is_symm && ksize == 3 &&
1666            fabs(ky[1] - 10.) <= FLT_EPSILON && fabs(ky[2] - 3.) <= FLT_EPSILON;
1667    int is_m1_0_1 = !is_symm && ksize == 3 &&
1668            fabs(ky[1]) <= FLT_EPSILON && fabs(ky[2]*ky[2] - 1.) <= FLT_EPSILON ?
1669            (ky[2] > 0 ? 1 : -1) : 0;
1670
1671    width *= cn;
1672    src += ksize2;
1673    ky += ksize2;
1674    dst_step /= sizeof(dst[0]);
1675
1676    if( is_symm )
1677    {
1678        for( ; count--; dst += dst_step, src++ )
1679        {
1680            if( is_1_2_1 )
1681            {
1682                const float *src0 = src[-1], *src1 = src[0], *src2 = src[1];
1683
1684                for( i = 0; i <= width - 4; i += 4 )
1685                {
1686                    float s0 = src0[i] + src1[i]*2 + src2[i],
1687                          s1 = src0[i+1] + src1[i+1]*2 + src2[i+1],
1688                          s2 = src0[i+2] + src1[i+2]*2 + src2[i+2],
1689                          s3 = src0[i+3] + src1[i+3]*2 + src2[i+3];
1690
1691                    dst[i] = s0; dst[i+1] = s1;
1692                    dst[i+2] = s2; dst[i+3] = s3;
1693                }
1694            }
1695            else if( is_3_10_3 )
1696            {
1697                const float *src0 = src[-1], *src1 = src[0], *src2 = src[1];
1698
1699                for( i = 0; i <= width - 4; i += 4 )
1700                {
1701                    float s0 = src1[i]*10 + (src0[i] + src2[i])*3,
1702                          s1 = src1[i+1]*10 + (src0[i+1] + src2[i+1])*3,
1703                          s2 = src1[i+2]*10 + (src0[i+2] + src2[i+2])*3,
1704                          s3 = src1[i+3]*10 + (src0[i+3] + src2[i+3])*3;
1705
1706                    dst[i] = s0; dst[i+1] = s1;
1707                    dst[i+2] = s2; dst[i+3] = s3;
1708                }
1709            }
1710            else
1711                for( i = 0; i <= width - 4; i += 4 )
1712                {
1713                    double f = ky[0];
1714                    const float* sptr = src[0] + i, *sptr2;
1715                    double s0 = f*sptr[0], s1 = f*sptr[1],
1716                           s2 = f*sptr[2], s3 = f*sptr[3];
1717                    for( k = 1; k <= ksize2; k++ )
1718                    {
1719                        sptr = src[k] + i; sptr2 = src[-k] + i; f = ky[k];
1720                        s0 += f*(sptr[0] + sptr2[0]); s1 += f*(sptr[1] + sptr2[1]);
1721                        s2 += f*(sptr[2] + sptr2[2]); s3 += f*(sptr[3] + sptr2[3]);
1722                    }
1723
1724                    dst[i] = (float)s0; dst[i+1] = (float)s1;
1725                    dst[i+2] = (float)s2; dst[i+3] = (float)s3;
1726                }
1727
1728            for( ; i < width; i++ )
1729            {
1730                double s0 = (double)ky[0]*src[0][i];
1731                for( k = 1; k <= ksize2; k++ )
1732                    s0 += (double)ky[k]*(src[k][i] + src[-k][i]);
1733                dst[i] = (float)s0;
1734            }
1735        }
1736    }
1737    else
1738    {
1739        for( ; count--; dst += dst_step, src++ )
1740        {
1741            if( is_m1_0_1 )
1742            {
1743                const float *src0 = src[-is_m1_0_1], *src2 = src[is_m1_0_1];
1744
1745                for( i = 0; i <= width - 4; i += 4 )
1746                {
1747                    float s0 = src2[i] - src0[i], s1 = src2[i+1] - src0[i+1],
1748                          s2 = src2[i+2] - src0[i+2], s3 = src2[i+3] - src0[i+3];
1749                    dst[i] = s0; dst[i+1] = s1;
1750                    dst[i+2] = s2; dst[i+3] = s3;
1751                }
1752            }
1753            else
1754                for( i = 0; i <= width - 4; i += 4 )
1755                {
1756                    double f = ky[0];
1757                    const float* sptr = src[0] + i, *sptr2;
1758                    double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
1759                    for( k = 1; k <= ksize2; k++ )
1760                    {
1761                        sptr = src[k] + i; sptr2 = src[-k] + i; f = ky[k];
1762                        s0 += f*(sptr[0] - sptr2[0]); s1 += f*(sptr[1] - sptr2[1]);
1763                        s2 += f*(sptr[2] - sptr2[2]); s3 += f*(sptr[3] - sptr2[3]);
1764                    }
1765
1766                    dst[i] = (float)s0; dst[i+1] = (float)s1;
1767                    dst[i+2] = (float)s2; dst[i+3] = (float)s3;
1768                }
1769
1770            for( ; i < width; i++ )
1771            {
1772                double s0 = (double)ky[0]*src[0][i];
1773                for( k = 1; k <= ksize2; k++ )
1774                    s0 += (double)ky[k]*(src[k][i] - src[-k][i]);
1775                dst[i] = (float)s0;
1776            }
1777        }
1778    }
1779}
1780
1781
1782#define SMALL_GAUSSIAN_SIZE  7
1783
1784void CvSepFilter::init_gaussian_kernel( CvMat* kernel, double sigma )
1785{
1786    static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE/2+1] =
1787    {
1788        {1.f},
1789        {0.5f, 0.25f},
1790        {0.375f, 0.25f, 0.0625f},
1791        {0.28125f, 0.21875f, 0.109375f, 0.03125f}
1792    };
1793
1794    CV_FUNCNAME( "CvSepFilter::init_gaussian_kernel" );
1795
1796    __BEGIN__;
1797
1798    int type, i, n, step;
1799    const float* fixed_kernel = 0;
1800    double sigmaX, scale2X, sum;
1801    float* cf;
1802    double* cd;
1803
1804    if( !CV_IS_MAT(kernel) )
1805        CV_ERROR( CV_StsBadArg, "kernel is not a valid matrix" );
1806
1807    type = CV_MAT_TYPE(kernel->type);
1808
1809    if( (kernel->cols != 1 && kernel->rows != 1) ||
1810        (kernel->cols + kernel->rows - 1) % 2 == 0 ||
1811        (type != CV_32FC1 && type != CV_64FC1) )
1812        CV_ERROR( CV_StsBadSize, "kernel should be 1D floating-point vector of odd (2*k+1) size" );
1813
1814    n = kernel->cols + kernel->rows - 1;
1815
1816    if( n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 )
1817        fixed_kernel = small_gaussian_tab[n>>1];
1818
1819    sigmaX = sigma > 0 ? sigma : (n/2 - 1)*0.3 + 0.8;
1820    scale2X = -0.5/(sigmaX*sigmaX);
1821    step = kernel->rows == 1 ? 1 : kernel->step/CV_ELEM_SIZE1(type);
1822    cf = kernel->data.fl;
1823    cd = kernel->data.db;
1824
1825    sum = fixed_kernel ? -fixed_kernel[0] : -1.;
1826
1827    for( i = 0; i <= n/2; i++ )
1828    {
1829        double t = fixed_kernel ? (double)fixed_kernel[i] : exp(scale2X*i*i);
1830        if( type == CV_32FC1 )
1831        {
1832            cf[(n/2+i)*step] = (float)t;
1833            sum += cf[(n/2+i)*step]*2;
1834        }
1835        else
1836        {
1837            cd[(n/2+i)*step] = t;
1838            sum += cd[(n/2+i)*step]*2;
1839        }
1840    }
1841
1842    sum = 1./sum;
1843    for( i = 0; i <= n/2; i++ )
1844    {
1845        if( type == CV_32FC1 )
1846            cf[(n/2+i)*step] = cf[(n/2-i)*step] = (float)(cf[(n/2+i)*step]*sum);
1847        else
1848            cd[(n/2+i)*step] = cd[(n/2-i)*step] = cd[(n/2+i)*step]*sum;
1849    }
1850
1851    __END__;
1852}
1853
1854
1855void CvSepFilter::init_sobel_kernel( CvMat* _kx, CvMat* _ky, int dx, int dy, int flags )
1856{
1857    CV_FUNCNAME( "CvSepFilter::init_sobel_kernel" );
1858
1859    __BEGIN__;
1860
1861    int i, j, k, msz;
1862    int* kerI;
1863    bool normalize = (flags & NORMALIZE_KERNEL) != 0;
1864    bool flip = (flags & FLIP_KERNEL) != 0;
1865
1866    if( !CV_IS_MAT(_kx) || !CV_IS_MAT(_ky) )
1867        CV_ERROR( CV_StsBadArg, "One of the kernel matrices is not valid" );
1868
1869    msz = MAX( _kx->cols + _kx->rows, _ky->cols + _ky->rows );
1870    if( msz > 32 )
1871        CV_ERROR( CV_StsOutOfRange, "Too large kernel size" );
1872
1873    kerI = (int*)cvStackAlloc( msz*sizeof(kerI[0]) );
1874
1875    if( dx < 0 || dy < 0 || dx+dy <= 0 )
1876        CV_ERROR( CV_StsOutOfRange,
1877            "Both derivative orders (dx and dy) must be non-negative "
1878            "and at least one of them must be positive." );
1879
1880    for( k = 0; k < 2; k++ )
1881    {
1882        CvMat* kernel = k == 0 ? _kx : _ky;
1883        int order = k == 0 ? dx : dy;
1884        int n = kernel->cols + kernel->rows - 1, step;
1885        int type = CV_MAT_TYPE(kernel->type);
1886        double scale = !normalize ? 1. : 1./(1 << (n-order-1));
1887        int iscale = 1;
1888
1889        if( (kernel->cols != 1 && kernel->rows != 1) ||
1890            (kernel->cols + kernel->rows - 1) % 2 == 0 ||
1891            (type != CV_32FC1 && type != CV_64FC1 && type != CV_32SC1) )
1892            CV_ERROR( CV_StsOutOfRange,
1893            "Both kernels must be 1D floating-point or integer vectors of odd (2*k+1) size." );
1894
1895        if( normalize && n > 1 && type == CV_32SC1 )
1896            CV_ERROR( CV_StsBadArg, "Integer kernel can not be normalized" );
1897
1898        if( n <= order )
1899            CV_ERROR( CV_StsOutOfRange,
1900            "Derivative order must be smaller than the corresponding kernel size" );
1901
1902        if( n == 1 )
1903            kerI[0] = 1;
1904        else if( n == 3 )
1905        {
1906            if( order == 0 )
1907                kerI[0] = 1, kerI[1] = 2, kerI[2] = 1;
1908            else if( order == 1 )
1909                kerI[0] = -1, kerI[1] = 0, kerI[2] = 1;
1910            else
1911                kerI[0] = 1, kerI[1] = -2, kerI[2] = 1;
1912        }
1913        else
1914        {
1915            int oldval, newval;
1916            kerI[0] = 1;
1917            for( i = 0; i < n; i++ )
1918                kerI[i+1] = 0;
1919
1920            for( i = 0; i < n - order - 1; i++ )
1921            {
1922                oldval = kerI[0];
1923                for( j = 1; j <= n; j++ )
1924                {
1925                    newval = kerI[j]+kerI[j-1];
1926                    kerI[j-1] = oldval;
1927                    oldval = newval;
1928                }
1929            }
1930
1931            for( i = 0; i < order; i++ )
1932            {
1933                oldval = -kerI[0];
1934                for( j = 1; j <= n; j++ )
1935                {
1936                    newval = kerI[j-1] - kerI[j];
1937                    kerI[j-1] = oldval;
1938                    oldval = newval;
1939                }
1940            }
1941        }
1942
1943        step = kernel->rows == 1 ? 1 : kernel->step/CV_ELEM_SIZE1(type);
1944        if( flip && (order & 1) && k )
1945            iscale = -iscale, scale = -scale;
1946
1947        for( i = 0; i < n; i++ )
1948        {
1949            if( type == CV_32SC1 )
1950                kernel->data.i[i*step] = kerI[i]*iscale;
1951            else if( type == CV_32FC1 )
1952                kernel->data.fl[i*step] = (float)(kerI[i]*scale);
1953            else
1954                kernel->data.db[i*step] = kerI[i]*scale;
1955        }
1956    }
1957
1958    __END__;
1959}
1960
1961
1962void CvSepFilter::init_scharr_kernel( CvMat* _kx, CvMat* _ky, int dx, int dy, int flags )
1963{
1964    CV_FUNCNAME( "CvSepFilter::init_scharr_kernel" );
1965
1966    __BEGIN__;
1967
1968    int i, k;
1969    int kerI[3];
1970    bool normalize = (flags & NORMALIZE_KERNEL) != 0;
1971    bool flip = (flags & FLIP_KERNEL) != 0;
1972
1973    if( !CV_IS_MAT(_kx) || !CV_IS_MAT(_ky) )
1974        CV_ERROR( CV_StsBadArg, "One of the kernel matrices is not valid" );
1975
1976    if( ((dx|dy)&~1) || dx+dy != 1 )
1977        CV_ERROR( CV_StsOutOfRange,
1978            "Scharr kernel can only be used for 1st order derivatives" );
1979
1980    for( k = 0; k < 2; k++ )
1981    {
1982        CvMat* kernel = k == 0 ? _kx : _ky;
1983        int order = k == 0 ? dx : dy;
1984        int n = kernel->cols + kernel->rows - 1, step;
1985        int type = CV_MAT_TYPE(kernel->type);
1986        double scale = !normalize ? 1. : order == 0 ? 1./16 : 1./2;
1987        int iscale = 1;
1988
1989        if( (kernel->cols != 1 && kernel->rows != 1) ||
1990            kernel->cols + kernel->rows - 1 != 3 ||
1991            (type != CV_32FC1 && type != CV_64FC1 && type != CV_32SC1) )
1992            CV_ERROR( CV_StsOutOfRange,
1993            "Both kernels must be 1D floating-point or integer vectors containing 3 elements each." );
1994
1995        if( normalize && type == CV_32SC1 )
1996            CV_ERROR( CV_StsBadArg, "Integer kernel can not be normalized" );
1997
1998        if( order == 0 )
1999            kerI[0] = 3, kerI[1] = 10, kerI[2] = 3;
2000        else
2001            kerI[0] = -1, kerI[1] = 0, kerI[2] = 1;
2002
2003        step = kernel->rows == 1 ? 1 : kernel->step/CV_ELEM_SIZE1(type);
2004        if( flip && (order & 1) && k )
2005            iscale = -iscale, scale = -scale;
2006
2007        for( i = 0; i < n; i++ )
2008        {
2009            if( type == CV_32SC1 )
2010                kernel->data.i[i*step] = kerI[i]*iscale;
2011            else if( type == CV_32FC1 )
2012                kernel->data.fl[i*step] = (float)(kerI[i]*scale);
2013            else
2014                kernel->data.db[i*step] = kerI[i]*scale;
2015        }
2016    }
2017
2018    __END__;
2019}
2020
2021
2022void CvSepFilter::init_deriv( int _max_width, int _src_type, int _dst_type,
2023                              int dx, int dy, int aperture_size, int flags )
2024{
2025    CV_FUNCNAME( "CvSepFilter::init_deriv" );
2026
2027    __BEGIN__;
2028
2029    int kx_size = aperture_size == CV_SCHARR ? 3 : aperture_size, ky_size = kx_size;
2030    float kx_data[CV_MAX_SOBEL_KSIZE], ky_data[CV_MAX_SOBEL_KSIZE];
2031    CvMat _kx, _ky;
2032
2033    if( kx_size <= 0 || ky_size > CV_MAX_SOBEL_KSIZE )
2034        CV_ERROR( CV_StsOutOfRange, "Incorrect aperture_size" );
2035
2036    if( kx_size == 1 && dx )
2037        kx_size = 3;
2038    if( ky_size == 1 && dy )
2039        ky_size = 3;
2040
2041    _kx = cvMat( 1, kx_size, CV_32FC1, kx_data );
2042    _ky = cvMat( 1, ky_size, CV_32FC1, ky_data );
2043
2044    if( aperture_size == CV_SCHARR )
2045    {
2046        CV_CALL( init_scharr_kernel( &_kx, &_ky, dx, dy, flags ));
2047    }
2048    else
2049    {
2050        CV_CALL( init_sobel_kernel( &_kx, &_ky, dx, dy, flags ));
2051    }
2052
2053    CV_CALL( init( _max_width, _src_type, _dst_type, &_kx, &_ky ));
2054
2055    __END__;
2056}
2057
2058
2059void CvSepFilter::init_gaussian( int _max_width, int _src_type, int _dst_type,
2060                                 int gaussian_size, double sigma )
2061{
2062    float* kdata = 0;
2063
2064    CV_FUNCNAME( "CvSepFilter::init_gaussian" );
2065
2066    __BEGIN__;
2067
2068    CvMat _kernel;
2069
2070    if( gaussian_size <= 0 || gaussian_size > 1024 )
2071        CV_ERROR( CV_StsBadSize, "Incorrect size of gaussian kernel" );
2072
2073    kdata = (float*)cvStackAlloc(gaussian_size*sizeof(kdata[0]));
2074    _kernel = cvMat( 1, gaussian_size, CV_32F, kdata );
2075
2076    CV_CALL( init_gaussian_kernel( &_kernel, sigma ));
2077    CV_CALL( init( _max_width, _src_type, _dst_type, &_kernel, &_kernel ));
2078
2079    __END__;
2080}
2081
2082
2083/****************************************************************************************\
2084                              Non-separable Linear Filter
2085\****************************************************************************************/
2086
2087static void icvLinearFilter_8u( const uchar** src, uchar* dst, int dst_step,
2088                                int count, void* params );
2089static void icvLinearFilter_16s( const short** src, short* dst, int dst_step,
2090                                 int count, void* params );
2091static void icvLinearFilter_16u( const ushort** src, ushort* dst, int dst_step,
2092                                 int count, void* params );
2093static void icvLinearFilter_32f( const float** src, float* dst, int dst_step,
2094                                 int count, void* params );
2095
2096CvLinearFilter::CvLinearFilter()
2097{
2098    kernel = 0;
2099    k_sparse = 0;
2100}
2101
2102CvLinearFilter::CvLinearFilter( int _max_width, int _src_type, int _dst_type,
2103                                const CvMat* _kernel, CvPoint _anchor,
2104                                int _border_mode, CvScalar _border_value )
2105{
2106    kernel = 0;
2107    k_sparse = 0;
2108    init( _max_width, _src_type, _dst_type, _kernel,
2109          _anchor, _border_mode, _border_value );
2110}
2111
2112
2113void CvLinearFilter::clear()
2114{
2115    cvReleaseMat( &kernel );
2116    cvFree( &k_sparse );
2117    CvBaseImageFilter::clear();
2118}
2119
2120
2121CvLinearFilter::~CvLinearFilter()
2122{
2123    clear();
2124}
2125
2126
2127void CvLinearFilter::init( int _max_width, int _src_type, int _dst_type,
2128                           const CvMat* _kernel, CvPoint _anchor,
2129                           int _border_mode, CvScalar _border_value )
2130{
2131    CV_FUNCNAME( "CvLinearFilter::init" );
2132
2133    __BEGIN__;
2134
2135    int depth = CV_MAT_DEPTH(_src_type);
2136    int cn = CV_MAT_CN(_src_type);
2137    CvPoint* nz_loc;
2138    float* coeffs;
2139    int i, j, k = 0;
2140
2141    if( !CV_IS_MAT(_kernel) )
2142        CV_ERROR( CV_StsBadArg, "kernel is not valid matrix" );
2143
2144    _src_type = CV_MAT_TYPE(_src_type);
2145    _dst_type = CV_MAT_TYPE(_dst_type);
2146
2147    if( _src_type != _dst_type )
2148        CV_ERROR( CV_StsUnmatchedFormats,
2149        "The source and destination image types must be the same" );
2150
2151    CV_CALL( CvBaseImageFilter::init( _max_width, _src_type, _dst_type,
2152        false, cvGetMatSize(_kernel), _anchor, _border_mode, _border_value ));
2153
2154    if( !(kernel && k_sparse && ksize.width == kernel->cols && ksize.height == kernel->rows ))
2155    {
2156        cvReleaseMat( &kernel );
2157        cvFree( &k_sparse );
2158        CV_CALL( kernel = cvCreateMat( ksize.height, ksize.width, CV_32FC1 ));
2159        CV_CALL( k_sparse = (uchar*)cvAlloc(
2160            ksize.width*ksize.height*(2*sizeof(int) + sizeof(uchar*) + sizeof(float))));
2161    }
2162
2163    CV_CALL( cvConvert( _kernel, kernel ));
2164
2165    nz_loc = (CvPoint*)k_sparse;
2166    for( i = 0; i < ksize.height; i++ )
2167    {
2168        for( j = 0; j < ksize.width; j++ )
2169            if( fabs(((float*)(kernel->data.ptr + i*kernel->step))[j])>FLT_EPSILON )
2170                nz_loc[k++] = cvPoint(j,i);
2171    }
2172    if( k == 0 )
2173        nz_loc[k++] = anchor;
2174    k_sparse_count = k;
2175    coeffs = (float*)((uchar**)(nz_loc + k_sparse_count) + k_sparse_count);
2176
2177    for( k = 0; k < k_sparse_count; k++ )
2178    {
2179        coeffs[k] = CV_MAT_ELEM( *kernel, float, nz_loc[k].y, nz_loc[k].x );
2180        nz_loc[k].x *= cn;
2181    }
2182
2183    x_func = 0;
2184    if( depth == CV_8U )
2185        y_func = (CvColumnFilterFunc)icvLinearFilter_8u;
2186    else if( depth == CV_16S )
2187        y_func = (CvColumnFilterFunc)icvLinearFilter_16s;
2188    else if( depth == CV_16U )
2189        y_func = (CvColumnFilterFunc)icvLinearFilter_16u;
2190    else if( depth == CV_32F )
2191        y_func = (CvColumnFilterFunc)icvLinearFilter_32f;
2192    else
2193        CV_ERROR( CV_StsUnsupportedFormat, "Unsupported image type" );
2194
2195    __END__;
2196}
2197
2198
2199void CvLinearFilter::init( int _max_width, int _src_type, int _dst_type,
2200                           bool _is_separable, CvSize _ksize,
2201                           CvPoint _anchor, int _border_mode,
2202                           CvScalar _border_value )
2203{
2204    CvBaseImageFilter::init( _max_width, _src_type, _dst_type, _is_separable,
2205                             _ksize, _anchor, _border_mode, _border_value );
2206}
2207
2208
2209#define ICV_FILTER( flavor, arrtype, worktype, load_macro,          \
2210                    cast_macro1, cast_macro2 )                      \
2211static void                                                         \
2212icvLinearFilter_##flavor( const arrtype** src, arrtype* dst,        \
2213                    int dst_step, int count, void* params )         \
2214{                                                                   \
2215    CvLinearFilter* state = (CvLinearFilter*)params;                \
2216    int width = state->get_width();                                 \
2217    int cn = CV_MAT_CN(state->get_src_type());                      \
2218    int i, k;                                                       \
2219    CvPoint* k_sparse = (CvPoint*)state->get_kernel_sparse_buf();   \
2220    int k_count = state->get_kernel_sparse_count();                 \
2221    const arrtype** k_ptr = (const arrtype**)(k_sparse + k_count);  \
2222    const arrtype** k_end = k_ptr + k_count;                        \
2223    const float* k_coeffs = (const float*)(k_ptr + k_count);        \
2224                                                                    \
2225    width *= cn;                                                    \
2226    dst_step /= sizeof(dst[0]);                                     \
2227                                                                    \
2228    for( ; count > 0; count--, dst += dst_step, src++ )             \
2229    {                                                               \
2230        for( k = 0; k < k_count; k++ )                              \
2231            k_ptr[k] = src[k_sparse[k].y] + k_sparse[k].x;          \
2232                                                                    \
2233        for( i = 0; i <= width - 4; i += 4 )                        \
2234        {                                                           \
2235            const arrtype** kp = k_ptr;                             \
2236            const float* kc = k_coeffs;                             \
2237            double s0 = 0, s1 = 0, s2 = 0, s3 = 0;                  \
2238            worktype t0, t1;                                        \
2239                                                                    \
2240            while( kp != k_end )                                    \
2241            {                                                       \
2242                const arrtype* sptr = (*kp++) + i;                  \
2243                float f = *kc++;                                    \
2244                s0 += f*load_macro(sptr[0]);                        \
2245                s1 += f*load_macro(sptr[1]);                        \
2246                s2 += f*load_macro(sptr[2]);                        \
2247                s3 += f*load_macro(sptr[3]);                        \
2248            }                                                       \
2249                                                                    \
2250            t0 = cast_macro1(s0); t1 = cast_macro1(s1);             \
2251            dst[i] = cast_macro2(t0);                               \
2252            dst[i+1] = cast_macro2(t1);                             \
2253            t0 = cast_macro1(s2); t1 = cast_macro1(s3);             \
2254            dst[i+2] = cast_macro2(t0);                             \
2255            dst[i+3] = cast_macro2(t1);                             \
2256        }                                                           \
2257                                                                    \
2258        for( ; i < width; i++ )                                     \
2259        {                                                           \
2260            const arrtype** kp = k_ptr;                             \
2261            const float* kc = k_coeffs;                             \
2262            double s0 = 0;                                          \
2263            worktype t0;                                            \
2264                                                                    \
2265            while( kp != k_end )                                    \
2266            {                                                       \
2267                const arrtype* sptr = *kp++;                        \
2268                float f = *kc++;                                    \
2269                s0 += f*load_macro(sptr[i]);                        \
2270            }                                                       \
2271                                                                    \
2272            t0 = cast_macro1(s0);                                   \
2273            dst[i] = cast_macro2(t0);                               \
2274        }                                                           \
2275    }                                                               \
2276}
2277
2278
2279ICV_FILTER( 8u, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U )
2280ICV_FILTER( 16u, ushort, int, CV_NOP, cvRound, CV_CAST_16U )
2281ICV_FILTER( 16s, short, int, CV_NOP, cvRound, CV_CAST_16S )
2282ICV_FILTER( 32f, float, float, CV_NOP, CV_CAST_32F, CV_NOP )
2283
2284
2285/////////////////////// common functions for working with IPP filters ////////////////////
2286
2287CvMat* icvIPPFilterInit( const CvMat* src, int stripe_size, CvSize ksize )
2288{
2289    CvSize temp_size;
2290    int pix_size = CV_ELEM_SIZE(src->type);
2291    temp_size.width = cvAlign(src->cols + ksize.width - 1,8/CV_ELEM_SIZE(src->type & CV_MAT_DEPTH_MASK));
2292    //temp_size.width = src->cols + ksize.width - 1;
2293    temp_size.height = (stripe_size*2 + temp_size.width*pix_size) / (temp_size.width*pix_size*2);
2294    temp_size.height = MAX( temp_size.height, ksize.height );
2295    temp_size.height = MIN( temp_size.height, src->rows + ksize.height - 1 );
2296
2297    return cvCreateMat( temp_size.height, temp_size.width, src->type );
2298}
2299
2300
2301int icvIPPFilterNextStripe( const CvMat* src, CvMat* temp, int y,
2302                            CvSize ksize, CvPoint anchor )
2303{
2304    int pix_size = CV_ELEM_SIZE(src->type);
2305    int src_step = src->step ? src->step : CV_STUB_STEP;
2306    int temp_step = temp->step ? temp->step : CV_STUB_STEP;
2307    int i, dy, src_y1 = 0, src_y2;
2308    int temp_rows;
2309    uchar* temp_ptr = temp->data.ptr;
2310    CvSize stripe_size, temp_size;
2311
2312    dy = MIN( temp->rows - ksize.height + 1, src->rows - y );
2313    if( y > 0 )
2314    {
2315        int temp_ready = ksize.height - 1;
2316
2317        for( i = 0; i < temp_ready; i++ )
2318            memcpy( temp_ptr + temp_step*i, temp_ptr +
2319                    temp_step*(temp->rows - temp_ready + i), temp_step );
2320
2321        temp_ptr += temp_ready*temp_step;
2322        temp_rows = dy;
2323        src_y1 = y + temp_ready - anchor.y;
2324        src_y2 = src_y1 + dy;
2325        if( src_y1 >= src->rows )
2326        {
2327            src_y1 = src->rows - 1;
2328            src_y2 = src->rows;
2329        }
2330    }
2331    else
2332    {
2333        temp_rows = dy + ksize.height - 1;
2334        src_y2 = temp_rows - anchor.y;
2335    }
2336
2337    src_y2 = MIN( src_y2, src->rows );
2338
2339    stripe_size = cvSize(src->cols, src_y2 - src_y1);
2340    temp_size = cvSize(temp->cols, temp_rows);
2341    icvCopyReplicateBorder_8u( src->data.ptr + src_y1*src_step, src_step,
2342                      stripe_size, temp_ptr, temp_step, temp_size,
2343                      (y == 0 ? anchor.y : 0), anchor.x, pix_size );
2344    return dy;
2345}
2346
2347
2348/////////////////////////////// IPP separable filter functions ///////////////////////////
2349
2350icvFilterRow_8u_C1R_t icvFilterRow_8u_C1R_p = 0;
2351icvFilterRow_8u_C3R_t icvFilterRow_8u_C3R_p = 0;
2352icvFilterRow_8u_C4R_t icvFilterRow_8u_C4R_p = 0;
2353icvFilterRow_16s_C1R_t icvFilterRow_16s_C1R_p = 0;
2354icvFilterRow_16s_C3R_t icvFilterRow_16s_C3R_p = 0;
2355icvFilterRow_16s_C4R_t icvFilterRow_16s_C4R_p = 0;
2356icvFilterRow_32f_C1R_t icvFilterRow_32f_C1R_p = 0;
2357icvFilterRow_32f_C3R_t icvFilterRow_32f_C3R_p = 0;
2358icvFilterRow_32f_C4R_t icvFilterRow_32f_C4R_p = 0;
2359
2360icvFilterColumn_8u_C1R_t icvFilterColumn_8u_C1R_p = 0;
2361icvFilterColumn_8u_C3R_t icvFilterColumn_8u_C3R_p = 0;
2362icvFilterColumn_8u_C4R_t icvFilterColumn_8u_C4R_p = 0;
2363icvFilterColumn_16s_C1R_t icvFilterColumn_16s_C1R_p = 0;
2364icvFilterColumn_16s_C3R_t icvFilterColumn_16s_C3R_p = 0;
2365icvFilterColumn_16s_C4R_t icvFilterColumn_16s_C4R_p = 0;
2366icvFilterColumn_32f_C1R_t icvFilterColumn_32f_C1R_p = 0;
2367icvFilterColumn_32f_C3R_t icvFilterColumn_32f_C3R_p = 0;
2368icvFilterColumn_32f_C4R_t icvFilterColumn_32f_C4R_p = 0;
2369
2370//////////////////////////////////////////////////////////////////////////////////////////
2371
2372typedef CvStatus (CV_STDCALL * CvIPPSepFilterFunc)
2373    ( const void* src, int srcstep, void* dst, int dststep,
2374      CvSize size, const float* kernel, int ksize, int anchor );
2375
2376int icvIPPSepFilter( const CvMat* src, CvMat* dst, const CvMat* kernelX,
2377                     const CvMat* kernelY, CvPoint anchor )
2378{
2379    int result = 0;
2380
2381    CvMat* top_bottom = 0;
2382    CvMat* vout_hin = 0;
2383    CvMat* dst_buf = 0;
2384
2385    CV_FUNCNAME( "icvIPPSepFilter" );
2386
2387    __BEGIN__;
2388
2389    CvSize ksize;
2390    CvPoint el_anchor;
2391    CvSize size;
2392    int type, depth, pix_size;
2393    int i, x, y, dy = 0, prev_dy = 0, max_dy;
2394    CvMat vout;
2395    CvIPPSepFilterFunc x_func = 0, y_func = 0;
2396    int src_step, top_bottom_step;
2397    float *kx, *ky;
2398    int align, stripe_size;
2399
2400    if( !icvFilterRow_8u_C1R_p )
2401        EXIT;
2402
2403    if( !CV_ARE_TYPES_EQ( src, dst ) || !CV_ARE_SIZES_EQ( src, dst ) ||
2404        !CV_IS_MAT_CONT(kernelX->type & kernelY->type) ||
2405        CV_MAT_TYPE(kernelX->type) != CV_32FC1 ||
2406        CV_MAT_TYPE(kernelY->type) != CV_32FC1 ||
2407        (kernelX->cols != 1 && kernelX->rows != 1) ||
2408        (kernelY->cols != 1 && kernelY->rows != 1) ||
2409        (unsigned)anchor.x >= (unsigned)(kernelX->cols + kernelX->rows - 1) ||
2410        (unsigned)anchor.y >= (unsigned)(kernelY->cols + kernelY->rows - 1) )
2411        CV_ERROR( CV_StsError, "Internal Error: incorrect parameters" );
2412
2413    ksize.width = kernelX->cols + kernelX->rows - 1;
2414    ksize.height = kernelY->cols + kernelY->rows - 1;
2415
2416    /*if( ksize.width <= 5 && ksize.height <= 5 )
2417    {
2418        float* ker = (float*)cvStackAlloc( ksize.width*ksize.height*sizeof(ker[0]));
2419        CvMat kernel = cvMat( ksize.height, ksize.width, CV_32F, ker );
2420        for( y = 0, i = 0; y < ksize.height; y++ )
2421            for( x = 0; x < ksize.width; x++, i++ )
2422                ker[i] = kernelY->data.fl[y]*kernelX->data.fl[x];
2423
2424        CV_CALL( cvFilter2D( src, dst, &kernel, anchor ));
2425        EXIT;
2426    }*/
2427
2428    type = CV_MAT_TYPE(src->type);
2429    depth = CV_MAT_DEPTH(type);
2430    pix_size = CV_ELEM_SIZE(type);
2431
2432    if( type == CV_8UC1 )
2433        x_func = icvFilterRow_8u_C1R_p, y_func = icvFilterColumn_8u_C1R_p;
2434    else if( type == CV_8UC3 )
2435        x_func = icvFilterRow_8u_C3R_p, y_func = icvFilterColumn_8u_C3R_p;
2436    else if( type == CV_8UC4 )
2437        x_func = icvFilterRow_8u_C4R_p, y_func = icvFilterColumn_8u_C4R_p;
2438    else if( type == CV_16SC1 )
2439        x_func = icvFilterRow_16s_C1R_p, y_func = icvFilterColumn_16s_C1R_p;
2440    else if( type == CV_16SC3 )
2441        x_func = icvFilterRow_16s_C3R_p, y_func = icvFilterColumn_16s_C3R_p;
2442    else if( type == CV_16SC4 )
2443        x_func = icvFilterRow_16s_C4R_p, y_func = icvFilterColumn_16s_C4R_p;
2444    else if( type == CV_32FC1 )
2445        x_func = icvFilterRow_32f_C1R_p, y_func = icvFilterColumn_32f_C1R_p;
2446    else if( type == CV_32FC3 )
2447        x_func = icvFilterRow_32f_C3R_p, y_func = icvFilterColumn_32f_C3R_p;
2448    else if( type == CV_32FC4 )
2449        x_func = icvFilterRow_32f_C4R_p, y_func = icvFilterColumn_32f_C4R_p;
2450    else
2451        EXIT;
2452
2453    size = cvGetMatSize(src);
2454    stripe_size = src->data.ptr == dst->data.ptr ? 1 << 15 : 1 << 16;
2455    max_dy = MAX( ksize.height - 1, stripe_size/(size.width + ksize.width - 1));
2456    max_dy = MIN( max_dy, size.height + ksize.height - 1 );
2457
2458    align = 8/CV_ELEM_SIZE(depth);
2459
2460    CV_CALL( top_bottom = cvCreateMat( ksize.height*2, cvAlign(size.width,align), type ));
2461
2462    CV_CALL( vout_hin = cvCreateMat( max_dy + ksize.height,
2463        cvAlign(size.width + ksize.width - 1, align), type ));
2464
2465    if( src->data.ptr == dst->data.ptr && size.height )
2466        CV_CALL( dst_buf = cvCreateMat( max_dy + ksize.height,
2467            cvAlign(size.width, align), type ));
2468
2469    kx = (float*)cvStackAlloc( ksize.width*sizeof(kx[0]) );
2470    ky = (float*)cvStackAlloc( ksize.height*sizeof(ky[0]) );
2471
2472    // mirror the kernels
2473    for( i = 0; i < ksize.width; i++ )
2474        kx[i] = kernelX->data.fl[ksize.width - i - 1];
2475
2476    for( i = 0; i < ksize.height; i++ )
2477        ky[i] = kernelY->data.fl[ksize.height - i - 1];
2478
2479    el_anchor = cvPoint( ksize.width - anchor.x - 1, ksize.height - anchor.y - 1 );
2480
2481    cvGetCols( vout_hin, &vout, anchor.x, anchor.x + size.width );
2482
2483    src_step = src->step ? src->step : CV_STUB_STEP;
2484    top_bottom_step = top_bottom->step ? top_bottom->step : CV_STUB_STEP;
2485    vout.step = vout.step ? vout.step : CV_STUB_STEP;
2486
2487    for( y = 0; y < size.height; y += dy )
2488    {
2489        const CvMat *vin = src, *hout = dst;
2490        int src_y = y, dst_y = y;
2491        dy = MIN( max_dy, size.height - (ksize.height - anchor.y - 1) - y );
2492
2493        if( y < anchor.y || dy < anchor.y )
2494        {
2495            int ay = anchor.y;
2496            CvSize src_stripe_size = size;
2497
2498            if( y < anchor.y )
2499            {
2500                src_y = 0;
2501                dy = MIN( anchor.y, size.height );
2502                src_stripe_size.height = MIN( dy + ksize.height - anchor.y - 1, size.height );
2503            }
2504            else
2505            {
2506                src_y = MAX( y - anchor.y, 0 );
2507                dy = size.height - y;
2508                src_stripe_size.height = MIN( dy + anchor.y, size.height );
2509                ay = MAX( anchor.y - y, 0 );
2510            }
2511
2512            icvCopyReplicateBorder_8u( src->data.ptr + src_y*src_step, src_step,
2513                src_stripe_size, top_bottom->data.ptr, top_bottom_step,
2514                cvSize(size.width, dy + ksize.height - 1), ay, 0, pix_size );
2515            vin = top_bottom;
2516            src_y = anchor.y;
2517        }
2518
2519        // do vertical convolution
2520        IPPI_CALL( y_func( vin->data.ptr + src_y*vin->step, vin->step ? vin->step : CV_STUB_STEP,
2521                           vout.data.ptr, vout.step, cvSize(size.width, dy),
2522                           ky, ksize.height, el_anchor.y ));
2523
2524        // now it's time to copy the previously processed stripe to the input/output image
2525        if( src->data.ptr == dst->data.ptr )
2526        {
2527            for( i = 0; i < prev_dy; i++ )
2528                memcpy( dst->data.ptr + (y - prev_dy + i)*dst->step,
2529                        dst_buf->data.ptr + i*dst_buf->step, size.width*pix_size );
2530            if( y + dy < size.height )
2531            {
2532                hout = dst_buf;
2533                dst_y = 0;
2534            }
2535        }
2536
2537        // create a border for every line by replicating the left-most/right-most elements
2538        for( i = 0; i < dy; i++ )
2539        {
2540            uchar* ptr = vout.data.ptr + i*vout.step;
2541            for( x = -1; x >= -anchor.x*pix_size; x-- )
2542                ptr[x] = ptr[x + pix_size];
2543            for( x = size.width*pix_size; x < (size.width+ksize.width-anchor.x-1)*pix_size; x++ )
2544                ptr[x] = ptr[x - pix_size];
2545        }
2546
2547        // do horizontal convolution
2548        IPPI_CALL( x_func( vout.data.ptr, vout.step, hout->data.ptr + dst_y*hout->step,
2549                           hout->step ? hout->step : CV_STUB_STEP,
2550                           cvSize(size.width, dy), kx, ksize.width, el_anchor.x ));
2551        prev_dy = dy;
2552    }
2553
2554    result = 1;
2555
2556    __END__;
2557
2558    cvReleaseMat( &vout_hin );
2559    cvReleaseMat( &dst_buf );
2560    cvReleaseMat( &top_bottom );
2561
2562    return result;
2563}
2564
2565//////////////////////////////////////////////////////////////////////////////////////////
2566
2567//////////////////////////////// IPP generic filter functions ////////////////////////////
2568
2569icvFilter_8u_C1R_t icvFilter_8u_C1R_p = 0;
2570icvFilter_8u_C3R_t icvFilter_8u_C3R_p = 0;
2571icvFilter_8u_C4R_t icvFilter_8u_C4R_p = 0;
2572icvFilter_16s_C1R_t icvFilter_16s_C1R_p = 0;
2573icvFilter_16s_C3R_t icvFilter_16s_C3R_p = 0;
2574icvFilter_16s_C4R_t icvFilter_16s_C4R_p = 0;
2575icvFilter_32f_C1R_t icvFilter_32f_C1R_p = 0;
2576icvFilter_32f_C3R_t icvFilter_32f_C3R_p = 0;
2577icvFilter_32f_C4R_t icvFilter_32f_C4R_p = 0;
2578
2579
2580typedef CvStatus (CV_STDCALL * CvFilterIPPFunc)
2581( const void* src, int srcstep, void* dst, int dststep, CvSize size,
2582  const float* kernel, CvSize ksize, CvPoint anchor );
2583
2584CV_IMPL void
2585cvFilter2D( const CvArr* _src, CvArr* _dst, const CvMat* kernel, CvPoint anchor )
2586{
2587    const int dft_filter_size = 100;
2588
2589    CvLinearFilter filter;
2590    CvMat* ipp_kernel = 0;
2591
2592    // below that approximate size OpenCV is faster
2593    const int ipp_lower_limit = 20;
2594    CvMat* temp = 0;
2595
2596    CV_FUNCNAME( "cvFilter2D" );
2597
2598    __BEGIN__;
2599
2600    int coi1 = 0, coi2 = 0;
2601    CvMat srcstub, *src = (CvMat*)_src;
2602    CvMat dststub, *dst = (CvMat*)_dst;
2603    int type;
2604
2605    CV_CALL( src = cvGetMat( src, &srcstub, &coi1 ));
2606    CV_CALL( dst = cvGetMat( dst, &dststub, &coi2 ));
2607
2608    if( coi1 != 0 || coi2 != 0 )
2609        CV_ERROR( CV_BadCOI, "" );
2610
2611    type = CV_MAT_TYPE( src->type );
2612
2613    if( !CV_ARE_SIZES_EQ( src, dst ))
2614        CV_ERROR( CV_StsUnmatchedSizes, "" );
2615
2616    if( !CV_ARE_TYPES_EQ( src, dst ))
2617        CV_ERROR( CV_StsUnmatchedFormats, "" );
2618
2619    if( anchor.x == -1 && anchor.y == -1 )
2620        anchor = cvPoint(kernel->cols/2,kernel->rows/2);
2621
2622    if( kernel->cols*kernel->rows >= dft_filter_size &&
2623        kernel->cols <= src->cols && kernel->rows <= src->rows )
2624    {
2625        if( src->data.ptr == dst->data.ptr )
2626        {
2627            temp = cvCloneMat( src );
2628            src = temp;
2629        }
2630        icvCrossCorr( src, kernel, dst, anchor );
2631        EXIT;
2632    }
2633
2634    if( icvFilter_8u_C1R_p && (src->rows >= ipp_lower_limit || src->cols >= ipp_lower_limit) )
2635    {
2636        CvFilterIPPFunc ipp_func =
2637                type == CV_8UC1 ? (CvFilterIPPFunc)icvFilter_8u_C1R_p :
2638                type == CV_8UC3 ? (CvFilterIPPFunc)icvFilter_8u_C3R_p :
2639                type == CV_8UC4 ? (CvFilterIPPFunc)icvFilter_8u_C4R_p :
2640                type == CV_16SC1 ? (CvFilterIPPFunc)icvFilter_16s_C1R_p :
2641                type == CV_16SC3 ? (CvFilterIPPFunc)icvFilter_16s_C3R_p :
2642                type == CV_16SC4 ? (CvFilterIPPFunc)icvFilter_16s_C4R_p :
2643                type == CV_32FC1 ? (CvFilterIPPFunc)icvFilter_32f_C1R_p :
2644                type == CV_32FC3 ? (CvFilterIPPFunc)icvFilter_32f_C3R_p :
2645                type == CV_32FC4 ? (CvFilterIPPFunc)icvFilter_32f_C4R_p : 0;
2646
2647        if( ipp_func )
2648        {
2649            CvSize el_size = { kernel->cols, kernel->rows };
2650            CvPoint el_anchor;
2651            int stripe_size = 1 << 16; // the optimal value may depend on CPU cache,
2652                                       // overhead of current IPP code etc.
2653            const uchar* shifted_ptr;
2654            int i, j, y, dy = 0;
2655            int temp_step, dst_step = dst->step ? dst->step : CV_STUB_STEP;
2656
2657            if( (unsigned)anchor.x >= (unsigned)kernel->cols ||
2658                (unsigned)anchor.y >= (unsigned)kernel->rows )
2659                CV_ERROR( CV_StsOutOfRange, "anchor point is out of kernel" );
2660
2661            el_anchor = cvPoint( el_size.width - anchor.x - 1, el_size.height - anchor.y - 1 );
2662
2663            CV_CALL( ipp_kernel = cvCreateMat( kernel->rows, kernel->cols, CV_32FC1 ));
2664            CV_CALL( cvConvert( kernel, ipp_kernel ));
2665
2666            // mirror the kernel around the center
2667            for( i = 0; i < (el_size.height+1)/2; i++ )
2668            {
2669                float* top_row = ipp_kernel->data.fl + el_size.width*i;
2670                float* bottom_row = ipp_kernel->data.fl + el_size.width*(el_size.height - i - 1);
2671
2672                for( j = 0; j < (el_size.width+1)/2; j++ )
2673                {
2674                    float a = top_row[j], b = top_row[el_size.width - j - 1];
2675                    float c = bottom_row[j], d = bottom_row[el_size.width - j - 1];
2676                    top_row[j] = d;
2677                    top_row[el_size.width - j - 1] = c;
2678                    bottom_row[j] = b;
2679                    bottom_row[el_size.width - j - 1] = a;
2680                }
2681            }
2682
2683            CV_CALL( temp = icvIPPFilterInit( src, stripe_size, el_size ));
2684
2685            shifted_ptr = temp->data.ptr +
2686                anchor.y*temp->step + anchor.x*CV_ELEM_SIZE(type);
2687            temp_step = temp->step ? temp->step : CV_STUB_STEP;
2688
2689            for( y = 0; y < src->rows; y += dy )
2690            {
2691                dy = icvIPPFilterNextStripe( src, temp, y, el_size, anchor );
2692                IPPI_CALL( ipp_func( shifted_ptr, temp_step,
2693                    dst->data.ptr + y*dst_step, dst_step, cvSize(src->cols, dy),
2694                    ipp_kernel->data.fl, el_size, el_anchor ));
2695            }
2696            EXIT;
2697        }
2698    }
2699
2700    CV_CALL( filter.init( src->cols, type, type, kernel, anchor ));
2701    CV_CALL( filter.process( src, dst ));
2702
2703    __END__;
2704
2705    cvReleaseMat( &temp );
2706    cvReleaseMat( &ipp_kernel );
2707}
2708
2709
2710/* End of file. */
2711