1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21//******************************************************************************
22//*
23//* @brief
24//*  This file contains definitions of routines for spatial filter
25//*
26//* @author
27//*  Ittiam
28//*
29//* @par List of Functions:
//*  - ideint_spatial_filter_av8()
31//*
32//* @remarks
33//*  None
34//*
35//*******************************************************************************
36
37
//******************************************************************************
//*
//*  @brief Performs edge-adaptive spatial filtering
//*
//*  @par   Description
//*   Works on an 8-pixel-wide block that is handled as two independent
//*   4-pixel-wide halves (columns 0..3 and columns 4..7).
//*
//*   1. Edge detection: for each of 4 consecutive row pairs (5 source rows)
//*      three sums of absolute differences are accumulated per half:
//*        adiff[0] = sum |row_1[i]     - row_2[i]    |   (vertical, "90")
//*        adiff[1] = sum |row_1[i - 1] - row_2[i + 1]|   ("135")
//*        adiff[2] = sum |row_1[i + 1] - row_2[i - 1]|   ("45")
//*      adiff[0] is weighted by EDGE_BIAS_0 (5), the other two by
//*      EDGE_BIAS_1 (7).
//*
//*   2. Direction selection (per half):
//*        if (adiff[2] <= adiff[1])  shift = (adiff[2] <= adiff[0]) ?  1 : 0
//*        else                       shift = (adiff[1] <= adiff[0]) ? -1 : 0
//*
//*   3. Interpolation: 4 output rows of 8 pixels are written, each pixel
//*      being the rounded average of row_1[i + shift] and row_2[i - shift].
//*
//*   Note that columns -1 .. 8 of the source rows are read, i.e. one pixel
//*   to the left and one to the right of the 8-pixel block.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (x0)
//*
//* @param[in] pu1_out
//*  UWORD8 pointer to the destination (x1)
//*
//* @param[in] src_strd
//*  source stride (w2)
//*
//* @param[in] out_strd
//*  destination stride (w3)
//*
//* @returns
//*     None
//*
//* @remarks
//*  Leaf routine, no stack usage.
//*  Clobbers x0-x11, v0-v5, v16, v18, v20, v30, v31 and the condition flags.
//*  None of these are callee-saved under AAPCS64.
//*
//******************************************************************************

    .text
    .p2align 2
    .global ideint_spatial_filter_av8

ideint_spatial_filter_av8:

    // NOTE(review): the strides are presumably declared as 32-bit ints
    // (WORD32) in the C prototype - confirm against the declaring header.
    // AAPCS64 leaves the upper 32 bits of such arguments unspecified, so
    // they are sign-extended before being used as 64-bit post-index offsets.
    // For a genuinely 64-bit stride that fits in 32 bits this is a no-op.
    sxtw    x2,         w2
    sxtw    x3,         w3

    // Per-column accumulators (8 x 16 bit) for the three edge directions:
    // v16 -> adiff[0], v18 -> adiff[1], v20 -> adiff[2]
    movi    v16.8h,     #0
    movi    v18.8h,     #0
    movi    v20.8h,     #0

    // Backup x0, the block origin is needed again for interpolation
    mov     x10,        x0

    // Load from &pu1_row_1[0] and advance x0 to the next row
    sub     x5,         x0,         #1
    ld1     {v0.8b},    [x0],       x2

    // Load from &pu1_row_1[-1]
    ld1     {v1.8b},    [x5]
    add     x5,         x5,         #2

    // Load from &pu1_row_1[1]
    ld1     {v2.8b},    [x5]

    // Number of row pairs to compare
    mov     x4,         #4

    // EDGE_BIAS_0
    movi    v30.2s,     #5

    // EDGE_BIAS_1
    movi    v31.2s,     #7

    // Invariant at loop entry: v0/v1/v2 hold the upper row of the pair at
    // column offsets 0/-1/+1, x0 points at the lower row, x4 = pairs left.
detect_edge:
    // Load from &pu1_row_2[0] and advance x0 to the next row
    sub     x5,         x0,         #1
    ld1     {v3.8b},    [x0],       x2

    // Load from &pu1_row_2[-1]
    ld1     {v4.8b},    [x5]
    add     x5,         x5,         #2

    // Load from &pu1_row_2[1]
    ld1     {v5.8b},    [x5]

    // Accumulate absolute differences
    // |pu1_row_1[i] - pu1_row_2[i]|
    uabal   v16.8h,     v0.8b,      v3.8b

    // |pu1_row_1[i - 1] - pu1_row_2[i + 1]|
    uabal   v18.8h,     v1.8b,      v5.8b

    // |pu1_row_1[i + 1] - pu1_row_2[i - 1]|
    uabal   v20.8h,     v2.8b,      v4.8b

    // The lower row of this pair is the upper row of the next one
    mov     v0.8b,      v3.8b
    mov     v1.8b,      v4.8b
    mov     v2.8b,      v5.8b

    subs    x4,         x4,         #1
    bgt     detect_edge

    // Calculate sum of absolute differences for each edge direction.
    // addp folds adjacent column pairs, uaddlp folds once more and widens,
    // leaving lane s[0] = total of columns 0..3 and s[1] = total of 4..7.
    addp    v16.8h,     v16.8h,     v16.8h
    addp    v18.8h,     v18.8h,     v18.8h
    addp    v20.8h,     v20.8h,     v20.8h

    uaddlp  v16.2s,     v16.4h
    uaddlp  v18.2s,     v18.4h
    uaddlp  v20.2s,     v20.4h

    // adiff[0] *= EDGE_BIAS_0;
    mul     v16.2s,     v16.2s,     v30.2s

    // adiff[1] *= EDGE_BIAS_1;
    mul     v18.2s,     v18.2s,     v31.2s

    // adiff[2] *= EDGE_BIAS_1;
    mul     v20.2s,     v20.2s,     v31.2s

    // Compute shift for first half of the block (columns 0..3) -> x8
    smov    x5,         v16.s[0]                // adiff[0]
    smov    x6,         v18.s[0]                // adiff[1]
    smov    x7,         v20.s[0]                // adiff[2]

    // x8  = (adiff[2] <= adiff[0]) ?  1 : 0
    cmp     x7,         x5
    cset    x8,         le

    // x11 = (adiff[1] <= adiff[0]) ? -1 : 0
    cmp     x6,         x5
    csetm   x11,        le

    // shift = (adiff[2] <= adiff[1]) ? x8 : x11
    cmp     x7,         x6
    csel    x8,         x8,         x11,    le

    // Compute shift for second half of the block (columns 4..7) -> x9
    smov    x5,         v16.s[1]                // adiff[0]
    smov    x6,         v18.s[1]                // adiff[1]
    smov    x7,         v20.s[1]                // adiff[2]

    // x9  = (adiff[2] <= adiff[0]) ?  1 : 0
    cmp     x7,         x5
    cset    x9,         le

    // x11 = (adiff[1] <= adiff[0]) ? -1 : 0
    cmp     x6,         x5
    csetm   x11,        le

    // shift = (adiff[2] <= adiff[1]) ? x9 : x11
    cmp     x7,         x6
    csel    x9,         x9,         x11,    le

interpolate:
    // First half:  x4 = &pu1_row_1[shift], x5 = &pu1_row_2[-shift]
    add     x4,         x10,        x8
    add     x5,         x10,        x2
    sub     x5,         x5,         x8

    // Second half starts 4 pixels to the right:
    // x6 = &pu1_row_1[4 + shift], x7 = &pu1_row_2[4 - shift]
    add     x10,        x10,        #4
    add     x6,         x10,        x9
    add     x7,         x10,        x2
    sub     x7,         x7,         x9

    // Number of output rows
    mov     x8,         #4

filter_loop:
    // Lane 0 carries the first half, lane 1 the second half
    ld1     {v0.s}[0],  [x4],       x2
    ld1     {v2.s}[0],  [x5],       x2

    ld1     {v0.s}[1],  [x6],       x2
    ld1     {v2.s}[1],  [x7],       x2

    // Rounded average (a + b + 1) >> 1 of the two rows, 8 pixels at once
    urhadd  v4.8b,      v0.8b,      v2.8b
    st1     {v4.2s},    [x1],       x3

    subs    x8,         x8,         #1
    bgt     filter_loop

    ret
226