1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_inter_pred_chroma_copy_neon.s
22@*
23@* @brief
24@*  contains function definitions for inter prediction  interpolation.
25@*  functions are coded in arm neon assembly and can be compiled using
26@*  rvct
27@*
28@*
29@* @author
30@*  yogeswaran rs
31@*
32@* @par list of functions:
33@*  - ihevc_inter_pred_chroma_copy_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
44@*   chroma interprediction filter for copy
45@*
46@* @par description:
47@*    copies an array of 'ht' rows, each 2 * 'wd' bytes wide, from the location
48@*    pointed to by 'src' to the location pointed to by 'dst'
49@*
50@* @param[in] pu1_src
51@*  uword8 pointer to the source
52@*
53@* @param[out] pu1_dst
54@*  uword8 pointer to the destination
55@*
56@* @param[in] src_strd
57@*  integer source stride
58@*
59@* @param[in] dst_strd
60@*  integer destination stride
61@*
62@* @param[in] pi1_coeff
63@*  word8 pointer to the filter coefficients (never read by the copy routine)
64@*
65@* @param[in] ht
66@*  integer height of the array
67@*
68@* @param[in] wd
69@*  integer width of the array
70@*
71@* @returns
72@*
73@* @remarks
74@*  none
75@*
76@*******************************************************************************
77@*/
78
79@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
80@                                   uword8 *pu1_dst,
81@                                   word32 src_strd,
82@                                   word32 dst_strd,
83@                                   word8 *pi1_coeff,
84@                                   word32 ht,
85@                                   word32 wd)
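86@**************variables vs registers*****************************************
87@               r0 => *pu1_src
88@               r1 => *pu1_dst
89@               r2 =>  src_strd
90@               r3 =>  dst_strd
91@               stack           => *pi1_coeff (never loaded by this routine)
92@               [sp,#ht_offset] =>  ht (loaded into r7 after the register push)
93@               [sp,#wd_offset] =>  wd (loaded into r12 after the push, then doubled)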
94
@ Offsets of the stack-passed arguments, measured from sp AFTER the prologue
@ has pushed {r4-r12, r14} (10 registers = 40 bytes). They must be revisited
@ if the prologue register list ever changes.
.equ    ht_offset,      44                  @ [sp, #44] => ht
.equ    wd_offset,      48                  @ [sp, #48] => wd

.text
.align 4                                    @ NOTE(review): on ARM GNU as this is a power-of-two exponent (16 bytes) - confirm for the toolchain in use

.globl ihevc_inter_pred_chroma_copy_a9q

.type ihevc_inter_pred_chroma_copy_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_inter_pred_chroma_copy_a9q
@
@ Copies 'ht' rows of 2 * 'wd' bytes each from pu1_src to pu1_dst.
@
@ In:    r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@        stack: pi1_coeff (never read), ht, wd
@ Out:   nothing (r0-r3 are modified)
@ Saved: r4-r12, lr are pushed on entry and restored on exit
@ NEON:  q0-q3 (d0-d7) are used as scratch and are not saved
@
@ Register roles inside the routine:
@        r12 = row width in bytes (2 * wd)
@        r7  = number of rows handled four at a time (ht rounded down to 4)
@        r8  = ht % 4 (non-zero selects the two-row tail)
@        r4  = bytes of the current row group still to copy
@        r5  = source pointer walking down the rows of the current column
@        r6  = destination pointer walking down the rows of the current column
@        r11 = row width minus one column step; used to rewind r5/r6 to the
@              first column of the next group of four rows
@
@ Three code paths are selected by the row width in bytes:
@        multiple of 16 -> 16 bytes per step (q registers)
@        multiple of 8  ->  8 bytes per step (d registers)
@        otherwise      ->  4 bytes per step (one 32-bit lane)
@ Each path copies four rows per outer iteration and then, when ht % 4 != 0,
@ runs a tail that copies exactly TWO more rows across the full width.
@
@ Limitation: the tail always copies two rows, so a remainder of 1 writes one
@ row too many and a remainder of 3 leaves the last row uncopied. The routine
@ therefore expects an even ht.
@-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}          @ save working registers and lr (40 bytes)
    ldr         r12,[sp,#wd_offset]         @ r12 = wd
    lsl         r12,r12,#1                  @ r12 = 2 * wd = bytes per row
    ldr         r7,[sp,#ht_offset]          @ r7 = ht
    cmp         r7,#0
    ble         end_loops                   @ nothing to do when ht <= 0
    and         r8,r7,#3                    @ r8 = ht % 4 (rows left for the tail)
    sub         r7,r7,r8                    @ r7 = ht rounded down to a multiple of 4
    tst         r12,#15                     @ row bytes a multiple of 16?
    beq         core_loop_wd_16
    tst         r12,#7                      @ row bytes a multiple of 8?
    beq         core_loop_wd_8

    @ ---------------- 4-byte path ------------------------------------------
    sub         r11,r12,#4                  @ r11 = row bytes - 4 (column rewind amount)
    cmp         r7,#0
    beq         outer_loop_wd_4_ht_2        @ fewer than 4 rows: tail only

outer_loop_wd_4:
    subs        r4,r12,#0                   @ r4 = bytes left in this row group
    ble         end_inner_loop_wd_4

inner_loop_wd_4:                            @ copy a 4-byte x 4-row column
    vld1.32     {d0[0]},[r0]                @ row 0: load 4 bytes
    add         r5,r0,r2                    @ r5 = source, row 1 of this column
    add         r6,r1,r3                    @ r6 = destination, row 1 of this column
    vst1.32     {d0[0]},[r1]                @ row 0: store 4 bytes
    vld1.32     {d0[0]},[r5],r2             @ row 1: load, r5 -> row 2
    add         r0,r0,#4                    @ source advances to the next column
    vst1.32     {d0[0]},[r6],r3             @ row 1: store, r6 -> row 2
    vld1.32     {d0[0]},[r5],r2             @ row 2: load, r5 -> row 3
    subs        r4,r4,#4                    @ 4 bytes of the row done (sets loop flags)
    vst1.32     {d0[0]},[r6],r3             @ row 2: store, r6 -> row 3
    vld1.32     {d0[0]},[r5],r2             @ row 3: load, r5 -> row 4
    add         r1,r1,#4                    @ destination advances to the next column
    vst1.32     {d0[0]},[r6],r3             @ row 3: store, r6 -> row 4
    bgt         inner_loop_wd_4             @ flags from the subs above (NEON ops leave them intact)

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @ four rows finished
    sub         r0,r5,r11                   @ r5 is at row 4 of the last column: rewind to column 0
    sub         r1,r6,r11                   @ same for the destination
    bgt         outer_loop_wd_4
    cmp         r8,#0
    bgt         outer_loop_wd_4_ht_2        @ remainder rows pending

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (pc <- saved lr)


outer_loop_wd_4_ht_2:                       @ 4-byte path tail: two rows
    subs        r4,r12,#0                   @ r4 = bytes left in the row pair
    ble         end_loops

inner_loop_wd_4_ht_2:
    vld1.32     {d0[0]},[r0]                @ row 0: load 4 bytes
    add         r5,r0,r2                    @ r5 = source, row 1
    add         r6,r1,r3                    @ r6 = destination, row 1
    vst1.32     {d0[0]},[r1]                @ row 0: store
    vld1.32     {d0[0]},[r5],r2             @ row 1: load
    add         r0,r0,#4                    @ next column (source)
    vst1.32     {d0[0]},[r6],r3             @ row 1: store
    subs        r4,r4,#4                    @ 4 bytes of the row done
    add         r1,r1,#4                    @ next column (destination)
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

    @ ---------------- 8-byte path ------------------------------------------
core_loop_wd_8:
    sub         r11,r12,#8                  @ r11 = row bytes - 8 (column rewind amount)
    cmp         r7,#0
    beq         outer_loop_wd_8_ht_2        @ fewer than 4 rows: tail only

outer_loop_wd_8:
    subs        r4,r12,#0                   @ r4 = bytes left in this row group
    ble         end_inner_loop_wd_8

inner_loop_wd_8:                            @ copy an 8-byte x 4-row column
    add         r5,r0,r2                    @ r5 = source, row 1 of this column
    vld1.8      {d0},[r0]!                  @ row 0: load 8 bytes, r0 -> next column
    add         r6,r1,r3                    @ r6 = destination, row 1 of this column
    vst1.8      {d0},[r1]!                  @ row 0: store, r1 -> next column
    vld1.8      {d1},[r5],r2                @ row 1: load, r5 -> row 2
    vst1.8      {d1},[r6],r3                @ row 1: store, r6 -> row 2
    subs        r4,r4,#8                    @ 8 bytes of the row done (sets loop flags)
    vld1.8      {d2},[r5],r2                @ row 2
    vst1.8      {d2},[r6],r3
    vld1.8      {d3},[r5],r2                @ row 3, r5 -> row 4
    vst1.8      {d3},[r6],r3
    bgt         inner_loop_wd_8

end_inner_loop_wd_8:
    subs        r7,r7,#4                    @ four rows finished
    sub         r0,r5,r11                   @ rewind to column 0 of the next row group
    sub         r1,r6,r11
    bgt         outer_loop_wd_8
    cmp         r8,#0
    bgt         outer_loop_wd_8_ht_2        @ remainder rows pending
    b           end_loops

outer_loop_wd_8_ht_2:                       @ 8-byte path tail: two rows
    subs        r4,r12,#0                   @ r4 = bytes left in the row pair
    ble         end_loops

inner_loop_wd_8_ht_2:
    add         r5,r0,r2                    @ r5 = source, row 1
    vld1.8      {d0},[r0]!                  @ row 0: load 8 bytes, r0 -> next column
    add         r6,r1,r3                    @ r6 = destination, row 1
    vst1.8      {d0},[r1]!                  @ row 0: store, r1 -> next column
    vld1.8      {d1},[r5],r2                @ row 1: load
    vst1.8      {d1},[r6],r3                @ row 1: store
    subs        r4,r4,#8                    @ walk the whole row, not just the first 8 bytes
    bgt         inner_loop_wd_8_ht_2
    b           end_loops

    @ ---------------- 16-byte path -----------------------------------------
core_loop_wd_16:
    sub         r11,r12,#16                 @ r11 = row bytes - 16 (column rewind amount)
    cmp         r7,#0
    beq         outer_loop_wd_16_ht_2       @ fewer than 4 rows: tail only

outer_loop_wd_16:
    subs        r4,r12,#0                   @ r4 = bytes left in this row group
    ble         end_inner_loop_wd_16

inner_loop_wd_16:                           @ copy a 16-byte x 4-row column
    add         r5,r0,r2                    @ r5 = source, row 1 of this column
    vld1.8      {q0},[r0]!                  @ row 0: load 16 bytes, r0 -> next column
    add         r6,r1,r3                    @ r6 = destination, row 1 of this column
    vst1.8      {q0},[r1]!                  @ row 0: store, r1 -> next column
    vld1.8      {q1},[r5],r2                @ row 1: load, r5 -> row 2
    vst1.8      {q1},[r6],r3                @ row 1: store, r6 -> row 2
    subs        r4,r4,#16                   @ 16 bytes of the row done (sets loop flags)
    vld1.8      {q2},[r5],r2                @ row 2
    vst1.8      {q2},[r6],r3
    vld1.8      {q3},[r5],r2                @ row 3, r5 -> row 4
    vst1.8      {q3},[r6],r3
    bgt         inner_loop_wd_16

end_inner_loop_wd_16:
    subs        r7,r7,#4                    @ four rows finished
    sub         r0,r5,r11                   @ rewind to column 0 of the next row group
    sub         r1,r6,r11
    bgt         outer_loop_wd_16
    cmp         r8,#0
    bgt         outer_loop_wd_16_ht_2       @ remainder rows pending
    b           end_loops

outer_loop_wd_16_ht_2:                      @ 16-byte path tail: two rows
    subs        r4,r12,#0                   @ r4 = bytes left in the row pair
    ble         end_loops

inner_loop_wd_16_ht_2:
    add         r5,r0,r2                    @ r5 = source, row 1
    vld1.8      {q0},[r0]!                  @ row 0: load 16 bytes, r0 -> next column
    add         r6,r1,r3                    @ r6 = destination, row 1
    vst1.8      {q0},[r1]!                  @ row 0: store, r1 -> next column
    vld1.8      {q1},[r5],r2                @ row 1: load
    vst1.8      {q1},[r6],r3                @ row 1: store
    subs        r4,r4,#16                   @ walk the whole row, not just the first 16 bytes
    bgt         inner_loop_wd_16_ht_2

    ldmfd       sp!,{r4-r12,r15}            @ restore registers and return (pc <- saved lr)
269
270
271
272
273
274