1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@**
22@*******************************************************************************
23@*
24@* @brief
25@*     Interprediction luma function for copy
26@*
27@* @par Description:
28@*   Copies the array of width 'wd' and height 'ht' from the  location pointed
29@*   by 'src' to the location pointed by 'dst'
30@*
31@* @param[in] pu1_src
32@*  UWORD8 pointer to the source
33@*
34@* @param[out] pu1_dst
35@*  UWORD8 pointer to the destination
36@*
37@* @param[in] src_strd
38@*  integer source stride
39@*
40@* @param[in] dst_strd
41@*  integer destination stride
42@*
43@*
44@* @param[in] ht
45@*  integer height of the array
46@*
47@* @param[in] wd
48@*  integer width of the array
49@*
50@* @returns
51@*
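52@* @remarks
53@*  Rows are copied four at a time and columns in groups of 16, 8 or 4, so ht and wd are presumably multiples of 4 (TODO confirm with callers)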
54@*
55@*******************************************************************************
56@*
57@void ih264_inter_pred_luma_copy (
58@                            UWORD8 *pu1_src,
59@                            UWORD8 *pu1_dst,
60@                            WORD32 src_strd,
61@                            WORD32 dst_strd,
62@                            WORD32 ht,
63@                            WORD32 wd   )
64
65@**************Variables Vs Registers*****************************************
66@   r0 => *pu1_src
67@   r1 => *pu1_dst
68@   r2 =>  src_strd
69@   r3 =>  dst_strd
70@   r7 =>  ht
71@   r12 => wd
72
73.text
74.p2align 2
75@ Copies a wd x ht block of bytes from pu1_src to pu1_dst, four rows per outer-loop pass
76    .global ih264_inter_pred_luma_copy_a9q
77@ On entry: r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd, ht and wd are read from the stack
78ih264_inter_pred_luma_copy_a9q:
79    stmfd         sp!, {r4-r12, r14}    @save r4-r12 and lr (10 words = 40 bytes)
80    vstmdb        sp!, {d8-d15}         @save d8-d15 (64 bytes) - NOTE(review): only d0-d7 are written below, confirm this save is needed
81    ldr           r12, [sp, #108]       @r12 = wd (stack argument, 40 + 64 + 4 bytes above sp)
82    ldr           r7, [sp, #104]        @r7 = ht (stack argument, 40 + 64 bytes above sp)
83    cmp           r7, #0                @nothing to copy when ht <= 0
84    ble           end_loops
85    tst           r12, #15              @low 4 bits of wd clear => wd is a multiple of 16
86    beq           core_loop_wd_16
87    tst           r12, #7               @low 3 bits of wd clear => wd is a multiple of 8
88    beq           core_loop_wd_8
89    sub           r11, r12, #4          @r11 = wd - 4, used to step back to column 0 after each 4-row band
90@ ---- remaining widths: 4 columns x 4 rows per inner-loop pass ----
91outer_loop_wd_4:
92    subs          r4, r12, #0           @r4 = wd (columns left in this band), flags reflect wd
93    ble           end_inner_loop_wd_4   @skip the inner loop when wd <= 0
94
95inner_loop_wd_4:
96    vld1.32       {d0[0]}, [r0]         @row 0: load 4 bytes from pu1_src
97    add           r5, r0, r2            @r5 = source pointer for row 1
98    add           r6, r1, r3            @r6 = destination pointer for row 1
99    vst1.32       {d0[0]}, [r1]         @row 0: store 4 bytes to pu1_dst
100    vld1.32       {d0[0]}, [r5], r2     @row 1: load, then r5 += src_strd
101    add           r0, r0, #4            @pu1_src += 4 (next group of columns)
102    vst1.32       {d0[0]}, [r6], r3     @row 1: store, then r6 += dst_strd
103    vld1.32       {d0[0]}, [r5], r2     @row 2: load, then r5 += src_strd
104    subs          r4, r4, #4            @columns left -= 4, flags are consumed by the bgt below
105    vst1.32       {d0[0]}, [r6], r3     @row 2: store, then r6 += dst_strd
106    vld1.32       {d0[0]}, [r5], r2     @row 3: load, then r5 += src_strd
107    add           r1, r1, #4            @pu1_dst += 4 (next group of columns)
108    vst1.32       {d0[0]}, [r6], r3     @row 3: store, then r6 += dst_strd
109
110    bgt           inner_loop_wd_4       @repeat while columns remain (flags from the subs on r4)
111
112end_inner_loop_wd_4:
113    subs          r7, r7, #4            @ht -= 4 (one band of four rows done)
114    sub           r0, r5, r11           @pu1_src = first column of the next band (r5 is 4 rows below the last column group)
115    sub           r1, r6, r11           @pu1_dst = first column of the next band
116    bgt           outer_loop_wd_4       @repeat while rows remain (flags from the subs on r7)
117
118end_loops:
119    vldmia        sp!, {d8-d15}         @restore d8-d15
120    ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return by loading the saved lr into pc
121
122
123@ ---- wd multiple of 8: 8 columns x 4 rows per inner-loop pass ----
124core_loop_wd_8:
125    sub           r11, r12, #8          @r11 = wd - 8, used to step back to column 0 after each 4-row band
126
127outer_loop_wd_8:
128    subs          r4, r12, #0           @r4 = wd (columns left in this band), flags reflect wd
129    ble           end_inner_loop_wd_8
130
131inner_loop_wd_8:
132    add           r5, r0, r2            @r5 = source pointer for row 1 (taken before r0 is advanced)
133    vld1.8        {d0}, [r0]!           @row 0: load 8 bytes, then pu1_src += 8
134    add           r6, r1, r3            @r6 = destination pointer for row 1 (taken before r1 is advanced)
135    vst1.8        {d0}, [r1]!           @row 0: store 8 bytes, then pu1_dst += 8
136    vld1.8        {d1}, [r5], r2        @row 1: load, then r5 += src_strd
137    vst1.8        {d1}, [r6], r3        @row 1: store, then r6 += dst_strd
138    subs          r4, r4, #8            @columns left -= 8, flags are consumed by the bgt below
139    vld1.8        {d2}, [r5], r2        @row 2: load, then r5 += src_strd
140    vst1.8        {d2}, [r6], r3        @row 2: store, then r6 += dst_strd
141    vld1.8        {d3}, [r5], r2        @row 3: load, then r5 += src_strd
142    vst1.8        {d3}, [r6], r3        @row 3: store, then r6 += dst_strd
143    bgt           inner_loop_wd_8       @repeat while columns remain
144
145end_inner_loop_wd_8:
146    subs          r7, r7, #4            @ht -= 4 (one band of four rows done)
147    sub           r0, r5, r11           @pu1_src = first column of the next band
148    sub           r1, r6, r11           @pu1_dst = first column of the next band
149    bgt           outer_loop_wd_8       @repeat while rows remain
150
151    vldmia        sp!, {d8-d15}         @restore d8-d15
152    ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return by loading the saved lr into pc
153@ ---- wd multiple of 16: 16 columns x 4 rows per inner-loop pass ----
154core_loop_wd_16:
155    sub           r11, r12, #16         @r11 = wd - 16, used to step back to column 0 after each 4-row band
156
157outer_loop_wd_16:
158    subs          r4, r12, #0           @r4 = wd (columns left in this band), flags reflect wd
159    ble           end_inner_loop_wd_16
160
161inner_loop_wd_16:
162    add           r5, r0, r2            @r5 = source pointer for row 1 (taken before r0 is advanced)
163    vld1.8        {q0}, [r0]!           @row 0: load 16 bytes, then pu1_src += 16
164    add           r6, r1, r3            @r6 = destination pointer for row 1 (taken before r1 is advanced)
165    vst1.8        {q0}, [r1]!           @row 0: store 16 bytes, then pu1_dst += 16
166    vld1.8        {q1}, [r5], r2        @row 1: load, then r5 += src_strd
167    vst1.8        {q1}, [r6], r3        @row 1: store, then r6 += dst_strd
168    subs          r4, r4, #16           @columns left -= 16, flags are consumed by the bgt below
169    vld1.8        {q2}, [r5], r2        @row 2: load, then r5 += src_strd
170    vst1.8        {q2}, [r6], r3        @row 2: store, then r6 += dst_strd
171    vld1.8        {q3}, [r5], r2        @row 3: load, then r5 += src_strd
172    vst1.8        {q3}, [r6], r3        @row 3: store, then r6 += dst_strd
173    bgt           inner_loop_wd_16      @repeat while columns remain
174
175end_inner_loop_wd_16:
176    subs          r7, r7, #4            @ht -= 4 (one band of four rows done)
177    sub           r0, r5, r11           @pu1_src = first column of the next band
178    sub           r1, r6, r11           @pu1_dst = first column of the next band
179    bgt           outer_loop_wd_16      @repeat while rows remain
180
181    vldmia        sp!, {d8-d15}         @restore d8-d15
182    ldmfd         sp!, {r4-r12, r15}    @restore r4-r12 and return by loading the saved lr into pc
183
184
185@ *
186@ ********************************************************************************
187@ *
188@ * @brief This function copies a 4x4 block to destination
189@ *
190@ * @par Description:
191@ * Copies a 4x4 block to destination, where both src and dst are interleaved
192@ *
193@ * @param[in] pi2_src
194@ *  Source
195@ *
196@ * @param[in] pu1_out
197@ *  Output pointer
198@ *
199@ * @param[in] pred_strd
200@ *  Prediction buffer stride
201@ *
202@ * @param[in] out_strd
203@ *  output buffer stride
204@ *
205@ * @returns none
206@ *
207@ * @remarks none
208@ * Currently wd and ht are not used, i.e. a 4x4 block is always copied
209@ *
210@ *******************************************************************************
211@ *
212@ void ih264_interleave_copy(WORD16 *pi2_src,
213@                            UWORD8 *pu1_out,
214@                            WORD32 pred_strd,
215@                            WORD32 out_strd,
216@                            WORD32 wd,
217@                            WORD32 ht)
218@ Register Usage
219@ r0 : pi2_src
220@ r1 : pu1_out
221@ r2 : pred_strd (stride used to step pi2_src)
222@ r3 : out_strd
223@ Neon registers d2-d5, d18-d21 and q15 (d30-d31) are used
224@ No need for pushing arm and neon registers
225
226    .global ih264_interleave_copy_a9
227ih264_interleave_copy_a9:
228@ Merges 4 rows of 8 bytes: even-offset bytes come from the source, odd-offset bytes of the output are kept
229    vld1.u8       d2, [r0], r2          @source row 0 (8 bytes) => d2, then r0 += r2
230    vld1.u8       d3, [r0], r2          @source row 1 => d3
231    vld1.u8       d4, [r0], r2          @source row 2 => d4
232    vld1.u8       d5, [r0], r2          @source row 3 => d5
233
234    mov           r0, r1                @r0 = output base, reused as the store pointer while r1 walks the loads
235
236    vld1.u8       d18, [r1], r3         @output row 0 (8 bytes) => d18, then r1 += r3
237    vld1.u8       d19, [r1], r3         @output row 1 => d19
238    vmov.u16      q15, #0x00ff          @q15 = mask with the low byte of every 16-bit lane set
239    vld1.u8       d20, [r1], r3         @output row 2 => d20
240    vld1.u8       d21, [r1], r3         @output row 3 => d21
241
242    vbit.u8       q9, q1, q15           @rows 0-1: insert source bits into the output where the mask is 1
243    vbit.u8       q10, q2, q15          @rows 2-3: insert source bits into the output where the mask is 1
244
245    vst1.u8       d18, [r0], r3         @store merged row 0, then r0 += r3
246    vst1.u8       d19, [r0], r3         @store merged row 1
247    vst1.u8       d20, [r0], r3         @store merged row 2
248    vst1.u8       d21, [r0], r3         @store merged row 3
249
250    bx            lr                    @return, nothing was pushed so there is nothing to restore
251
252
253
254