1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21///**
22//*******************************************************************************
23//*
24//* @brief
25//*     Interprediction luma function for copy
26//*
27//* @par Description:
28//*   Copies the array of width 'wd' and height 'ht' from the  location pointed
29//*   by 'src' to the location pointed by 'dst'
30//*
31//* @param[in] pu1_src
32//*  UWORD8 pointer to the source
33//*
34//* @param[out] pu1_dst
35//*  UWORD8 pointer to the destination
36//*
37//* @param[in] src_strd
38//*  integer source stride
39//*
40//* @param[in] dst_strd
41//*  integer destination stride
42//*
43//*
44//* @param[in] ht
45//*  integer height of the array
46//*
47//* @param[in] wd
48//*  integer width of the array
49//*
50//* @returns
51//*
52//* @remarks
53//*  None
54//*
55//*******************************************************************************
56//*/
57//void ih264_inter_pred_luma_copy (
58//                            UWORD8 *pu1_src,
59//                            UWORD8 *pu1_dst,
60//                            WORD32 src_strd,
61//                            WORD32 dst_strd,
62//                            WORD32 ht,
63//                            WORD32 wd   )
64
65//**************Variables Vs Registers*****************************************
66//    x0 => *pu1_src
67//    x1 => *pu1_dst
68//    x2 =>  src_strd
69//    x3 =>  dst_strd
70//    x7 =>  ht
71//    x12 => wd
72
73.text
74.p2align 2
75.include "ih264_neon_macros.s"
76
77
78
79    .global ih264_inter_pred_luma_copy_av8
80
ih264_inter_pred_luma_copy_av8:
    // Entry (AAPCS64): x0 = pu1_src, x1 = pu1_dst, w2 = src_strd,
    // w3 = dst_strd, w4 = ht, w5 = wd. Nothing is returned.
    //
    // The block is copied in bands of four rows. The column step is chosen
    // once from wd: 16 bytes when wd is a multiple of 16, 8 bytes when it is
    // a multiple of 8, otherwise 4 bytes. Rows and columns are consumed in
    // whole steps, so ht is effectively rounded up to a multiple of 4 and,
    // on the 4-byte path, wd is rounded up to a multiple of 4.
    //
    // Working registers:
    //    x4      bytes still to copy in the current band of four rows
    //    x5, x6  source / destination pointers for rows 1..3 of the band
    //    x7      rows still to copy
    //    x11     wd minus the column step (rewinds a pointer to column 0)
    //    x12     wd
    //    v0-v6   data in flight

    // NOTE(review): push_v_regs/pop_v_regs come from ih264_neon_macros.s and
    // presumably save/restore callee-saved SIMD registers - confirm. Only
    // v0-v6 are written below.
    push_v_regs
    // NOTE(review): x19/x20 are not written anywhere in this routine; the
    // save/restore is kept only so the stack frame stays as it was.
    stp       x19, x20, [sp, #-16]!

    // The four WORD32 arguments arrive in w2-w5. AAPCS64 leaves the upper 32
    // bits of those x registers unspecified, so sign-extend before any 64-bit
    // arithmetic or addressing (strides may legitimately be negative).
    sxtw      x2, w2                    //src_strd
    sxtw      x3, w3                    //dst_strd
    sxtw      x7, w4                    //ht
    sxtw      x12, w5                   //wd

    cmp       x7, #0                    //nothing to do when ht <= 0
    ble       end_loops
    cmp       x12, #0                   //nothing to do when wd <= 0
    ble       end_loops
    tst       x12, #15                  //wd multiple of 16 -> 16-byte columns
    beq       core_loop_wd_16
    tst       x12, #7                   //wd multiple of 8 -> 8-byte columns
    beq       core_loop_wd_8
    sub       x11, x12, #4              //rewind distance for 4-byte columns

outer_loop_wd_4:
    mov       x4, x12                   //bytes left in this band = wd (> 0)

inner_loop_wd_4:
    ld1       {v0.s}[0], [x0]           //row 0: load 4 bytes
    add       x5, x0, x2                //x5 = src pointer for row 1
    add       x6, x1, x3                //x6 = dst pointer for row 1
    st1       {v0.s}[0], [x1]           //row 0: store 4 bytes
    ld1       {v0.s}[0], [x5], x2       //row 1: load, advance to row 2
    add       x0, x0, #4                //pu1_src += 4 (next column)
    st1       {v0.s}[0], [x6], x3       //row 1: store, advance to row 2
    ld1       {v0.s}[0], [x5], x2       //row 2: load, advance to row 3
    subs      x4, x4, #4                //bytes left -= 4 (loop condition)
    st1       {v0.s}[0], [x6], x3       //row 2: store, advance to row 3
    ld1       {v0.s}[0], [x5], x2       //row 3: load, advance to next band
    add       x1, x1, #4                //pu1_dst += 4 (next column)
    st1       {v0.s}[0], [x6], x3       //row 3: store, advance to next band

    bgt       inner_loop_wd_4

end_inner_loop_wd_4:
    // x5/x6 now point four rows down, at the last column copied; stepping
    // back by (wd - 4) lands on column 0 of the next band.
    subs      x7, x7, #4                //ht -= 4
    sub       x0, x5, x11               //pu1_src = start of next band
    sub       x1, x6, x11               //pu1_dst = start of next band
    bgt       outer_loop_wd_4

end_loops:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


core_loop_wd_8:
    sub       x11, x12, #8              //rewind distance for 8-byte columns

outer_loop_wd_8:
    mov       x4, x12                   //bytes left in this band = wd (> 0)

inner_loop_wd_8:
    add       x5, x0, x2                //x5 = src pointer for row 1
    ld1       {v0.8b}, [x0], #8         //row 0: load 8 bytes, next column
    add       x6, x1, x3                //x6 = dst pointer for row 1
    st1       {v0.8b}, [x1], #8         //row 0: store 8 bytes, next column
    ld1       {v1.8b}, [x5], x2         //row 1: load, advance to row 2
    st1       {v1.8b}, [x6], x3         //row 1: store, advance to row 2
    subs      x4, x4, #8                //bytes left -= 8 (loop condition)
    ld1       {v2.8b}, [x5], x2         //row 2: load, advance to row 3
    st1       {v2.8b}, [x6], x3         //row 2: store, advance to row 3
    ld1       {v3.8b}, [x5], x2         //row 3: load, advance to next band
    st1       {v3.8b}, [x6], x3         //row 3: store, advance to next band
    bgt       inner_loop_wd_8

end_inner_loop_wd_8:
    subs      x7, x7, #4                //ht -= 4
    sub       x0, x5, x11               //pu1_src = start of next band
    sub       x1, x6, x11               //pu1_dst = start of next band
    bgt       outer_loop_wd_8

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

core_loop_wd_16:
    sub       x11, x12, #16             //rewind distance for 16-byte columns

outer_loop_wd_16:
    mov       x4, x12                   //bytes left in this band = wd (> 0)

inner_loop_wd_16:
    add       x5, x0, x2                //x5 = src pointer for row 1
    ld1       {v0.16b}, [x0], #16       //row 0: load 16 bytes, next column
    add       x6, x1, x3                //x6 = dst pointer for row 1
    st1       {v0.16b}, [x1], #16       //row 0: store 16 bytes, next column
    ld1       {v2.16b}, [x5], x2        //row 1: load, advance to row 2
    st1       {v2.16b}, [x6], x3        //row 1: store, advance to row 2
    subs      x4, x4, #16               //bytes left -= 16 (loop condition)
    ld1       {v4.16b}, [x5], x2        //row 2: load, advance to row 3
    st1       {v4.16b}, [x6], x3        //row 2: store, advance to row 3
    ld1       {v6.16b}, [x5], x2        //row 3: load, advance to next band
    st1       {v6.16b}, [x6], x3        //row 3: store, advance to next band
    bgt       inner_loop_wd_16

end_inner_loop_wd_16:
    subs      x7, x7, #4                //ht -= 4
    sub       x0, x5, x11               //pu1_src = start of next band
    sub       x1, x6, x11               //pu1_dst = start of next band
    bgt       outer_loop_wd_16


    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
193
194
195// /*
196// ********************************************************************************
197// *
198// * @brief This function copies a 4x4 block to destination
199// *
200// * @par Description:
201// * Copies a 4x4 block to destination, where both src and dst are interleaved
202// *
203// * @param[in] pi2_src
204// *  Source
205// *
206// * @param[in] pu1_out
207// *  Output pointer
208// *
// * @param[in] pred_strd
// *  Prediction buffer stride
// *
// * @param[in] out_strd
// *  Output buffer stride
214// *
215// * @returns none
216// *
217// * @remarks none
// * Currently wd and ht are not used, i.e. a 4x4 block is always copied
219// *
220// *******************************************************************************
221// */
222// void ih264_interleave_copy(WORD16 *pi2_src,
223//                            UWORD8 *pu1_out,
224//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
227//                            WORD32 ht)
228// Register Usage
229// x0 : pi2_src
230// x1 : pu1_out
231// x2 : src_strd
232// x3 : out_strd
// NEON registers v2-v5, v18-v21 and v30 are used
234// No need for pushing  arm and neon registers
235
236    .global ih264_interleave_copy_av8
ih264_interleave_copy_av8:
    // Entry (AAPCS64): x0 = pi2_src, x1 = pu1_out, w2 = source stride,
    // w3 = out_strd. Both strides are applied as byte offsets. Nothing is
    // returned.
    //
    // Reads four rows of 8 bytes from the source and four rows of 8 bytes
    // from the output, replaces the even-offset bytes (0, 2, 4, 6) of every
    // output row with the source bytes at the same offsets, leaves the
    // odd-offset bytes of the output untouched, and writes the four rows back.
    //
    // Working registers: v2/v4 = source rows 0-1 / 2-3, v18/v20 = output rows
    // 0-1 / 2-3, v30 = byte-select mask, x0 is reused as the store pointer.

    // NOTE(review): push_v_regs/pop_v_regs come from ih264_neon_macros.s and
    // presumably save/restore callee-saved SIMD registers - confirm. Only
    // v2-v5, v18-v21 and v30 are written below.
    push_v_regs

    // The WORD32 strides arrive in w2/w3. AAPCS64 leaves the upper 32 bits of
    // those x registers unspecified, so sign-extend before using them as
    // 64-bit post-index offsets.
    sxtw      x2, w2
    sxtw      x3, w3

    ld1       {v2.8b}, [x0], x2         //source row 0 -> low half of v2
    ld1       {v3.8b}, [x0], x2         //source row 1
    mov       v2.d[1], v3.d[0]          //v2 = source rows 0 and 1
    ld1       {v4.8b}, [x0], x2         //source row 2 -> low half of v4
    ld1       {v5.8b}, [x0], x2         //source row 3
    mov       v4.d[1], v5.d[0]          //v4 = source rows 2 and 3

    mov       x0, x1                    //keep the row-0 output address for the stores

    ld1       {v18.8b}, [x1], x3        //output row 0 -> low half of v18
    ld1       {v19.8b}, [x1], x3        //output row 1
    mov       v18.d[1], v19.d[0]        //v18 = output rows 0 and 1
    movi      v30.8h, #0x00ff           //mask: low byte of every 16-bit lane
    ld1       {v20.8b}, [x1], x3        //output row 2 -> low half of v20
    ld1       {v21.8b}, [x1], x3        //output row 3
    mov       v20.d[1], v21.d[0]        //v20 = output rows 2 and 3

    bit       v18.16b, v2.16b , v30.16b //take source bits where the mask is set
    bit       v20.16b, v4.16b , v30.16b

    st1       {v18.8b}, [x0], x3        //store output row 0
    st1       {v18.d}[1], [x0], x3      //store output row 1
    st1       {v20.8b}, [x0], x3        //store output row 2
    st1       {v20.d}[1], [x0], x3      //store output row 3

    pop_v_regs
    ret
266
267
268