1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_inter_pred_chroma_copy.s
22//*
23//* @brief
//*  Contains the function definition for the chroma inter prediction copy.
//* The function is hand-written ARMv8 (AArch64) NEON assembly in GNU assembler
//* syntax.
27//*
28//* @author
29//*  Yogeswaran RS
30//*
//* @par List of Functions:
//*  - ihevc_inter_pred_chroma_copy_av8()
33//*
34//* @remarks
35//*  None
36//*
37//*******************************************************************************
38//*/
39///**
40//*******************************************************************************
41//*
42//* @brief
43//*   Chroma interprediction filter for copy
44//*
45//* @par Description:
46//*    Copies the array of width 'wd' and height 'ht' from the  location pointed
47//*    by 'src' to the location pointed by 'dst'
48//*
49//* @param[in] pu1_src
50//*  UWORD8 pointer to the source
51//*
52//* @param[out] pu1_dst
53//*  UWORD8 pointer to the destination
54//*
55//* @param[in] src_strd
56//*  integer source stride
57//*
58//* @param[in] dst_strd
59//*  integer destination stride
60//*
//* @param[in] pi1_coeff
//*  WORD8 pointer to the filter coefficients (not read by this copy routine)
63//*
64//* @param[in] ht
65//*  integer height of the array
66//*
67//* @param[in] wd
68//*  integer width of the array
69//*
//* @returns
//*  None
//*
72//* @remarks
73//*  None
74//*
75//*******************************************************************************
76//*/
77
78//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
79//                                   UWORD8 *pu1_dst,
80//                                   WORD32 src_strd,
81//                                   WORD32 dst_strd,
82//                                   WORD8 *pi1_coeff,
83//                                   WORD32 ht,
84//                                   WORD32 wd)
85//**************Variables Vs Registers*****************************************
86//x0 => *pu1_src
87//x1 => *pu1_dst
88//x2 =>  src_strd
89//x3 =>  dst_strd
90//x4 => *pi1_coeff
91//x5 =>  ht
92//x6 =>  wd
93
.text
.align 4

.globl ihevc_inter_pred_chroma_copy_av8

.type ihevc_inter_pred_chroma_copy_av8, %function

//------------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_copy_av8(UWORD8 *pu1_src, UWORD8 *pu1_dst,
//                                       WORD32 src_strd, WORD32 dst_strd,
//                                       WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
//
// Copies 'ht' rows of (2 * wd) bytes each from pu1_src to pu1_dst.
//
// In:    x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//        x4 = pi1_coeff (never read), w5 = ht, w6 = wd
// Out:   none
// Clobb: x0-x8, x11, x12, v0-v3, flags (all caller-saved under AAPCS64)
// Stack: not used (leaf function)
//
// Register roles after the entry sequence:
//   x12 = row width in bytes (wd * 2)
//   x5  = number of rows handled four at a time (ht rounded down to 4)
//   x8  = ht % 4; any non-zero value triggers ONE extra pass of two rows
//   x11 = row width minus the column step, used to rewind to the row start
//   x4  = bytes left in the current row group (reloaded from x12 per group)
//   x7 / x6 = source / destination pointers walking down the rows of a column
//
// Assumptions (not checked here):
//   - ht is even: the tail handler always copies exactly two rows.
//   - the row width in bytes is a multiple of 4 (i.e. wd is even); widths
//     that are a multiple of 8 or 16 bytes use the wider column loops.
//------------------------------------------------------------------------------
ihevc_inter_pred_chroma_copy_av8:

    // The four WORD32 arguments arrive in w registers; AAPCS64 does not
    // define bits [63:32], so widen them before using the x views below.
    SXTW        x2,w2                       //src_strd
    SXTW        x3,w3                       //dst_strd
    SXTW        x5,w5                       //ht
    SXTW        x6,w6                       //wd

    LSL         x12,x6,#1                   //x12 = wd * 2 = bytes per row
    CMP         x5,#0
    BLE         END_LOOPS                   //nothing to do when ht is not positive
    CMP         x12,#0
    BLE         END_LOOPS                   //nothing to do when wd is not positive
    AND         x8,x5,#3                    //x8 = ht % 4 (rows left for the tail pass)
    SUB         x5,x5,x8                    //x5 = ht rounded down to a multiple of 4
    TST         x12,#15                     //row width a multiple of 16 bytes?
    BEQ         CORE_LOOP_WD_16
    TST         x12,#7                      //row width a multiple of 8 bytes?
    BEQ         CORE_LOOP_WD_8
    SUB         x11,x12,#4                  //rewind amount for 4-byte columns
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_4_HT_2        //fewer than 4 rows: tail pass only

//---------------- 4-byte columns, four rows per pass --------------------------
OUTER_LOOP_WD_4:
    MOV         x4,x12                      //x4 = bytes left in this row group

INNER_LOOP_WD_4:
    LD1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.s}[0],[x1]              //row 0: store
    LD1         {v0.s}[0],[x7],x2           //row 1: load, step to row 2
    ADD         x0,x0,#4                    //next source column
    ST1         {v0.s}[0],[x6],x3           //row 1: store, step to row 2
    LD1         {v0.s}[0],[x7],x2           //row 2: load
    SUBS        x4,x4,#4                    //4 bytes of the row done
    ST1         {v0.s}[0],[x6],x3           //row 2: store
    LD1         {v0.s}[0],[x7],x2           //row 3: load (x7 ends up on row 4)
    ADD         x1,x1,#4                    //next destination column
    ST1         {v0.s}[0],[x6],x3           //row 3: store (x6 ends up on row 4)
    BGT         INNER_LOOP_WD_4             //flags still come from the SUBS above

END_INNER_LOOP_WD_4:
    SUBS        x5,x5,#4                    //four rows done
    SUB         x0,x7,x11                   //x0 = start of the next 4-row group
    SUB         x1,x6,x11                   //x1 = start of the next 4-row group
    BGT         OUTER_LOOP_WD_4
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_4_HT_2        //tail rows pending

END_LOOPS:
    RET

//---------------- 4-byte columns, two-row tail --------------------------------
OUTER_LOOP_WD_4_HT_2:
    MOV         x4,x12                      //x4 = bytes left in the row pair

INNER_LOOP_WD_4_HT_2:
    LD1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    ADD         x7,x0,x2                    //x7 = source, row 1
    ADD         x6,x1,x3                    //x6 = destination, row 1
    ST1         {v0.s}[0],[x1]              //row 0: store
    LD1         {v0.s}[0],[x7],x2           //row 1: load
    ADD         x0,x0,#4                    //next source column
    ST1         {v0.s}[0],[x6],x3           //row 1: store
    SUBS        x4,x4,#4                    //4 bytes of the row done
    ADD         x1,x1,#4                    //next destination column
    BGT         INNER_LOOP_WD_4_HT_2
    RET

//---------------- 8-byte columns, four rows per pass --------------------------
CORE_LOOP_WD_8:
    SUB         x11,x12,#8                  //rewind amount for 8-byte columns
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_8_HT_2        //fewer than 4 rows: tail pass only

OUTER_LOOP_WD_8:
    MOV         x4,x12                      //x4 = bytes left in this row group

INNER_LOOP_WD_8:
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    LD1         {v0.8b},[x0],#8             //row 0: load, advance to next column
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.8b},[x1],#8             //row 0: store, advance to next column
    LD1         {v1.8b},[x7],x2             //row 1: load
    ST1         {v1.8b},[x6],x3             //row 1: store
    LD1         {v2.8b},[x7],x2             //row 2: load
    ST1         {v2.8b},[x6],x3             //row 2: store
    LD1         {v3.8b},[x7],x2             //row 3: load (x7 ends up on row 4)
    ST1         {v3.8b},[x6],x3             //row 3: store (x6 ends up on row 4)
    SUBS        x4,x4,#8                    //8 bytes of the row done
    BGT         INNER_LOOP_WD_8

END_INNER_LOOP_WD_8:
    SUBS        x5,x5,#4                    //four rows done
    SUB         x0,x7,x11                   //x0 = start of the next 4-row group
    SUB         x1,x6,x11                   //x1 = start of the next 4-row group
    BGT         OUTER_LOOP_WD_8
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_8_HT_2        //tail rows pending
    RET

//---------------- 8-byte columns, two-row tail --------------------------------
OUTER_LOOP_WD_8_HT_2:
    MOV         x4,x12                      //x4 = bytes left in the row pair

INNER_LOOP_WD_8_HT_2:
    ADD         x7,x0,x2                    //x7 = source, row 1
    LD1         {v0.8b},[x0],#8             //row 0: load, advance to next column
    ADD         x6,x1,x3                    //x6 = destination, row 1
    ST1         {v0.8b},[x1],#8             //row 0: store, advance to next column
    LD1         {v1.8b},[x7],x2             //row 1: load
    ST1         {v1.8b},[x6],x3             //row 1: store
    SUBS        x4,x4,#8                    //8 bytes of the row done
    BGT         INNER_LOOP_WD_8_HT_2        //keep going until the full width is copied
    RET

//---------------- 16-byte columns, four rows per pass -------------------------
CORE_LOOP_WD_16:
    SUB         x11,x12,#16                 //rewind amount for 16-byte columns
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_16_HT_2       //fewer than 4 rows: tail pass only

OUTER_LOOP_WD_16:
    MOV         x4,x12                      //x4 = bytes left in this row group

INNER_LOOP_WD_16:
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    LD1         {v0.16b},[x0],#16           //row 0: load, advance to next column
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.16b},[x1],#16           //row 0: store, advance to next column
    LD1         {v1.16b},[x7],x2            //row 1: load
    ST1         {v1.16b},[x6],x3            //row 1: store
    LD1         {v2.16b},[x7],x2            //row 2: load
    ST1         {v2.16b},[x6],x3            //row 2: store
    LD1         {v3.16b},[x7],x2            //row 3: load (x7 ends up on row 4)
    ST1         {v3.16b},[x6],x3            //row 3: store (x6 ends up on row 4)
    SUBS        x4,x4,#16                   //16 bytes of the row done
    BGT         INNER_LOOP_WD_16

END_INNER_LOOP_WD_16:
    SUBS        x5,x5,#4                    //four rows done
    SUB         x0,x7,x11                   //x0 = start of the next 4-row group
    SUB         x1,x6,x11                   //x1 = start of the next 4-row group
    BGT         OUTER_LOOP_WD_16
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_16_HT_2       //tail rows pending
    RET

//---------------- 16-byte columns, two-row tail -------------------------------
OUTER_LOOP_WD_16_HT_2:
    MOV         x4,x12                      //x4 = bytes left in the row pair

INNER_LOOP_WD_16_HT_2:
    ADD         x7,x0,x2                    //x7 = source, row 1
    LD1         {v0.16b},[x0],#16           //row 0: load, advance to next column
    ADD         x6,x1,x3                    //x6 = destination, row 1
    ST1         {v0.16b},[x1],#16           //row 0: store, advance to next column
    LD1         {v1.16b},[x7],x2            //row 1: load
    ST1         {v1.16b},[x6],x3            //row 1: store
    SUBS        x4,x4,#16                   //16 bytes of the row done
    BGT         INNER_LOOP_WD_16_HT_2       //keep going until the full width is copied

    RET

.size ihevc_inter_pred_chroma_copy_av8, .-ihevc_inter_pred_chroma_copy_av8
255
256
257