1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19///*******************************************************************************
20//* //file
21//*  ihevcd_fmt_conv_420sp_to_420p.s
22//*
23//* //brief
24//*  contains function definitions for format conversions
25//*
26//* //author
27//*  ittiam
28//*
29//* //par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_420p_av8()
31//*
32//* //remarks
33//*  none
34//*
35//*******************************************************************************/
36
37.text
38
39.include "ihevc_neon_macros.s"
40
41
42
43
44///*****************************************************************************
45//*                                                                            *
//*  Function Name    : ihevcd_fmt_conv_420sp_to_420p_av8()                    *
47//*                                                                            *
//*  Description      : This function converts the image from YUV420SP color   *
//*                     space (UV interleaved) to YUV420P (planar) color space *
50//*                                                                            *
//*  Arguments        : x0           pu1_src_y                                 *
//*                     x1           pu1_src_uv                                *
//*                     x2           pu1_dest_y                                *
//*                     x3           pu1_dest_u                                *
//*                     x4           pu1_dest_v                                *
//*                     x5           u2_width                                  *
//*                     x6           u2_height                                 *
//*                     x7           u2_stridey                                *
//*                     [sp #80]     u2_strideuv                               *
//*                     [sp #88]     u2_dest_stridey                           *
//*                     [sp #96]     u2_dest_strideuv                          *
//*                     [sp #104]    is_u_first                                *
//*                     [sp #112]    disable_luma_copy                         *
//*                     Stack offsets are relative to sp after the prologue.   *
64//*                                                                            *
65//*  Values Returned  : None                                                   *
66//*                                                                            *
//*  Register Usage   : x0 - x15, x20, v0 - v1                                 *
//*                                                                            *
//*  Stack Usage      : 16 Bytes (x19, x20) plus the push_v_regs save area     *
70//*                                                                            *
71//*  Interruptibility : Interruptible                                          *
72//*                                                                            *
73//*  Known Limitations                                                         *
74//*       Assumptions: Image Width:     Assumed to be multiple of 2 and       *
75//*                     Image Height:    Assumed to be even.                   *
76//*                                                                            *
77//*  Revision History :                                                        *
78//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
79//*         16 05 2012   Naveen SR     draft                                     *
80//*                                                                            *
81//*****************************************************************************/
82
.globl ihevcd_fmt_conv_420sp_to_420p_av8

.type ihevcd_fmt_conv_420sp_to_420p_av8, %function

//------------------------------------------------------------------------------
// ihevcd_fmt_conv_420sp_to_420p_av8
//
// Optionally copies the luma plane and splits the interleaved chroma plane
// into two separate chroma planes.
//
// Register arguments (as consumed below):
//   x0  pu1_src_y          x1  pu1_src_uv (interleaved chroma)
//   x2  pu1_dest_y         x3  pu1_dest_u         x4  pu1_dest_v
//   w5  width              w6  height             w7  source Y stride
// Stack arguments, addressed relative to sp AFTER the prologue below
// (32-bit values, sign-extended on load):
//   [sp,#80]  source UV stride      [sp,#88]  destination Y stride
//   [sp,#96]  destination UV stride [sp,#104] is_u_first
//   [sp,#112] disable_luma_copy
//
// Register roles inside the routine:
//   x8  width in bytes          x9  height (halved for the chroma pass)
//   x10 source row increment    x11 destination row increment
//   x4  remaining rows          x6  remaining bytes in the current row
//   x15 saved pu1_dest_v        x12, x13, x20 scratch
//
// Behaviour notes:
//   * width/height/stride register arguments are treated as 32-bit signed
//     values and sign-extended, matching how the stack arguments are handled;
//     the upper halves of x5/x6/x7 are not relied upon.
//   * a non-positive width or height returns without touching memory.
//   * rows of 16 bytes or more use 16-byte NEON transfers; a trailing partial
//     chunk is handled by stepping back so the final 16 bytes of the row are
//     transferred again (overlapping bytes are simply rewritten).
//   * rows narrower than 16 bytes are copied with byte accesses so that no
//     access falls outside the row.
//   * width is assumed to be even (chroma is consumed as byte pairs).
//------------------------------------------------------------------------------
ihevcd_fmt_conv_420sp_to_420p_av8:
    push_v_regs
    stp         x19, x20, [sp, #-16]!       // x20 is used as scratch below

    mov         x15, x4                     // keep pu1_dest_v; x4 becomes the row counter
    sxtw        x8, w5                      // x8 = width  (32-bit argument)
    sxtw        x9, w6                      // x9 = height (32-bit argument)
    sxtw        x7, w7                      // x7 = source Y stride (32-bit argument)

    // Nothing to do for an empty image; the loops below are do-while shaped
    // and would otherwise transfer data before testing their counters.
    cmp         x8, #0
    ble         exit
    cmp         x9, #0
    ble         exit

    ldr         w5, [sp, #88]               // destination Y stride
    sxtw        x5, w5

    sub         x10, x7, x8                 // source Y increment      = stride - width
    sub         x11, x5, x8                 // destination Y increment = stride - width

    ldr         w5, [sp, #112]              // disable_luma_copy flag
    sxtw        x5, w5
    cmp         x5, #0                      // skip luma if the flag is non-zero
    bne         uv_copy_start

    // ---------------------------- Copy Y ----------------------------------
    mov         x4, x9                      // rows remaining
y_row_loop:
    mov         x6, x8                      // bytes remaining in this row
    cmp         x6, #16
    blt         y_narrow_row_loop           // row shorter than one NEON chunk

y_col_loop:
    sub         x6, x6, #16
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16
    cmp         x6, #16
    bge         y_col_loop
    cmp         x6, #0
    beq         y_col_loop_end

    // 1..15 bytes left: step both pointers back by (16 - remaining) so that
    // one more 16-byte transfer ends exactly at the end of the row.
    // Example: width 162 -> 160 bytes done, pointers move back to byte 146.
    sub         x20, x6, #16
    neg         x6, x20                     // x6 = 16 - remaining
    sub         x0, x0, x6
    sub         x2, x2, x6
    ld1         {v0.8b, v1.8b}, [x0], #16
    st1         {v0.8b, v1.8b}, [x2], #16
    b           y_col_loop_end

y_narrow_row_loop:
    // width < 16: byte copy, stays strictly inside the row
    ldrb        w12, [x0], #1
    strb        w12, [x2], #1
    subs        x6, x6, #1
    bgt         y_narrow_row_loop

y_col_loop_end:
    add         x0, x0, x10                 // advance to the next source row
    add         x2, x2, x11                 // advance to the next destination row
    subs        x4, x4, #1
    bgt         y_row_loop

    // ---------------------------- Copy UV ---------------------------------
uv_copy_start:

    ldr         w5, [sp, #96]               // destination UV stride
    sxtw        x5, w5
    ldr         w7, [sp, #80]               // source UV stride
    sxtw        x7, w7

    lsr         x9, x9, #1                  // chroma rows = height / 2

    sub         x10, x7, x8                 // source UV increment (interleaved row is `width` bytes)
    lsr         x11, x8, #1
    sub         x11, x5, x11                // destination U/V increment = stride - width/2

    mov         x5, x15                     // x5 = pu1_dest_v

    ldr         w4, [sp, #104]              // is_u_first flag
    sxtw        x4, w4
    cmp         x4, #0                      // flag == 0 -> swap the two destinations
    csel        x4, x5, x4, eq              // x4 = old x5 (temporary)
    csel        x5, x3, x5, eq              // x5 = old x3
    csel        x3, x4, x3, eq              // x3 = old x5

    mov         x4, x9                      // rows remaining
uv_row_loop:
    mov         x6, x8                      // interleaved bytes remaining in this row
    cmp         x6, #16
    blt         uv_narrow_row_loop          // row shorter than one NEON chunk

uv_col_loop:
    sub         x6, x6, #16

    prfm        PLDL1KEEP, [x1, #128]
    ld2         {v0.8b, v1.8b}, [x1], #16   // de-interleave 8 byte pairs
    st1         {v0.8b}, [x3], #8           // even bytes -> first destination
    st1         {v1.8b}, [x5], #8           // odd bytes  -> second destination
    cmp         x6, #16
    bge         uv_col_loop
    cmp         x6, #0
    beq         uv_col_loop_end

    // 1..15 interleaved bytes left: step the source back by (16 - remaining)
    // and each destination back by half of that, then redo one full chunk so
    // the transfer ends exactly at the end of the row.
    sub         x20, x6, #16
    neg         x6, x20                     // x6 = 16 - remaining
    sub         x1, x1, x6
    sub         x3, x3, x6, lsr #1
    sub         x5, x5, x6, lsr #1
    ld2         {v0.8b, v1.8b}, [x1], #16
    st1         {v0.8b}, [x3], #8
    st1         {v1.8b}, [x5], #8
    b           uv_col_loop_end

uv_narrow_row_loop:
    // width < 16: split one byte pair per iteration (width assumed even)
    ldrb        w12, [x1], #1
    ldrb        w13, [x1], #1
    strb        w12, [x3], #1
    strb        w13, [x5], #1
    subs        x6, x6, #2
    bgt         uv_narrow_row_loop

uv_col_loop_end:
    add         x1, x1, x10                 // advance to the next source row
    add         x3, x3, x11                 // advance both destination planes
    add         x5, x5, x11
    subs        x4, x4, #1
    bgt         uv_row_loop

exit:
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret
204
205
206
207
208
209
210