1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19///*******************************************************************************
20//* //file
21//*  ihevcd_fmt_conv_420sp_to_420sp.s
22//*
23//* //brief
24//*  contains function definitions for format conversions
25//*
26//* //author
27//*  ittiam
28//*
29//* //par list of functions:
30//*
31//*
32//* //remarks
33//*  none
34//*
35//*******************************************************************************/
36    .equ DO1STROUNDING, 0               // assemble-time constant (0); not referenced by the code visible in this file
37
38    // ARM        -- looks like a directive left over from a 32-bit ARM (armasm) version; inert here
39    //
40    // PRESERVE8  -- same remark as above (presumably an armasm stack-alignment directive; confirm)
41
42.text                                   // everything below is emitted into the code section
43.p2align 2                              // align to 2^2 = 4 bytes
44
45.include "ihevc_neon_macros.s"          // shared macro file; its contents are not visible from here
46
47
48
49
///*****************************************************************************
//*
//*  Function Name    : ihevcd_fmt_conv_420sp_to_420sp_av8()
//*
//*  Description      : Converts (copies) an image from YUV420SP colour space to
//*                     420SP colour space (UV interleaved). The luma plane and
//*                     the interleaved chroma plane are copied row by row, so
//*                     source and destination may use different row strides.
//*
//*  ABI              : AAPCS64, little-endian, 8-byte stack argument slots.
//*
//*  Arguments        : x0          pu1_y                 source luma plane
//*                     x1          pu1_uv                source interleaved UV plane
//*                     x2          pu1_dest_y            destination luma plane
//*                     x3          pu1_dest_uv           destination UV plane
//*                     w4          u2_width              bytes copied per row (luma and UV)
//*                     w5          u2_height             luma rows; UV rows = height / 2
//*                     w6          u2_stridey            source luma stride
//*                     w7          u2_stridechroma       source UV stride
//*                     [sp, #0]    u2_dest_stridey       destination luma stride
//*                     [sp, #8]    u2_dest_stridechroma  destination UV stride
//*                     All 32-bit arguments are sign-extended before use.
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x0 - x14, v0 - v1, flags (all caller-saved under AAPCS64)
//*
//*  Stack Usage      : None (leaf routine; only reads its two stack arguments)
//*
//*  Interruptibility : Interruptible
//*
//*  Behaviour notes  :
//*       - Returns without touching memory when width <= 0 or height <= 0.
//*       - Rows at least one vector block wide (32 bytes luma, 16 bytes UV)
//*         are copied in whole blocks. A leftover shorter than a block is
//*         handled by stepping both pointers back so that the last block ends
//*         exactly at the end of the row; a few bytes are copied twice, none
//*         outside the row are touched.
//*       - Rows narrower than one block are copied byte by byte.
//*
//*  Known Limitations
//*       Assumptions: Image Width:     Assumed to be multiple of 2 and
//*                     Image Height:    Assumed to be even.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         16 05 2012   Naveen SR     draft
//*
//*****************************************************************************/

    .global ihevcd_fmt_conv_420sp_to_420sp_av8
    .type   ihevcd_fmt_conv_420sp_to_420sp_av8, %function
ihevcd_fmt_conv_420sp_to_420sp_av8:

    // ---- Argument pick-up -------------------------------------------------
    // int arguments arrive in the low 32 bits only; widen them explicitly.
    sxtw        x8, w4                      // x8  = u2_width
    sxtw        x9, w5                      // x9  = u2_height
    sxtw        x13, w6                     // x13 = u2_stridey
    sxtw        x7, w7                      // x7  = u2_stridechroma
    ldrsw       x5, [sp]                    // x5  = u2_dest_stridey      (9th argument)
    ldrsw       x14, [sp, #8]               // x14 = u2_dest_stridechroma (10th argument)

    // Nothing to copy for an empty image.
    cmp         x8, #0
    b.le        .Lfmt_conv_420sp_done
    cmp         x9, #0
    b.le        .Lfmt_conv_420sp_done

    // ---- Luma plane ---------------------------------------------------------
    // The pointers advance by `width` inside a row, so the per-row correction
    // is (stride - width).
    sub         x10, x13, x8                // x10 = source luma row gap
    sub         x11, x5, x8                 // x11 = destination luma row gap
    mov         x4, x9                      // x4  = luma rows remaining (> 0 here)

.Lfmt_conv_y_row:
    mov         x6, x8                      // x6 = bytes remaining in this row
    cmp         x6, #32
    b.lt        .Lfmt_conv_y_bytes          // row narrower than one 32-byte block

.Lfmt_conv_y_block:
    prfm        PLDL1KEEP, [x0, #128]       // hint: fetch source 128 bytes ahead
    ld1         {v0.16b, v1.16b}, [x0], #32
    sub         x6, x6, #32
    st1         {v0.16b, v1.16b}, [x2], #32
    cmp         x6, #32
    b.ge        .Lfmt_conv_y_block
    cbz         x6, .Lfmt_conv_y_row_done   // width was a multiple of 32

    // 0 < x6 < 32 bytes left. Example: width 40 -> 32 bytes done, 8 left;
    // step back 24 bytes and copy bytes 8..39 as one final block.
    sub         x12, x6, #32                // x12 = -(32 - remaining)
    add         x0, x0, x12
    add         x2, x2, x12
    ld1         {v0.16b, v1.16b}, [x0], #32
    st1         {v0.16b, v1.16b}, [x2], #32
    b           .Lfmt_conv_y_row_done

.Lfmt_conv_y_bytes:
    // Invariant: 0 < x6 < 32.
    ldrb        w12, [x0], #1
    strb        w12, [x2], #1
    subs        x6, x6, #1
    b.gt        .Lfmt_conv_y_bytes

.Lfmt_conv_y_row_done:
    add         x0, x0, x10                 // next source luma row
    add         x2, x2, x11                 // next destination luma row
    subs        x4, x4, #1
    b.gt        .Lfmt_conv_y_row

    // ---- Interleaved chroma plane ------------------------------------------
    // A UV row holds width/2 U and width/2 V samples, i.e. `width` bytes.
    lsr         x4, x9, #1                  // x4 = chroma rows = height / 2
    cbz         x4, .Lfmt_conv_420sp_done   // height == 1: no chroma row
    mov         x2, x3                      // x2 = pu1_dest_uv
    sub         x10, x7, x8                 // x10 = source UV row gap (chroma stride)
    sub         x11, x14, x8                // x11 = destination UV row gap

.Lfmt_conv_uv_row:
    mov         x6, x8                      // x6 = bytes remaining in this row
    cmp         x6, #16
    b.lt        .Lfmt_conv_uv_bytes         // row narrower than one 16-byte block

.Lfmt_conv_uv_block:
    prfm        PLDL1KEEP, [x1, #128]       // hint: fetch source 128 bytes ahead
    ld1         {v0.16b}, [x1], #16
    sub         x6, x6, #16
    st1         {v0.16b}, [x2], #16
    cmp         x6, #16
    b.ge        .Lfmt_conv_uv_block
    cbz         x6, .Lfmt_conv_uv_row_done  // width was a multiple of 16

    // 0 < x6 < 16 bytes left. Example: width 162 -> 160 bytes done, 2 left;
    // step back 14 bytes and copy bytes 146..161 as one final block.
    sub         x12, x6, #16                // x12 = -(16 - remaining)
    add         x1, x1, x12
    add         x2, x2, x12
    ld1         {v0.16b}, [x1], #16
    st1         {v0.16b}, [x2], #16
    b           .Lfmt_conv_uv_row_done

.Lfmt_conv_uv_bytes:
    // Invariant: 0 < x6 < 16.
    ldrb        w12, [x1], #1
    strb        w12, [x2], #1
    subs        x6, x6, #1
    b.gt        .Lfmt_conv_uv_bytes

.Lfmt_conv_uv_row_done:
    add         x1, x1, x10                 // next source UV row
    add         x2, x2, x11                 // next destination UV row
    subs        x4, x4, #1
    b.gt        .Lfmt_conv_uv_row

.Lfmt_conv_420sp_done:
    ret

    .size ihevcd_fmt_conv_420sp_to_420sp_av8, . - ihevcd_fmt_conv_420sp_to_420sp_av8
204
205
206    .section .note.GNU-stack,"",%progbits   // empty flag string: the section is not marked executable (GNU no-exec-stack marker)
207
208