1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * @file
21// *  ihevc_itrans_recon_4x4_ttype1.s
22// *
23// * @brief
24// *  contains function definitions for inverse transform  and reconstruction
25// *
26// *
27// * @author
28// *  naveen sr
29// *
30// * @par list of functions:
31// *  - ihevc_itrans_recon_4x4_ttype1()
32// *
33// * @remarks
34// *  none
35// *
36// *******************************************************************************
37// */
38
39///* all the functions here are replicated from ihevc_itrans.c and modified to */
40///* include reconstruction */
41//
42///**
43// *******************************************************************************
44// *
45// * @brief
46// *  this function performs inverse transform type 1 (dst)  and reconstruction
47// * for 4x4 input block
48// *
49// * @par description:
50// *  performs inverse transform and adds the prediction  data and clips output
51// * to 8 bit
52// *
53// * @param[in] pi2_src
54// *  input 4x4 coefficients
55// *
56// * @param[in] pi2_tmp
// *  temporary 4x4 buffer for storing inverse transform 1st stage output
61// *
62// * @param[in] pu1_pred
63// *  prediction 4x4 block
64// *
65// * @param[out] pu1_dst
66// *  output 4x4 block
67// *
68// * @param[in] src_strd
69// *  input stride
70// *
71// * @param[in] pred_strd
72// *  prediction stride
73// *
74// * @param[in] dst_strd
75// *  output stride
76// *
77// * @param[in] zero_cols
78// *  zero columns in pi2_src
79// *
80// * @returns  void
81// *
82// * @remarks
83// *  none
84// *
85// *******************************************************************************
86// */
87//void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
88//        word16 *pi2_tmp,
89//        uword8 *pu1_pred,
90//        uword8 *pu1_dst,
91//        word32 src_strd,
92//        word32 pred_strd,
93//        word32 dst_strd,
94//        word32 zero_cols)
95
96//**************variables vs registers*************************
97//    x0 => *pi2_src
98//    x1 => *pi2_tmp
99//    x2 => *pu1_pred
100//    x3 => *pu1_dst
101//    x4 => src_strd
102//    x5 => pred_strd
103//    x6 => dst_strd
104//    x7 => zero_cols
105
// Code goes in the text section, aligned to a 2^4 = 16 byte boundary.
.text
.p2align 4

// Shared NEON helper macros for the project (none are expanded in this file's
// visible code, but the include is kept as-is).
.include "ihevc_neon_macros.s"

// Right-shift amounts applied (with rounding) after each transform pass.
.equ shift_stage1_idct, 7
.equ shift_stage2_idct, 12

// Export the entry point and mark it as a function symbol.
.global ihevc_itrans_recon_4x4_ttype1_av8
.type ihevc_itrans_recon_4x4_ttype1_av8, %function
117
//------------------------------------------------------------------------------
// void ihevc_itrans_recon_4x4_ttype1_av8(word16 *pi2_src, word16 *pi2_tmp,
//                                        uword8 *pu1_pred, uword8 *pu1_dst,
//                                        word32 src_strd, word32 pred_strd,
//                                        word32 dst_strd, word32 zero_cols)
//
// 4x4 inverse transform type 1 (dst) followed by reconstruction:
//   dst = clip_u8(pred + itrans(src))
//
// ABI : AAPCS64, leaf function, no stack usage.
// In  : x0 = pi2_src   (4 rows of 4 word16 coefficients, src_strd elements apart)
//       x2 = pu1_pred  (4 rows of 4 bytes, pred_strd bytes apart)
//       x3 = pu1_dst   (4 rows of 4 bytes, dst_strd bytes apart)
//       w4 = src_strd, w5 = pred_strd, w6 = dst_strd
//       x1 (pi2_tmp) and x7 (zero_cols) are not referenced here; the
//       intermediate 1st stage output is kept in registers.
// Out : 4x4 bytes written through pu1_dst.
// Clob: x0, x2, x3 (post-incremented), x4-x6, x8-x11,
//       v0-v7, v16-v22, v24-v31 (all caller-saved under AAPCS64).
//------------------------------------------------------------------------------
ihevc_itrans_recon_4x4_ttype1_av8:

    // The strides are 32-bit arguments; AAPCS64 leaves bits [63:32] of the
    // argument registers unspecified, so sign-extend before using them as
    // 64-bit post-index increments.
    sxtw        x4, w4
    sxtw        x5, w5
    sxtw        x6, w6
    lsl         x4, x4, #1                  // src_strd: word16 elements -> bytes

    // v4.h[0..3] = {29, 55, 74, 84} : transform coefficients
    mov         w8, #29
    mov         w9, #55
    mov         w10, #74
    mov         w11, #84
    mov         v4.h[0], w8
    ld1         {v0.4h},[x0],x4             // v0 = pi2_src row 0
    mov         v4.h[1], w9
    ld1         {v1.4h},[x0],x4             // v1 = pi2_src row 1
    mov         v4.h[2], w10
    ld1         {v2.4h},[x0],x4             // v2 = pi2_src row 2
    mov         v4.h[3], w11
    ld1         {v3.4h},[x0],x4             // v3 = pi2_src row 3

    // ---- first stage: each lane processes one column of the input ----------
    // out0 = 29*row0 + 74*row1 + 84*row2 + 55*row3
    smull       v6.4s, v1.4h, v4.h[2]       // 74 * row1
    smlal       v6.4s, v0.4h, v4.h[0]       // + 29 * row0
    smlal       v6.4s, v3.4h, v4.h[1]       // + 55 * row3
    smlal       v6.4s, v2.4h, v4.h[3]       // + 84 * row2

    // out1 = 55*row0 + 74*row1 - 29*row2 - 84*row3
    smull       v5.4s, v1.4h, v4.h[2]       // 74 * row1
    smlal       v5.4s, v0.4h, v4.h[1]       // + 55 * row0
    smlsl       v5.4s, v2.4h, v4.h[0]       // - 29 * row2
    smlsl       v5.4s, v3.4h, v4.h[3]       // - 84 * row3

    // out2 = 74*row0 - 74*row2 + 74*row3
    smull       v7.4s, v0.4h, v4.h[2]       // 74 * row0
    smlsl       v7.4s, v2.4h, v4.h[2]       // - 74 * row2
    smlal       v7.4s, v3.4h, v4.h[2]       // + 74 * row3

    // out3 = 84*row0 - 74*row1 + 55*row2 - 29*row3
    smull       v20.4s, v2.4h, v4.h[1]      // 55 * row2
    smlsl       v20.4s, v1.4h, v4.h[2]      // - 74 * row1
    smlsl       v20.4s, v3.4h, v4.h[0]      // - 29 * row3
    smlal       v20.4s, v0.4h, v4.h[3]      // + 84 * row0

    // round, shift right by shift_stage1_idct and saturate to signed 16 bit
    sqrshrn     v28.4h, v6.4s,#shift_stage1_idct
    sqrshrn     v29.4h, v5.4s,#shift_stage1_idct
    sqrshrn     v30.4h, v7.4s,#shift_stage1_idct
    sqrshrn     v31.4h, v20.4s,#shift_stage1_idct
    ld1         {v18.s}[0],[x2],x5          // pred row 0 (interleaved with math)

    // 4x4 transpose of {v28,v29,v30,v31} (16-bit, then 32-bit trn pairs)
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v21.2s, v24.2s, v26.2s      // transposed row 0
    trn2        v16.2s, v24.2s, v26.2s      // transposed row 2
    trn1        v22.2s, v25.2s, v27.2s      // transposed row 1
    trn2        v17.2s, v25.2s, v27.2s      // transposed row 3
    // first stage output is now in v21, v22, v16, v17 (rows 0, 1, 2, 3)

    // ---- second stage: same arithmetic as the first stage, with inputs -----
    //      in0 = v21, in1 = v22, in2 = v16, in3 = v17
    ld1         {v18.s}[1],[x2],x5          // pred row 1
    // out0 = 29*in0 + 74*in1 + 84*in2 + 55*in3
    smull       v6.4s, v22.4h, v4.h[2]      // 74 * in1
    smlal       v6.4s, v21.4h, v4.h[0]      // + 29 * in0
    smlal       v6.4s, v17.4h, v4.h[1]      // + 55 * in3
    smlal       v6.4s, v16.4h, v4.h[3]      // + 84 * in2

    // out1 = 55*in0 + 74*in1 - 29*in2 - 84*in3
    smull       v5.4s, v22.4h, v4.h[2]      // 74 * in1
    smlal       v5.4s, v21.4h, v4.h[1]      // + 55 * in0
    smlsl       v5.4s, v16.4h, v4.h[0]      // - 29 * in2
    smlsl       v5.4s, v17.4h, v4.h[3]      // - 84 * in3

    // out2 = 74*in0 - 74*in2 + 74*in3
    smull       v7.4s, v21.4h, v4.h[2]      // 74 * in0
    smlsl       v7.4s, v16.4h, v4.h[2]      // - 74 * in2
    smlal       v7.4s, v17.4h, v4.h[2]      // + 74 * in3
    ld1         {v19.s}[0],[x2],x5          // pred row 2

    // out3 = 84*in0 - 74*in1 + 55*in2 - 29*in3
    smull       v20.4s, v16.4h, v4.h[1]     // 55 * in2
    smlsl       v20.4s, v22.4h, v4.h[2]     // - 74 * in1
    smlsl       v20.4s, v17.4h, v4.h[0]     // - 29 * in3
    smlal       v20.4s, v21.4h, v4.h[3]     // + 84 * in0

    // round, shift right by shift_stage2_idct and saturate to signed 16 bit
    sqrshrn     v28.4h, v6.4s,#shift_stage2_idct
    sqrshrn     v29.4h, v5.4s,#shift_stage2_idct
    sqrshrn     v30.4h, v7.4s,#shift_stage2_idct
    sqrshrn     v31.4h, v20.4s,#shift_stage2_idct
    ld1         {v19.s}[1],[x2],x5          // pred row 3

    // transpose back so that each register holds one output row
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v0.2s, v24.2s, v26.2s       // output row 0
    trn2        v2.2s, v24.2s, v26.2s       // output row 2
    trn1        v1.2s, v25.2s, v27.2s       // output row 1
    trn2        v3.2s, v25.2s, v27.2s       // output row 3
    // second stage output is now in v0, v1, v2, v3 (rows 0, 1, 2, 3)

    // ---- reconstruction -----------------------------------------------------
    // pack rows so they line up with the prediction registers:
    //   v0.8h = {row 0, row 1}  <->  v18.8b = {pred row 0, pred row 1}
    //   v2.8h = {row 2, row 3}  <->  v19.8b = {pred row 2, pred row 3}
    mov         v0.d[1],v1.d[0]
    mov         v2.d[1],v3.d[0]

    uaddw       v0.8h,  v0.8h ,  v18.8b     // residual(16 bit) + zero-extended pred(8 bit)
    sqxtun      v0.8b, v0.8h                // saturate to unsigned 8 bit: rows 0 and 1
    uaddw       v2.8h,  v2.8h ,  v19.8b     // residual(16 bit) + zero-extended pred(8 bit)
    sqxtun      v1.8b, v2.8h                // saturate to unsigned 8 bit: rows 2 and 3

    // store four bytes per output row, advancing by dst_strd each time
    st1         {v0.s}[0],[x3],x6
    st1         {v0.s}[1],[x3],x6
    st1         {v1.s}[0],[x3],x6
    st1         {v1.s}[1],[x3],x6

    ret

.size ihevc_itrans_recon_4x4_ttype1_av8, . - ihevc_itrans_recon_4x4_ttype1_av8
238
239
240
241
242
243
244
245
246
247