1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
@*
@* @brief
@*  Contains the ARM NEON function definition for converting a YUV 4:2:0
@*  semi-planar picture to 32-bit-per-pixel RGBA8888 output.
@*
@* @author
@*  ittiam
@*
@* @par list of functions:
@*  - ihevcd_fmt_conv_420sp_to_rgba8888_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/
@// Assemble-time constant kept from the original sources; nothing in this
@// file reads it.
    .equ DO1STROUNDING, 0

    @// The two notes below are kept as comments only (they look like
    @// armasm-style directives - presumably left over from a port; they have
    @// no effect here).
    @ ARM
    @
    @ PRESERVE8

@// Everything that follows is emitted into the code section, with the first
@// instruction aligned to a 2^2 = 4 byte boundary.
.text
.p2align 2
44
45
46
47
@/*****************************************************************************
@*                                                                            *
@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888_a9q()                *
@*                                                                            *
@*  Description      : Converts a YUV 4:2:0 semi-planar picture (one luma     *
@*                     plane plus one interleaved chroma plane) into 32-bit   *
@*                     per pixel output. Two luma rows that share one chroma  *
@*                     row are converted per pass, 16 pixels at a time.       *
@*                                                                            *
@*                     With U = even bytes and V = odd bytes of the chroma    *
@*                     row, and the coefficients taken as signed 16-bit       *
@*                     values, every output pixel is written as four bytes:   *
@*                       byte 0 = sat_u8(Y + (((U-128)*0x4092) >> 13))        *
@*                       byte 1 = sat_u8(Y + (((U-128)*0xF379                 *
@*                                           + (V-128)*0xE5F8) >> 13))        *
@*                       byte 2 = sat_u8(Y + (((V-128)*0x3311) >> 13))        *
@*                       byte 3 = 0                                           *
@*                     Chroma sample i is applied to pixels 2i and 2i+1 of    *
@*                     both rows of the pair.                                 *
@*                                                                            *
@*  Arguments        : R0           pubY        luma plane pointer            *
@*                     R1           pubUV       interleaved chroma pointer    *
@*                     R2           pusRGB      output pointer                *
@*                     R3           usWidth     width in pixels               *
@*                     [SP #0]      usHeight    height in rows                *
@*                     [SP #4]      usStrideY   luma stride in bytes          *
@*                     [SP #8]      usStrideUV  chroma stride in bytes        *
@*                     [SP #12]     usStrideRGB output stride in pixels       *
@*                     (SP offsets are relative to SP at function entry.      *
@*                      Any further stack arguments are not read.)            *
@*                                                                            *
@*  Values Returned  : None                                                   *
@*                                                                            *
@*  Register Usage   : R0 - R3, R5 - R11, R14                                 *
@*                     Q0 - Q12, Q14, Q15                                     *
@*                     R4 - R12 and D8 - D15 are saved and restored.          *
@*                                                                            *
@*  Stack Usage      : 104 Bytes (40 for R4-R12/LR + 64 for D8-D15)           *
@*                                                                            *
@*  Interruptibility : Interruptible                                          *
@*                                                                            *
@*  Known Limitations                                                         *
@*       Assumptions: Image Width:     Assumed to be a multiple of 16.        *
@*                                     Widths below 16 return immediately     *
@*                                     without writing any output.            *
@*                     Image Height:   Assumed to be even (height / 2 row     *
@*                                     pairs are converted). Heights below    *
@*                                     2 return immediately.                  *
@*                                                                            *
@*  Revision History :                                                        *
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
@*         07 06 2010   Varshita        Draft                                 *
@*         07 06 2010   Naveen Kr T     Completed                             *
@*         05 08 2013   Naveen K P      Modified for HEVC                     *
@*****************************************************************************/

@// Offsets of the stack-passed arguments from SP *after* the prologue has
@// pushed R4-R12,LR (10 * 4 = 40 bytes) and D8-D15 (8 * 8 = 64 bytes).
    .equ FMT_CONV_SAVED_BYTES,       (40 + 64)
    .equ FMT_CONV_HT_OFFSET,         (FMT_CONV_SAVED_BYTES + 0)
    .equ FMT_CONV_Y_STRD_OFFSET,     (FMT_CONV_SAVED_BYTES + 4)
    .equ FMT_CONV_UV_STRD_OFFSET,    (FMT_CONV_SAVED_BYTES + 8)
    .equ FMT_CONV_DST_STRD_OFFSET,   (FMT_CONV_SAVED_BYTES + 12)

@// ---------------------------------------------------------------------------
@// FMT_CONV_CENTRE_CHROMA
@//   In : D2,D3 = 16 interleaved chroma bytes, D1 = 128 in every byte lane
@//   Out: Q2 = (even bytes - 128), Q3 = (odd bytes - 128), 8 x s16 each
@//   D2/D3 are free to be reloaded once this has executed.
@// ---------------------------------------------------------------------------
.macro FMT_CONV_CENTRE_CHROMA
    VUZP.8      D2,D3                       @// D2 = even bytes (U), D3 = odd bytes (V)
    VSUBL.U8    Q2,D2,D1                    @// Q2 = U - 128
    VSUBL.U8    Q3,D3,D1                    @// Q3 = V - 128
.endm

@// ---------------------------------------------------------------------------
@// FMT_CONV_CHROMA_WEIGHTS
@//   In : Q2 = U - 128, Q3 = V - 128, D0 = {C1, C2, C3, C4}
@//   Out: Q4 = ((U-128)*C4) >> 13                (added to Y for byte 0)
@//        Q5 = ((V-128)*C1) >> 13                (added to Y for byte 2)
@//        Q6 = ((U-128)*C2 + (V-128)*C3) >> 13   (added to Y for byte 1)
@//   Clobbers Q7, Q10, Q11 as 32-bit temporaries.
@// ---------------------------------------------------------------------------
.macro FMT_CONV_CHROMA_WEIGHTS
    VMULL.S16   Q4,D4,D0[3]                 @// (U-128)*C4, lanes 0-3
    VMULL.S16   Q5,D5,D0[3]                 @// (U-128)*C4, lanes 4-7

    VMULL.S16   Q10,D6,D0[0]                @// (V-128)*C1, lanes 0-3
    VMULL.S16   Q11,D7,D0[0]                @// (V-128)*C1, lanes 4-7

    VMULL.S16   Q6,D4,D0[1]                 @// (U-128)*C2, lanes 0-3
    VMLAL.S16   Q6,D6,D0[2]                 @//   + (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]                 @// (U-128)*C2, lanes 4-7
    VMLAL.S16   Q7,D7,D0[2]                 @//   + (V-128)*C3

    @// saturating narrow with >> 13 back to 8 x s16 per weight
    VQSHRN.S32  D8,Q4,#13
    VQSHRN.S32  D9,Q5,#13                   @// Q4 complete
    VQSHRN.S32  D10,Q10,#13
    VQSHRN.S32  D11,Q11,#13                 @// Q5 complete
    VQSHRN.S32  D12,Q6,#13
    VQSHRN.S32  D13,Q7,#13                  @// Q6 complete
.endm

@// ---------------------------------------------------------------------------
@// FMT_CONV_ADD_LUMA y_even, y_odd
@//   In : y_even / y_odd = D registers holding the even / odd luma bytes of
@//        16 consecutive pixels, Q4-Q6 = chroma weights
@//   Out: Q7 / Q9 / Q8   = Y + Q4 / Q6 / Q5 for the even pixels
@//        Q10 / Q12 / Q11 = Y + Q4 / Q6 / Q5 for the odd pixels
@// ---------------------------------------------------------------------------
.macro FMT_CONV_ADD_LUMA y_even, y_odd
    VADDW.U8    Q7,Q4,\y_even
    VADDW.U8    Q8,Q5,\y_even
    VADDW.U8    Q9,Q6,\y_even

    VADDW.U8    Q10,Q4,\y_odd
    VADDW.U8    Q11,Q5,\y_odd
    VADDW.U8    Q12,Q6,\y_odd
.endm

@// ---------------------------------------------------------------------------
@// FMT_CONV_PACK_STORE dst
@//   In : Q7-Q12 as produced by FMT_CONV_ADD_LUMA, dst = output pointer reg
@//   Saturates the six sums to u8, interleaves them with a zero fourth byte
@//   into 16 pixels in raster order and stores 64 bytes, post-incrementing
@//   dst. Clobbers Q7, Q8, Q10, Q11.
@// ---------------------------------------------------------------------------
.macro FMT_CONV_PACK_STORE dst
    @// even pixels: bytes {0,1} in Q7, bytes {2,3} in Q8, then merge
    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0                      @// fourth byte of every pixel is 0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8                       @// Q7 = pixels 0,2,4,6  Q8 = pixels 8,10,12,14

    @// odd pixels, same layout in Q10/Q11
    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11                     @// Q10 = pixels 1,3,5,7  Q11 = pixels 9,11,13,15

    @// interleave even and odd pixels back into raster order
    VZIP.32     Q7,Q10                      @// Q7 = pixels 0-3   Q10 = pixels 4-7
    VZIP.32     Q8,Q11                      @// Q8 = pixels 8-11  Q11 = pixels 12-15

    VST1.32     D14,[\dst]!
    VST1.32     D15,[\dst]!
    VST1.32     D20,[\dst]!
    VST1.32     D21,[\dst]!
    VST1.32     D16,[\dst]!
    VST1.32     D17,[\dst]!
    VST1.32     D22,[\dst]!
    VST1.32     D23,[\dst]!
.endm

    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
ihevcd_fmt_conv_420sp_to_rgba8888_a9q:

    @// Save core registers, then the callee-saved NEON registers D8-D15
    @// (Q4-Q7), all of which are overwritten below.
    STMFD       SP!,{R4-R12,LR}
    VPUSH       {D8-D15}

    @// Register roles for the rest of the function:
    @//   R0  - luma pointer, upper row of the current row pair
    @//   R1  - interleaved chroma pointer
    @//   R2  - output pointer, upper row
    @//   R3  - picture width in pixels
    @//   R5  - remaining row pairs
    @//   R6  - luma stride while setting up, then 16-pixel group counter
    @//   R7  - chroma stride while setting up, then luma pointer, lower row
    @//   R8  - output pointer, lower row
    @//   R9  - output stride in pixels
    @//   R10 - luma stride   - width  (bytes to skip at the end of a row)
    @//   R11 - chroma stride - width
    @//   R14 - output stride - width  (in pixels)
    @//   D0  - coefficients C1..C4, D1 - 128 in every byte lane

    @// D0 = {C1, C2, C3, C4} = {0x3311, 0xF379, 0xE5F8, 0x4092}; the
    @// multiplies below treat these lanes as signed 16-bit values.
    MOVW        R10,#0x3311
    VMOV.16     D0[0],R10                   @// C1
    MOVW        R10,#0xF379
    VMOV.16     D0[1],R10                   @// C2
    MOVW        R10,#0xE5F8
    VMOV.16     D0[2],R10                   @// C3
    MOVW        R10,#0x4092
    VMOV.16     D0[3],R10                   @// C4

    @// D1 = 128 in every byte lane (chroma centring value)
    MOV         R10,#128
    VDUP.8      D1,R10

    @// Fetch the stack-passed arguments
    LDR         R5,[sp,#FMT_CONV_HT_OFFSET]         @// height
    LDR         R6,[sp,#FMT_CONV_Y_STRD_OFFSET]     @// luma stride
    LDR         R7,[sp,#FMT_CONV_UV_STRD_OFFSET]    @// chroma stride
    LDR         R9,[sp,#FMT_CONV_DST_STRD_OFFSET]   @// output stride (pixels)

    @// Nothing to convert when there is not at least one 16-pixel group and
    @// one row pair. Without this check either loop counter would start at
    @// zero and wrap around on its first decrement.
    CMP         R3,#16
    BLT         LABEL_YUV420SP_TO_RGB8888_EXIT
    CMP         R5,#2
    BLT         LABEL_YUV420SP_TO_RGB8888_EXIT

    @// End-of-row skips: offset = stride - width
    SUB         R10,R6,R3                   @// luma offset
    SUB         R11,R7,R3                   @// chroma offset (a chroma row is also width bytes)
    SUB         R14,R9,R3                   @// output offset in pixels

    @// Two rows are converted per pass
    MOV         R5,R5, LSR #1               @// height_cnt = height / 2

    @// Pointers to the lower row of the first row pair
    ADD         R7,R0,R6                    @// luma_next_row = luma + luma_stride
    ADD         R8,R2,R9,LSL #2             @// rgb_next_row  = rgb + rgb_stride * 4 bytes

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    @// Prime the pipeline with the first 16 pixels of this row pair:
    @// 16 chroma bytes (8 U/V pairs) and 16 luma bytes from each row.
    VLD1.8      {D2,D3},[R1]!

    MOV         R6,R3, LSR #4               @// width_cnt = width / 16

    @// VLD2 de-interleaves: D30/D28 = even pixels, D31/D29 = odd pixels
    VLD2.8      {D30,D31},[R0]!             @// upper row
    VLD2.8      {D28,D29},[R7]!             @// lower row

    @// The last group of every row pair is converted after the loop, so the
    @// loop body runs width_cnt - 1 times.
    SUBS        R6,R6,#1
    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    FMT_CONV_CENTRE_CHROMA

    @// D2/D3 are consumed; fetch the chroma of the next 16 pixels
    VLD1.8      {D2,D3},[R1]!
    PLD         [R1]

    FMT_CONV_CHROMA_WEIGHTS

    @// upper row
    FMT_CONV_ADD_LUMA D30,D31
    FMT_CONV_PACK_STORE R2

    @// lower row: the sums must be formed before D28-D31 are reloaded
    FMT_CONV_ADD_LUMA D28,D29

    @// fetch the luma of the next 16 pixels of both rows
    VLD2.8      {D30,D31},[R0]!
    VLD2.8      {D28,D29},[R7]!

    PLD         [R0]
    PLD         [R7]

    FMT_CONV_PACK_STORE R8

    SUBS        R6,R6,#1                    @// width_cnt -= 1
    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    @// Last 16 pixels of the row pair: same conversion, no further loads
    FMT_CONV_CENTRE_CHROMA
    FMT_CONV_CHROMA_WEIGHTS

    FMT_CONV_ADD_LUMA D30,D31
    FMT_CONV_PACK_STORE R2

    FMT_CONV_ADD_LUMA D28,D29
    FMT_CONV_PACK_STORE R8

    @// R7/R8 now point one past the end of the lower row; stepping over the
    @// end-of-row padding lands on the first row of the next pair.
    ADD         R0,R7,R10                   @// luma = luma_next + offset
    ADD         R2,R8,R14,LSL #2            @// rgb  = rgb_next + offset * 4 bytes

    @// The row below that is one full stride further on; the stride itself is
    @// no longer held in a register, so it is rebuilt as width + offset.
    ADD         R7,R0,R3                    @// luma_next = luma + width
    ADD         R8,R2,R3,LSL #2             @// rgb_next  = rgb + width * 4 bytes

    ADD         R1,R1,R11                   @// chroma pointer to the next chroma row

    ADD         R7,R7,R10                   @// luma_next = luma + width + offset
    ADD         R8,R8,R14,LSL #2            @// rgb_next  = rgb + (width + offset) * 4 bytes

    SUBS        R5,R5,#1                    @// height_cnt -= 1

    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

LABEL_YUV420SP_TO_RGB8888_EXIT:
    @// Restore in the reverse order of the prologue and return
    VPOP        {D8-D15}
    LDMFD       SP!,{R4-R12,PC}
449
450
451
452
@// Empty marker section: tells GNU linkers that this object does not need an
@// executable stack.
    .section .note.GNU-stack,"",%progbits
454
455