
    .arch armv7-a
    .text
    .global csc_ARGB8888_to_YUV420SP_NEON
    .type   csc_ARGB8888_to_YUV420SP_NEON, %function
csc_ARGB8888_to_YUV420SP_NEON:
    .fnstart

    @r0     pDstY
    @r1     pDstUV
    @r2     pSrcRGB
    @r3     nWidth
    @r4     pDstY2 = pDstY + nWidth
    @r5     pSrcRGB2 = pSrcRGB + nWidthx4
    @r6     temp7, nWidth16m
    @r7     temp6, accumulator
    @r8     temp5, nWidthTemp
    @r9     temp4, raw ARGB8888 word
    @r10    temp3, r,g,b
    @r11    temp2, immediate operand
    @r12    temp1, nHeight
    @r14    temp0, debugging pointer

    .equ CACHE_LINE_SIZE, 32
    .equ PRE_LOAD_OFFSET, 6
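
    @ Inferred C prototype (from the register map above; the exact header is an
    @ assumption, not part of this file):
    @   void csc_ARGB8888_to_YUV420SP_NEON(uint8_t *pDstY, uint8_t *pDstUV,
    @                                      uint8_t *pSrcRGB, int nWidth, int nHeight);
    @ Reference fixed-point BT.601 equations implemented below (a sketch that
    @ matches the coefficients loaded into q6-q15):
    @   Y = ( 66*R + 129*G +  25*B + (16<<8)  + 128) >> 8
    @   U = (-38*R -  74*G + 112*B + (128<<8) + 128) >> 8
    @   V = (112*R -  94*G -  18*B + (128<<8) + 128) >> 8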

    stmfd       sp!, {r4-r12,r14}       @ backup registers
    ldr         r12, [sp, #40]          @ load nHeight (first stack argument)
    @ldr        r14, [sp, #44]          @ load pTest
    add         r4, r0, r3              @ r4: pDstY2 = pDstY + nWidth
    add         r5, r2, r3, lsl #2      @ r5: pSrcRGB2 = pSrcRGB + nWidthx4
    sub         r8, r3, #16             @ r8: nWidthTmp = nWidth - 16

    @q0: temp1, R
    @q1: temp2, GB
    @q2: R
    @q3: G
    @q4: B
    @q5: temp3, output

    vmov.u16 q6, #66        @ Y coefficient assignment
    vmov.u16 q7, #129
    vmov.u16 q8, #25
    vmov.u16 q9, #0x8080    @ (128<<8) + 128, UV bias plus rounding

    vmov.u16 q10, #0x1000   @ build (16<<8) + 128, Y bias plus rounding
    vorr.u16 q10, #0x0080

    vmov.u16 q11, #38       @ applied as -38 via vmls
    vmov.u16 q12, #74       @ applied as -74 via vmls
    vmov.u16 q13, #112
    vmov.u16 q14, #94       @ applied as -94 via vmls
    vmov.u16 q15, #18       @ applied as -18 via vmls

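    @ The negative BT.601 coefficients are stored as positive magnitudes and
    @ applied with vmls (multiply-subtract). The biases in q9/q10 already carry
    @ the +128 rounding term, so a single >>8 finishes each channel.
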
LOOP_NHEIGHT2:
    stmfd       sp!, {r12}      @ save nHeight across the row-pair loop

LOOP_NWIDTH16:
    pld         [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    @------------------------------------------- YUV -------------------------------------------
    vmov.u16 q14, #94   @ reload 94 (q14/q15 are clobbered by the Y pass below)
    vmov.u16 q15, #18   @ reload 18
    vld4.8   {d0,d1,d2,d3}, [r2]!  @ load 8 ARGB pixels, de-interleaved by byte
    vld4.8   {d4,d5,d6,d7}, [r2]!  @ load 8 more: d0/d4=B, d1/d5=G, d2/d6=R, d3/d7=A


    vmov.u16 d8,d2      @ q4 <- R bytes (pixels 0-7, 8-15)
    vmov.u16 d9,d6
    vmov.u16 d10,d1     @ q5 <- G bytes
    vmov.u16 d11,d5
    vmov.u16 d12,d0     @ q6 <- B bytes
    vmov.u16 d13,d4

    vand.u16 q4,#0x00FF @ R of even pixels (low byte of each u16 lane)
    vand.u16 q5,#0x00FF @ G of even pixels
    vand.u16 q6,#0x00FF @ B of even pixels

    vmov.u16 q8,q9      @ CalcU()
    vmla.u16 q8,q6,q13  @ +112 * B[k]
    vmls.u16 q8,q4,q11  @ -38 * R[k]
    vmls.u16 q8,q5,q12  @ -74 * G[k]
    vshr.u16 q8,q8, #8  @ ((128<<8) + 128 + u) >> 8

    vmov.u16 q7,q9      @ CalcV()
    vmla.u16 q7,q4,q13  @ +112 * R[k]
    vmls.u16 q7,q5,q14  @ -94 * G[k]
    vmls.u16 q7,q6,q15  @ -18 * B[k]
    vshr.u16 q7,q7, #8  @ ((128<<8) + 128 + v) >> 8

    vtrn.8  q8,q7       @ interleave to U0 V0 U1 V1 ...
    vst1.8  {q8}, [r1]! @ write UV pairs after the linear Y plane

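    @ Chroma is taken from the even pixels only (the vand #0x00FF lanes); there
    @ is no 2x2 averaging. A C model of one U/V pair (a sketch; "calc_uv" is an
    @ illustrative name, not from this file):
    @   static void calc_uv(uint32_t argb, uint8_t *u, uint8_t *v) {
    @       int r = (argb >> 16) & 0xFF, g = (argb >> 8) & 0xFF, b = argb & 0xFF;
    @       *u = (uint8_t)((112*b -  38*r - 74*g + (128<<8) + 128) >> 8);
    @       *v = (uint8_t)((112*r -  94*g - 18*b + (128<<8) + 128) >> 8);
    @   }
    @ vtrn.8 then interleaves the U and V result vectors into the semi-planar
    @ byte order U0 V0 U1 V1 ...
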
    @-------------------------------------------- Y --------------------------------------------

    vmov.u16 q14, #66   @ load Y coefficients (clobbers the UV ones)
    vmov.u16 q15, #129
    vmov.u16 q8, #25

    @ CalcY_Y(), even pixels

    vmul.u16 q7,q4,q14  @ q7  = 66 * R[k]
    vmla.u16 q7,q5,q15  @ q7 += 129 * G[k]
    vmla.u16 q7,q6,q8   @ q7 += 25 * B[k]

    vadd.u16 q7,q7,q10  @ + (16<<8) + 128
    vshr.u16 q7,q7, #8

    vmov.u16 d8,d2      @ reload channel bytes
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vshr.u16 q4,q4,#8   @ R of odd pixels (high byte of each u16 lane)
    vshr.u16 q5,q5,#8   @ G of odd pixels
    vshr.u16 q6,q6,#8   @ B of odd pixels

    @ CalcY_Y(), odd pixels
    vmul.u16 q0,q4,q14  @ q0  = 66 * R[k]
    vmla.u16 q0,q5,q15  @ q0 += 129 * G[k]
    vmla.u16 q0,q6,q8   @ q0 += 25 * B[k]
    vadd.u16 q0,q0,q10  @ + (16<<8) + 128
    vshr.u16 q0,q0, #8

    vtrn.8  q7,q0       @ re-interleave even/odd Y bytes
    vst1.8  {q7}, [r0]! @ write 16 Y values to the Y plane

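    @ Sixteen pixels are processed per iteration by treating each u16 lane as a
    @ pair of adjacent pixels: vand #0x00FF isolates the even pixel's channel,
    @ vshr #8 the odd pixel's, and vtrn.8 re-interleaves the two Y result
    @ vectors back into pixel order. A C model of one pair (a sketch;
    @ "calc_y_pair" is an illustrative name, not from this file):
    @   static void calc_y_pair(const uint32_t argb[2], uint8_t y[2]) {
    @       for (int k = 0; k < 2; k++) {
    @           uint32_t r = (argb[k] >> 16) & 0xFF;
    @           uint32_t g = (argb[k] >>  8) & 0xFF;
    @           uint32_t b =  argb[k]        & 0xFF;
    @           y[k] = (uint8_t)((66*r + 129*g + 25*b + (16<<8) + 128) >> 8);
    @       }
    @   }
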
    @------------------------------- second row (pSrcRGB2) Y -----------------------------------

    pld         [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld4.8   {d0,d1,d2,d3}, [r5]! @ load 8 ARGB pixels, de-interleaved by byte
    vld4.8   {d4,d5,d6,d7}, [r5]! @ load 8 more

    vmov.u16 d8,d2      @ q4 <- R bytes
    vmov.u16 d9,d6
    vmov.u16 d10,d1     @ q5 <- G bytes
    vmov.u16 d11,d5
    vmov.u16 d12,d0     @ q6 <- B bytes
    vmov.u16 d13,d4

    vand.u16 q4,#0x00FF @ R of even pixels
    vand.u16 q5,#0x00FF @ G of even pixels
    vand.u16 q6,#0x00FF @ B of even pixels

    vmul.u16 q7,q4,q14  @ q7  = 66 * R[k]
    vmla.u16 q7,q5,q15  @ q7 += 129 * G[k]
    vmla.u16 q7,q6,q8   @ q7 += 25 * B[k]
    vadd.u16 q7,q7,q10  @ + (16<<8) + 128
    vshr.u16 q7,q7, #8

    vmov.u16 d8,d2      @ reload channel bytes
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vshr.u16 q4,q4,#8   @ R of odd pixels
    vshr.u16 q5,q5,#8   @ G of odd pixels
    vshr.u16 q6,q6,#8   @ B of odd pixels

    vmul.u16 q0,q4,q14  @ q0  = 66 * R[k]
    vmla.u16 q0,q5,q15  @ q0 += 129 * G[k]
    vmla.u16 q0,q6,q8   @ q0 += 25 * B[k]
    vadd.u16 q0,q0,q10  @ + (16<<8) + 128
    vshr.u16 q0,q0, #8

    vtrn.8  q7,q0       @ re-interleave even/odd Y bytes
    vst1.8  {q7}, [r4]! @ write 16 Y values to the second Y row

    subs r8,r8,#16      @ nWidth16 -= 16
    BPL LOOP_NWIDTH16   @ loop while nWidth16 >= 0
    @------------------------------------- unaligned tail --------------------------------------

    adds r8,r8,#16      @ restore the remaining pixel count
    BEQ NO_UNALIGNED    @ taken when nWidth is a multiple of 16
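    @ Tail loop: handles the nWidth % 16 remainder two pixels at a time in
    @ plain ARM code. A C model of one Y iteration (a sketch; "src"/"dstY" are
    @ illustrative names):
    @   uint32_t p0 = *src++, p1 = *src++;
    @   uint32_t acc = 66  * (((p0 >> 16) & 0xFF) | ( p1        & 0x00FF0000))
    @                + 129 * (((p0 >>  8) & 0xFF) | ((p1 <<  8) & 0x00FF0000))
    @                + 25  * (( p0        & 0xFF) | ((p1 << 16) & 0x00FF0000))
    @                + 0x10801080;
    @   *dstY++ = (uint8_t)(acc >> 8);   /* Y of pixel 0 */
    @   *dstY++ = (uint8_t)(acc >> 24);  /* Y of pixel 1 */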
LOOP_NWIDTH2:
    @------------------------------------ pDstRGB1 -- Y ----------------------------------------
    @stmfd sp!, {r14} @ backup r14

    ldr r9,  [r2], #4   @ load pixel 0 (ARGB word)
    ldr r12, [r2], #4   @ load pixel 1

    mov r10, r9,lsr #16     @ pixel 0 R to low byte
    mov r14, r12            @ pixel 1

    ldr r6, =0x000000FF
    and r10, r10, r6    @ R0: (rgbIn[k] >> 16) & 0xFF
    ldr r6, =0x00FF0000
    and r14, r14, r6    @ R1 kept at bits 16-23
    add r10,r10,r14     @ pack R0 | R1 in one word

    mov r11, #66        @ accumulator += R * 66
    mul r7, r10, r11

    mov r10, r9,lsr #8      @ pixel 0 G to low byte
    mov r14, r12,lsl #8     @ pixel 1 G to bits 16-23

    ldr r6, =0x000000FF
    and r10, r10, r6    @ G0
    ldr r6, =0x00FF0000
    and r14, r14, r6    @ G1
    add r10,r10,r14

    mov r11, #129       @ accumulator += G * 129
    mla r7, r10, r11, r7

    mov r10, r9             @ pixel 0 B in low byte
    mov r14, r12,lsl #16    @ pixel 1 B to bits 16-23

    ldr r6, =0x000000FF
    and r10, r10, r6    @ B0
    ldr r6, =0x00FF0000
    and r14, r14, r6    @ B1
    add r10,r10,r14

    mov r11, #25        @ accumulator += B * 25
    mla r7, r10, r11, r7

    ldr r6, =0x10801080 @ (16<<8) + 128 for both packed Y sums
    add  r7, r6

    lsr r7, #8
    strb r7, [r0],#1    @ store Y of pixel 0
    lsr r7,#16
    strb r7, [r0],#1    @ store Y of pixel 1
    @ldmfd sp!, {r14} @ load r14

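    @ Both Y sums travel packed in r7: pixel 0 in bits 0-15, pixel 1 in bits
    @ 16-31. The maximum sum (66+129+25)*255 + 0x1080 = 60324 fits in 16 bits,
    @ so no carry can cross between the two halves.
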
    @------------------------------------ pDstRGB1 -- UV ---------------------------------------

    mov r10, r9         @ pixel 0 again (U/V taken from even pixels only)
    ldr  r7,=0x00008080 @ (128<<8) + 128, U accumulator bias
    mov  r12,r7         @ same bias for the V accumulator

    ldr r6, =0x000000FF
    and r10, r10, r6    @ B: rgbIn[k] & 0xFF

    mov r11, #112       @ U accumulator += B * 112
    mla r7, r10, r11, r7

    mov r11, #18        @ V accumulator -= B * 18
    mul r11, r10, r11
    sub r12, r12, r11

    mov r10, r9, lsr #16    @ pixel 0 R
    ldr r6, =0x000000FF
    and r10, r10, r6    @ R: (rgbIn[k] >> 16) & 0xFF

    mov r11, #38        @ U accumulator -= R * 38
    mul r11, r10, r11
    sub r7, r7, r11

    mov r11, #112       @ V accumulator += R * 112
    mla r12, r10, r11, r12

    mov r10, r9,lsr #8  @ pixel 0 G
    ldr r6, =0x000000FF
    and r10, r10, r6    @ G: (rgbIn[k] >> 8) & 0xFF

    mov r11, #74        @ U accumulator -= G * 74
    mul r11, r10, r11
    sub r7, r7, r11

    mov r11, #94        @ V accumulator -= G * 94
    mul r11, r10, r11
    sub r12, r12, r11

    lsr r7, #8          @ >> 8
    strb r7, [r1],#1    @ store U
    lsr r12, #8         @ >> 8
    strb r12, [r1],#1   @ store V

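    @ Chroma is 2:1 subsampled in both directions here as well: U and V come
    @ from pixel 0 of the upper row only (r12, which held pixel 1, is
    @ repurposed as the V accumulator), matching the NEON path above, which
    @ also takes the even pixel rather than averaging.
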
    @------------------------------------ pDstRGB2 -- Y ----------------------------------------
    @stmfd sp!, {r14} @ backup r14

    ldr r9,  [r5], #4   @ load pixel 0 of the second row
    ldr r12, [r5], #4   @ load pixel 1

    mov r10, r9,lsr #16     @ pixel 0 R to low byte
    mov r14, r12            @ pixel 1

    ldr r6, =0x000000FF
    and r10, r10, r6    @ R0: (rgbIn[k] >> 16) & 0xFF
    ldr r6, =0x00FF0000
    and r14, r14, r6    @ R1 kept at bits 16-23
    add r10,r10,r14

    mov r11, #66        @ accumulator += R * 66
    mul r7, r10, r11

    mov r10, r9,lsr #8      @ pixel 0 G to low byte
    mov r14, r12,lsl #8     @ pixel 1 G to bits 16-23

    ldr r6, =0x000000FF
    and r10, r10, r6    @ G0
    ldr r6, =0x00FF0000
    and r14, r14, r6    @ G1
    add r10,r10,r14

    mov r11, #129       @ accumulator += G * 129
    mla r7, r10, r11, r7

    mov r10, r9             @ pixel 0 B in low byte
    mov r14, r12,lsl #16    @ pixel 1 B to bits 16-23

    ldr r6, =0x000000FF
    and r10, r10, r6    @ B0
    ldr r6, =0x00FF0000
    and r14, r14, r6    @ B1
    add r10,r10,r14

    mov r11, #25        @ accumulator += B * 25
    mla r7, r10, r11, r7

    ldr r6, =0x10801080 @ (16<<8) + 128 for both packed Y sums
    add  r7, r6
    lsr r7, #8

    strb r7, [r4],#1    @ store Y of pixel 0
    lsr r7,#16
    strb r7, [r4],#1    @ store Y of pixel 1
    @ldmfd sp!, {r14} @ load r14


    subs r8,r8,#2       @ nWidth2 -= 2
    BGT LOOP_NWIDTH2    @ loop while nWidth2 > 0

NO_UNALIGNED: @ reached directly when nWidth is a multiple of 16

    @-------------------------------------------------------------------------------------------
    sub r8, r3, #16           @ r8: nWidthTmp = nWidth - 16 (reset for the next row pair)
    add r0, r0, r3            @ pDstY += nWidth
    add r2, r2, r3, lsl #2    @ pSrcRGB += nWidthx4
    add r4, r4, r3            @ pDstY2 += nWidth
    add r5, r5, r3, lsl #2    @ pSrcRGB2 += nWidthx4
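    @ The inner loops already advanced each pointer by one row, so the adds
    @ above move pDstY/pSrcRGB (and their second-row twins) past the pair of
    @ rows just processed. pDstUV is not adjusted: one interleaved UV row
    @ (nWidth bytes) covers two source rows in the 4:2:0 layout.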

    ldmfd sp!, {r12}
    subs r12,r12,#2     @ nHeight -= 2
    BGT LOOP_NHEIGHT2   @ loop while nHeight > 0

    ldmfd       sp!, {r4-r12,pc}    @ restore registers and return
    .fnend