rsCpuIntrinsics_neon_Convolve.S revision c1e6eb07263cd21c42bfd08bb7789406dc863fdb
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
17e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
18e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
19e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams#include <machine/cpu-features.h>
20e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams#include <machine/asm.h>
21e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
/*
 * rsdIntrinsicConvolve3x3_K
 *
 * 3x3 convolution over three rows of 4-byte pixels.  Every loop iteration
 * reads 16 bytes from each row, advances each row pointer by 8 bytes and
 * writes 8 bytes (two pixels) to dst.
 *
 * In:
 *      r0        = dst
 *      r1        = y0 base pointer
 *      r2        = y1 base pointer
 *      r3        = y2 base pointer
 *      [sp]      = coeffs: signed 16-bit values; 32 bytes are read, the
 *                  first nine are used (d0[0] .. d2[0])
 *      [sp, #4]  = loop count (two output pixels per iteration); a count
 *                  of zero returns without touching dst
 *
 * Each accumulated 32-bit sum is shifted right by 8 and narrowed to
 * 16 bits (vshrn, no saturation), then saturated to unsigned 8 bits
 * (vqmovun).
 *
 * Clobbers: r0-r3, q0-q3, q8, q9, q13-q15, flags.
 */

ENTRY(rsdIntrinsicConvolve3x3_K)
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* The stack arguments sit above the 32 bytes of core registers
           and the 64 bytes of NEON registers saved just above. */
        ldr             r4, [sp, #32+64]        /* r4 = coeffs */
        vld1.16         {q0, q1}, [r4]          /* q0, q1 = coefficients */

        ldr             r4, [sp, #36+64]        /* r4 = loop count */

        /* The loop below is do-while shaped: with a zero count the
           subs/bne pair would wrap around, so leave early instead. */
        cmp             r4, #0
        beq             2f

        /* Row pointers advance by 8 bytes (two pixels) per iteration */
        mov             r5, #8

1:
        /* Load 16 bytes from each row, post-incrementing by r5 = 8 */
        vld1.8          {q13}, [r1], r5
        vld1.8          {q14}, [r2], r5
        vld1.8          {q15}, [r3], r5

        /* Prefetch hint for row data 8 bytes past the updated pointers */
        PLD         (r1, r5)
        PLD         (r2, r5)
        PLD         (r3, r5)

        /* Widen every byte to 16 bits */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

/*
        The widened source window, one d register per pixel, is
        d4,  d5,  d6,  d7
        d8,  d9,  d10, d11
        d12, d13, d14, d15
*/

        /* First output pixel: window columns 0..2 */
        vmull.s16       q8, d4, d0[0]
        vmlal.s16       q8, d5, d0[1]
        vmlal.s16       q8, d6, d0[2]
        vmlal.s16       q8, d8, d0[3]
        vmlal.s16       q8, d9, d1[0]
        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q8, d14, d2[0]

        /* Second output pixel: window columns 1..3 */
        vmull.s16       q9, d5, d0[0]
        vmlal.s16       q9, d6, d0[1]
        vmlal.s16       q9, d7, d0[2]
        vmlal.s16       q9, d9, d0[3]
        vmlal.s16       q9, d10, d1[0]
        vmlal.s16       q9, d11, d1[1]
        vmlal.s16       q9, d13, d1[2]
        vmlal.s16       q9, d14, d1[3]
        vmlal.s16       q9, d15, d2[0]

        /* Drop the low 8 bits of each sum and pack both pixels into q8 */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Saturate to unsigned bytes and store the two pixels */
        vqmovun.s16     d16, q8
        vst1.8          d16, [r0]!

        /* Are we done yet? */
        subs            r4, r4, #1
        bne             1b

2:
        /* We're done, bye! */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)
106e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
/*
 * rsdIntrinsicColorMatrix4x4_K
 *
 * Multiplies 4-byte pixels by a 4x4 matrix of signed 16-bit coefficients.
 * Each loop iteration reads four pixels (16 bytes) from src and writes
 * four pixels (16 bytes) to dst.
 *
 * In:
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix, 16 signed 16-bit values m[0..15]
 *      r3 = length, counted in loop iterations (four pixels each); zero
 *           returns immediately
 *
 * For input channels c0..c3 (R, G, B, A) the output channel j is
 *      (c0*m[j] + c1*m[4+j] + c2*m[8+j] + c3*m[12+j]) >> 8
 * narrowed to 16 bits without saturation (vshrn) and then saturated to
 * unsigned 8 bits (vqmovun).
 *
 * Clobbers: r0-r3, q0-q3, q8-q15, flags.
 */
ENTRY(rsdIntrinsicColorMatrix4x4_K)
        /* Leaf routine: only r0-r3, q0-q3 and q8-q15 are written, so no
           callee-saved register needs to be preserved and no stack frame
           is required. */

        /* The loop is do-while shaped; a zero length would wrap the
           counter, so return before doing any work. */
        cmp             r3, #0
        beq             2f

        vld1.16         {q2}, [r2]!             /* d4 = m[0..3],  d5 = m[4..7]   */
        vld1.16         {q3}, [r2]!             /* d6 = m[8..11], d7 = m[12..15] */

1:
        /* De-interleave four pixels: lane n of d0/d1/d2/d3 receives the
           four bytes of pixel n */
        vld4.8          {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8          {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8          {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8          {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen to 16 bits; only the low halves (d24/d26/d28/d30) hold
           the four pixels loaded above */
        vmovl.u8        q12, d0  /* R */
        vmovl.u8        q13, d1  /* G */
        vmovl.u8        q14, d2  /* B */
        vmovl.u8        q15, d3  /* A */

        /* q8..q11 accumulate output channels 0..3 */
        vmull.s16       q8,  d24, d4[0]
        vmull.s16       q9,  d24, d4[1]
        vmull.s16       q10, d24, d4[2]
        vmull.s16       q11, d24, d4[3]

        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q9,  d26, d5[1]
        vmlal.s16       q10, d26, d5[2]
        vmlal.s16       q11, d26, d5[3]

        vmlal.s16       q8,  d28, d6[0]
        vmlal.s16       q9,  d28, d6[1]
        vmlal.s16       q10, d28, d6[2]
        vmlal.s16       q11, d28, d6[3]

        vmlal.s16       q8,  d30, d7[0]
        vmlal.s16       q9,  d30, d7[1]
        vmlal.s16       q10, d30, d7[2]
        vmlal.s16       q11, d30, d7[3]

        /* Drop the low 8 bits of every sum */
        vshrn.i32       d24, q8, #8
        vshrn.i32       d26, q9, #8
        vshrn.i32       d28, q10, #8
        vshrn.i32       d30, q11, #8

        /* Saturate to unsigned bytes; only lanes 0..3 are stored below */
        vqmovun.s16     d0, q12
        vqmovun.s16     d1, q13
        vqmovun.s16     d2, q14
        vqmovun.s16     d3, q15

        /* Re-interleave and store the four pixels */
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicColorMatrix4x4_K)
173e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
/*
 * rsdIntrinsicColorMatrix3x3_K
 *
 * Multiplies the first three channels of 4-byte pixels by a 3x3 matrix of
 * signed 16-bit coefficients; the fourth byte of every pixel is copied
 * through unchanged.  Each loop iteration handles four pixels (16 bytes).
 *
 * In:
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix; 16 signed 16-bit values are read, of which
 *           m[0..2], m[4..6] and m[8..10] are used
 *      r3 = length, counted in loop iterations (four pixels each); zero
 *           returns immediately
 *
 * For input channels c0..c2 the output channel j (0..2) is
 *      (c0*m[j] + c1*m[4+j] + c2*m[8+j]) >> 8
 * narrowed to 16 bits without saturation (vshrn) and then saturated to
 * unsigned 8 bits (vqmovun).
 *
 * Clobbers: r0-r3, q0-q3, q8-q10, q12-q14, flags.
 */
ENTRY(rsdIntrinsicColorMatrix3x3_K)
        /* Leaf routine: only r0-r3, q0-q3, q8-q10 and q12-q14 are written,
           so no callee-saved register needs to be preserved and no stack
           frame is required. */

        /* The loop is do-while shaped; a zero length would wrap the
           counter, so return before doing any work. */
        cmp             r3, #0
        beq             2f

        vld1.16         {q2}, [r2]!             /* d4 = m[0..3],  d5 = m[4..7]   */
        vld1.16         {q3}, [r2]!             /* d6 = m[8..11], d7 is not used */

1:
        /* De-interleave four pixels: lane n of d0/d1/d2/d3 receives the
           four bytes of pixel n.  d3 is left untouched until the store. */
        vld4.8          {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8          {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8          {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8          {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen the three transformed channels to 16 bits */
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2

        /* q8..q10 accumulate output channels 0..2 */
        vmull.s16       q8,  d24, d4[0]
        vmull.s16       q9,  d24, d4[1]
        vmull.s16       q10, d24, d4[2]

        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q9,  d26, d5[1]
        vmlal.s16       q10, d26, d5[2]

        vmlal.s16       q8,  d28, d6[0]
        vmlal.s16       q9,  d28, d6[1]
        vmlal.s16       q10, d28, d6[2]

        /* Drop the low 8 bits of every sum */
        vshrn.i32       d24, q8, #8
        vshrn.i32       d26, q9, #8
        vshrn.i32       d28, q10, #8

        /* Saturate to unsigned bytes; only lanes 0..3 are stored below */
        vqmovun.s16     d0, q12
        vqmovun.s16     d1, q13
        vqmovun.s16     d2, q14

        /* Re-interleave with the original fourth channel and store */
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicColorMatrix3x3_K)
22940945e01597adaed9e728a14a17bf4a35452abd5Jason Sams
/*
 * rsdIntrinsicColorMatrixDot_K
 *
 * Computes one weighted sum of the first three channels of each 4-byte
 * pixel and writes that value to all three of those channels; the fourth
 * byte of every pixel is copied through unchanged.  Each loop iteration
 * handles four pixels (16 bytes).
 *
 * In:
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix; 16 signed 16-bit values are read, of which
 *           m[0], m[4] and m[8] are used
 *      r3 = length, counted in loop iterations (four pixels each); zero
 *           returns immediately
 *
 * For input channels c0..c2 the value written is
 *      (c0*m[0] + c1*m[4] + c2*m[8]) >> 8
 * narrowed to 16 bits without saturation (vshrn) and then saturated to
 * unsigned 8 bits (vqmovun).
 *
 * Clobbers: r0-r3, q0-q3, q8, q12-q14, flags.
 */
ENTRY(rsdIntrinsicColorMatrixDot_K)
        /* Leaf routine: only r0-r3, q0-q3, q8 and q12-q14 are written, so
           no callee-saved register needs to be preserved and no stack
           frame is required. */

        /* The loop is do-while shaped; a zero length would wrap the
           counter, so return before doing any work. */
        cmp             r3, #0
        beq             2f

        vld1.16         {q2}, [r2]!             /* d4 = m[0..3],  d5 = m[4..7]   */
        vld1.16         {q3}, [r2]!             /* d6 = m[8..11], d7 is not used */

1:
        /* De-interleave four pixels: lane n of d0/d1/d2/d3 receives the
           four bytes of pixel n.  d3 is left untouched until the store. */
        vld4.8          {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8          {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8          {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8          {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen the three source channels to 16 bits */
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2

        /* Single weighted sum, scaled down and saturated to a byte */
        vmull.s16       q8,  d24, d4[0]
        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q8,  d28, d6[0]
        vshrn.i32       d24, q8, #8
        vqmovun.s16     d0, q12

        /* Replicate the result into the second and third channels */
        vmov.u8         d1, d0
        vmov.u8         d2, d0

        /* Re-interleave with the original fourth channel and store */
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicColorMatrixDot_K)
273e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
274e78e514f3f209e594767e8ebc64f5df4be5b0b41Jason Sams
/*
 * rsdIntrinsicBlurVF_K
 *
 * Vertical blur pass producing float output.  C reference named by the
 * original author:
 *
 * static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
 *                   const float* gPtr, int iradius, int x1, int x2)
 *
 * In:
 *      r0        = out, receives two float4 values (32 bytes) per
 *                  outer-loop iteration
 *      r1        = pin, source bytes (4 bytes per pixel)
 *      r2        = stride, byte distance between successive rows
 *      r3        = gptr, float coefficients, one per row
 *      [sp]      = ct, number of rows/coefficients accumulated per output;
 *                  must be non-zero (the inner loop is do-while shaped)
 *      [sp, #4]  = x1, first pixel index
 *      [sp, #8]  = x2, end pixel index (exclusive)
 *
 * Pixels are processed in pairs, so x1 advances by 2 per iteration.  An
 * empty range (x1 >= x2, signed) returns without writing anything.  If
 * x2 - x1 is odd the final iteration still handles a full pair.
 *
 * Clobbers: r0, q0, q1, q3, q10, q11, flags (r4-r8, r10, r11 and q4-q7 are
 * saved and restored).
 */
ENTRY(rsdIntrinsicBlurVF_K)
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* The stack arguments sit above the 32 bytes of core registers
           and the 64 bytes of NEON registers saved just above. */
        ldr             r4, [sp, #32+64]        /* r4 = ct */
        ldr             r5, [sp, #32+64 + 4]    /* r5 = x1 */
        ldr             r6, [sp, #32+64 + 8]    /* r6 = x2 */

        /* Nothing to do for an empty range; without this check the
           do-while loop below would run past x2. */
        cmp             r5, r6
        bge             3f

1:
        veor            q10, q10, q10           /* float4 blurredPixel = 0;  (pixel x1)     */
        veor            q11, q11, q11           /* float4 blurredPixel = 0;  (pixel x1 + 1) */
        add             r7, r1, r5, lsl #2      /* const uchar *pi = ptrIn + x1 * 4; */
        mov             r10, r3                 /* restart the coefficient pointer */

        mov             r11, r4                 /* rows left to accumulate */

2:
        vld1.32         {d2}, [r7]              /* 8 bytes = two pixels of this row */
        vmovl.u8        q1, d2                  /* bytes -> 16 bit */
        vmovl.u16       q3, d2                  /* first pixel  -> 32 bit */
        vmovl.u16       q4, d3                  /* second pixel -> 32 bit */
        vcvt.f32.s32    q3, q3
        vcvt.f32.s32    q4, q4
        vld1.32         {d0[0]}, [r10]!         /* next coefficient */
        add             r7, r7, r2              /* step down one row */
        vmla.f32        q10, q3, d0[0]
        vmla.f32        q11, q4, d0[0]
        subs            r11, r11, #1
        bne             2b

        vst1.32         {q10}, [r0]!
        vst1.32         {q11}, [r0]!
        add             r5, r5, #2
        cmp             r5, r6
        blt             1b                      /* stops at or past x2 */

3:
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicBlurVF_K)
328e78e514f3f209e594767e8ebc64f5df4be5b0b41Jason Sams
/*
 * rsdIntrinsicBlurHF_K
 *
 * Horizontal blur pass: combines ct consecutive 16-byte float4 inputs
 * with ct float coefficients and writes one 4-byte pixel per output
 * position.
 *
 * In:
 *      r0        = out, receives 4 bytes per output pixel
 *      r1        = pin, float4 input (16 bytes per element)
 *      r2        = gptr, float coefficients
 *      r3        = ct, number of taps; must be odd and at least 3, because
 *                  the first tap is peeled off and the rest are consumed
 *                  two at a time by a do-while loop
 *      [sp]      = x1, first pixel index
 *      [sp, #4]  = x2, end pixel index (exclusive)
 *
 * The float sum is converted with vcvt.s32.f32 and then narrowed twice
 * with vmovn, which does not saturate.  An empty range (x1 >= x2, signed)
 * returns without writing anything.
 *
 * Clobbers: r0, q0-q3, flags (r4-r8, r10, r11 are saved and restored).
 */
ENTRY(rsdIntrinsicBlurHF_K)
        /* q4-q7 are never written here, so only the core registers are
           saved; the stack arguments therefore start 32 bytes above sp. */
        push            {r4-r8, r10, r11, lr}

        ldr             r4, [sp, #32]           /* r4 = x1 */
        ldr             r5, [sp, #32 + 4]       /* r5 = x2 */

        /* Nothing to do for an empty range; without this check the
           do-while loop below would run past x2. */
        cmp             r4, r5
        bge             3f

1:
        add             r7, r1, r4, lsl #4      /* pi = pin + x1 * 16 (one float4 per pixel) */
        mov             r10, r2                 /* restart the coefficient pointer */
        mov             r11, r3                 /* taps left to accumulate */

        /* First tap initialises the accumulator */
        vld1.32         {q1}, [r7]!
        vld1.32         {d6[0]}, [r10]!
        vmul.f32        q0, q1, d6[0]
        sub             r11, r11, #1

2:
        /* Remaining taps, two per pass */
        vld1.32         {q1}, [r7]!
        vld1.32         {q2}, [r7]!
        vld1.32         {d6[0]}, [r10]!
        vld1.32         {d6[1]}, [r10]!
        vmla.f32        q0, q1, d6[0]
        vmla.f32        q0, q2, d6[1]
        subs            r11, r11, #2
        bne             2b

        /* float4 -> four bytes packed in the low word of d0 */
        vcvt.s32.f32    q0, q0
        vmovn.u32       d0, q0
        vmovn.u16       d0, q0

        vst1.32         {d0[0]}, [r0]!
        add             r4, r4, #1
        cmp             r4, r5
        blt             1b                      /* stops at x2 */

3:
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicBlurHF_K)
380e78e514f3f209e594767e8ebc64f5df4be5b0b41Jason Sams
381915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams/*
382915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r0 = dst
383915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r1 = Y
384915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r2 = VU
385915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r3 = length (pixels / 8)
386915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r4 = sp, params
387915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams
388915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        This function converts 8 pixels per iteration
389915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams*/
/*
        Register use, as established by the code below:
          r0 = output pointer, advanced by 32 bytes per iteration
          r1 = luma pointer, advanced by 8 bytes per iteration
          r2 = interleaved chroma pointer, advanced by 8 bytes per iteration
          r3 = iteration count; it is tested after the decrement, so it
               must be at least 1 on entry
          first stack argument = pointer to three consecutive 8 x 16-bit
               vectors: multipliers, luma offset, chroma offset
*/
ENTRY(rsdIntrinsicYuv_K)
        push            {r4-r8, r10, r11, lr}   // 8 core registers = 32 bytes
        vpush           {q4-q7}                 // 4 quad registers = 64 bytes

        // The first stack argument now sits above the 32 + 64 bytes saved above.
        ldr             r4, [sp, #32+64]
        vld1.16         {q2}, [r4]!             // mults
        vld1.16         {q3}, [r4]!             // y offset
        vld1.16         {q4}, [r4]!             // 128
        // d3 is stored as the fourth byte of every output pixel.
        // NOTE(review): byte 1 of d5 is the high byte of multiplier lane
        // d5[0]; presumably that is 0xFF for the table in use - confirm
        // against the caller.
        vdup.8          d3, d5[1]

1:
        vld1.8          {d10}, [r1]!            // 8 luma bytes
        vld1.8          {d12}, [r2]!            // 8 chroma bytes = 4 interleaved pairs
        vmovl.u8        q5, d10                 // Y at .16
        vmovl.u8        q6, d12                 // vu at .16

        vsub.i16        q5, q5, q3              // remove the luma offset
        vsub.i16        q6, q6, q4              // remove the chroma offset

        // Split the interleaved pairs: d12 = even samples in natural order
        // (0,1,2,3), d13 = odd samples in natural order.
        // This has to be an unzip. A transpose (vtrn.16) would leave the
        // samples ordered 0,2,1,3 and, after the duplication below, pixels
        // 2-3 and 4-5 would receive each other's chroma.
        vuzp.16         d12, d13
        // Duplicate every sample for two neighbouring pixels:
        //   d12 = s0,s0,s1,s1 (pixels 0-3)   d14 = s2,s2,s3,s3 (pixels 4-7)
        // and likewise d13 / d15 for the odd samples.
        vmov            q7, q6
        vtrn.16         d12, d14
        vtrn.32         d12, d14
        vtrn.16         d13, d15
        vtrn.32         d13, d15

        // Every output channel starts from the same luma term.
        vmull.s16       q8, d10, d4[0]          // pixels 0-3
        vmull.s16       q11, d11, d4[0]         // pixels 4-7
        vmov            q9, q8
        vmov            q10, q8
        vmov            q12, q11
        vmov            q13, q11

        // Chroma contributions, pixels 0-3 (q8 / q9 / q10 -> output bytes 0 / 1 / 2).
        vmlal.s16       q8,  d12, d4[1]
        vmlal.s16       q9,  d12, d5[0]
        vmlal.s16       q10, d13, d4[3]
        vmlal.s16       q9,  d13, d4[2]

        // Chroma contributions, pixels 4-7 (q11 / q12 / q13).
        vmlal.s16       q11, d14, d4[1]
        vmlal.s16       q12, d14, d5[0]
        vmlal.s16       q13, d15, d4[3]
        vmlal.s16       q12, d15, d4[2]


        // Pixels 0-3: drop the 8 fractional bits, saturate to unsigned bytes.
        // Only the low d register of q8/q9/q10 holds fresh data, so only
        // lanes 0-3 of d0/d1/d2 are stored.
        vshrn.i32       d16, q8, #8
        vshrn.i32       d18, q9, #8
        vshrn.i32       d20, q10, #8
        vqmovun.s16     d0, q8
        vqmovun.s16     d1, q9
        vqmovun.s16     d2, q10
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        // Pixels 4-7: same narrowing, reusing d16/d18/d20 as staging.
        vshrn.i32       d16, q11, #8
        vshrn.i32       d18, q12, #8
        vshrn.i32       d20, q13, #8
        vqmovun.s16     d0, q8
        vqmovun.s16     d1, q9
        vqmovun.s16     d2, q10
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicYuv_K)
462915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams
/* Convolve 5x5 */

/*
        Arguments on entry:
          r0              = dst
          r1              = y0 base pointer   (row y - 2)
          r2              = y1 base pointer   (row y - 1)
          r3              = y2 base pointer   (row y)
          1st stack word  = y3 base pointer   (row y + 1)
          2nd stack word  = y4 base pointer   (row y + 2)
          3rd stack word  = coeffs
          4th stack word  = length

        Register use inside the loop:
          r4 = y3, r5 = y4, r6 = remaining iterations, r7 = constant 8

        Each iteration reads 24 bytes (six 4-byte pixels) from every row,
        writes 8 bytes (two pixels) to dst and advances all six pointers
        by 8 bytes. The count is tested after the decrement, so it must be
        at least 1 on entry.
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        push            {r4-r7, lr}             // 5 core registers = 20 bytes
        vpush           {q4-q7}                 // 4 quad registers = 64 bytes

        // Post-increment step shared by all row loads.
        mov             r7, #8

        // Stack arguments sit above the 20 + 64 bytes saved above.
        ldr             r4, [sp, #20 + 64]      // y3
        ldr             r5, [sp, #24 + 64]      // y4
        ldr             r6, [sp, #28 + 64]      // coefficient pointer (temporary)

        // Coefficients as 16-bit lanes: c0-c15 in d0-d3, c16-c27 in d4-d6.
        // Only c0-c24 are used, but 28 halfwords (56 bytes) are read.
        vld1.16         {d0, d1, d2, d3}, [r6]!
        vld1.16         {d4, d5, d6}, [r6]

        // r6 is reused as the loop counter from here on.
        ldr             r6, [sp, #32 + 64]

1:
        // ---- rows y - 2 and y - 1 ----
        vld1.8          {d24, d25, d26}, [r1], r7       // row y - 2
        vld1.8          {d27, d28, d29}, [r2], r7       // row y - 1

        // Prefetch hints relative to the already advanced pointers.
        PLD             (r1, r7)
        PLD             (r2, r7)

        // Widen to 16 bit; afterwards each d register is one 4-channel pixel:
        //   first row  -> d18 .. d23 (pixels 0-5)
        //   second row -> d24 .. d29 (pixels 0-5)
        // The widening order matters: every source register is read before
        // it is overwritten as a destination.
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

        // q4 = output pixel 0: taps on pixels 0-4, coefficients c0-c9.
        vmull.s16       q4, d18, d0[0]
        vmlal.s16       q4, d19, d0[1]
        vmlal.s16       q4, d20, d0[2]
        vmlal.s16       q4, d21, d0[3]
        vmlal.s16       q4, d22, d1[0]

        vmlal.s16       q4, d24, d1[1]
        vmlal.s16       q4, d25, d1[2]
        vmlal.s16       q4, d26, d1[3]
        vmlal.s16       q4, d27, d2[0]
        vmlal.s16       q4, d28, d2[1]

        // q5 = output pixel 1: same coefficients, taps on pixels 1-5.
        vmull.s16       q5, d19, d0[0]
        vmlal.s16       q5, d20, d0[1]
        vmlal.s16       q5, d21, d0[2]
        vmlal.s16       q5, d22, d0[3]
        vmlal.s16       q5, d23, d1[0]

        vmlal.s16       q5, d25, d1[1]
        vmlal.s16       q5, d26, d1[2]
        vmlal.s16       q5, d27, d1[3]
        vmlal.s16       q5, d28, d2[0]
        vmlal.s16       q5, d29, d2[1]


        // ---- rows y and y + 1 ----
        vld1.8          {d24, d25, d26}, [r3], r7       // row y
        vld1.8          {d27, d28, d29}, [r4], r7       // row y + 1

        PLD             (r3, r7)
        PLD             (r4, r7)

        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

        // Output pixel 0, coefficients c10-c19.
        vmlal.s16       q4, d18, d2[2]
        vmlal.s16       q4, d19, d2[3]
        vmlal.s16       q4, d20, d3[0]
        vmlal.s16       q4, d21, d3[1]
        vmlal.s16       q4, d22, d3[2]

        vmlal.s16       q4, d24, d3[3]
        vmlal.s16       q4, d25, d4[0]
        vmlal.s16       q4, d26, d4[1]
        vmlal.s16       q4, d27, d4[2]
        vmlal.s16       q4, d28, d4[3]

        // Output pixel 1, coefficients c10-c19.
        vmlal.s16       q5, d19, d2[2]
        vmlal.s16       q5, d20, d2[3]
        vmlal.s16       q5, d21, d3[0]
        vmlal.s16       q5, d22, d3[1]
        vmlal.s16       q5, d23, d3[2]

        vmlal.s16       q5, d25, d3[3]
        vmlal.s16       q5, d26, d4[0]
        vmlal.s16       q5, d27, d4[1]
        vmlal.s16       q5, d28, d4[2]
        vmlal.s16       q5, d29, d4[3]

        // ---- row y + 2 ----
        vld1.8          {d24, d25, d26}, [r5], r7       // row y + 2

        PLD             (r5, r7)

        // Only one row this time: pixels 0-5 land in d18 .. d23.
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26

        // Output pixel 0, coefficients c20-c24.
        vmlal.s16       q4, d18, d5[0]
        vmlal.s16       q4, d19, d5[1]
        vmlal.s16       q4, d20, d5[2]
        vmlal.s16       q4, d21, d5[3]
        vmlal.s16       q4, d22, d6[0]

        // Output pixel 1, coefficients c20-c24.
        vmlal.s16       q5, d19, d5[0]
        vmlal.s16       q5, d20, d5[1]
        vmlal.s16       q5, d21, d5[2]
        vmlal.s16       q5, d22, d5[3]
        vmlal.s16       q5, d23, d6[0]




        // 32 -> 16 bit, dropping the 8 fractional bits of the products.
        // Both halves end up in q4: d8 = pixel 0, d9 = pixel 1.
        vshrn.i32       d8, q4, #8
        vshrn.i32       d9, q5, #8

        // 16 -> 8 bit with unsigned saturation; d8 now holds both pixels.
        vqmovun.s16     d8, q4

        // Write the two pixels and advance dst by 8 bytes.
        vst1.8          {d8}, [r0]!

        // Loop until the count reaches zero.
        subs            r6, r6, #1
        bne             1b

        vpop            {q4-q7}
        pop             {r4-r7, lr}
        bx              lr

END(rsdIntrinsicConvolve5x5_K)
640fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
641fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
642fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
643fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        dst = src + dst * (1.0 - src.a)

        r0 = dst
        r1 = src
        r2 = length, counted in loop iterations; each iteration blends
             eight 4-byte pixels (32 bytes). It is tested after the
             decrement, so it must be at least 1 on entry.

        In 8-bit fixed point each channel is computed as
             (src * 256 + dst * (255 - src.a)) >> 8
        using wrapping 16-bit arithmetic (no saturation).
*/
ENTRY(rsdIntrinsicBlendSrcOver_K)
        .save           {r4, lr}
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        mov             r4, #255
        vdup.16         q7, r4                  // q7 = 255 in every 16-bit lane

        // r0 is advanced by the dst loads, so the stores use a second
        // pointer that starts at the same address.
        mov             r4, r0
1:

        /* src */
        // One de-interleaving load fetches 8 pixels: byte 0 of every pixel
        // goes to d0, byte 1 to d1, byte 2 to d2 and byte 3 to d3. This
        // fills the same lanes from the same 32 bytes as eight single-lane
        // loads would.
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vshll.u8        q12, d0, #8             // src channel 0 * 256
        vshll.u8        q13, d1, #8             // src channel 1 * 256
        vshll.u8        q14, d2, #8             // src channel 2 * 256
        vmovl.u8        q6, d3
        vsub.i16        q6, q7, q6              // q6 = 1 - src.a  (255 - a)
        vshll.u8        q15, d3, #8             // src alpha * 256

        /* dst */
        vld4.8          {d0, d1, d2, d3}, [r0]!
        vmovl.u8        q8, d0
        vmovl.u8        q9, d1
        vmovl.u8        q10, d2
        vmovl.u8        q11, d3

        // accumulator += dst * (255 - src.a), per channel
        vmla.i16        q12, q8, q6
        vmla.i16        q13, q9, q6
        vmla.i16        q14, q10, q6
        vmla.i16        q15, q11, q6

        // Back to 8 bit by dropping the low byte of every lane.
        vshrn.i16       d0, q12, #8
        vshrn.i16       d1, q13, #8
        vshrn.i16       d2, q14, #8
        vshrn.i16       d3, q15, #8
        // Re-interleave and write the 8 pixels over the dst data just read.
        vst4.8          {d0, d1, d2, d3}, [r4]!

        subs            r2, r2, #1
        bne             1b

        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(rsdIntrinsicBlendSrcOver_K)
717fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        dst = dst + src * (1.0 - dst.a)

        r0 = dst
        r1 = src
        r2 = length, counted in loop iterations; each iteration blends
             eight 4-byte pixels (32 bytes). It is tested after the
             decrement, so it must be at least 1 on entry.

        In 8-bit fixed point each channel is computed as
             (dst * 256 + src * (255 - dst.a)) >> 8
        using wrapping 16-bit arithmetic (no saturation).
*/
ENTRY(rsdIntrinsicBlendDstOver_K)
        .save           {r4, lr}
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        mov             r4, #255
        vdup.16         q7, r4                  // q7 = 255 in every 16-bit lane

        // r0 is advanced by the dst loads, so the stores use a second
        // pointer that starts at the same address.
        mov             r4, r0
1:

        /* src */
        // One de-interleaving load fetches 8 pixels: byte 0 of every pixel
        // goes to d0, byte 1 to d1, byte 2 to d2 and byte 3 to d3. This
        // fills the same lanes from the same 32 bytes as eight single-lane
        // loads would.
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2
        vmovl.u8        q15, d3

        /* dst */
        vld4.8          {d0, d1, d2, d3}, [r0]!
        vshll.u8        q8, d0, #8              // dst channel 0 * 256
        vshll.u8        q9, d1, #8              // dst channel 1 * 256
        vshll.u8        q10, d2, #8             // dst channel 2 * 256
        vmovl.u8        q6, d3
        vsub.i16        q6, q7, q6              // q6 = 1 - dst.a  (255 - a)
        vshll.u8        q11, d3, #8             // dst alpha * 256


        // accumulator += src * (255 - dst.a), per channel
        vmla.i16        q8, q12, q6
        vmla.i16        q9, q13, q6
        vmla.i16        q10, q14, q6
        vmla.i16        q11, q15, q6

        // Back to 8 bit by dropping the low byte of every lane.
        vshrn.i16       d0, q8, #8
        vshrn.i16       d1, q9, #8
        vshrn.i16       d2, q10, #8
        vshrn.i16       d3, q11, #8
        // Re-interleave and write the 8 pixels over the dst data just read.
        vst4.8          {d0, d1, d2, d3}, [r4]!

        subs            r2, r2, #1
        bne             1b

        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(rsdIntrinsicBlendDstOver_K)
792fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        rsdIntrinsicBlendSrcIn_K

        dst = src * dst.a

        Every channel is computed as (src * dst.a) >> 8 on unsigned 8-bit
        data widened to 16 bits, i.e. the scale factor is a/256.

        r0 = dst (read and rewritten in place)
        r1 = src
        r2 = length, counted in groups of 8 four-byte pixels
             (each loop iteration consumes 32 bytes of src and of dst)

        Clobbers r0-r2, q0-q1, q11-q15 and the flags.  Only caller-saved
        registers are used, so nothing is spilled to the stack.
*/
ENTRY(rsdIntrinsicBlendSrcIn_K)
        /* The loop is a do/while on r2: without this check a zero
           length would wrap around and run 2^32 iterations. */
        cmp             r2, #0
        beq             2f

1:
        /* src: de-interleave 8 pixels into one plane per channel
           (d0..d3), then widen each plane to 16 bits. */
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2
        vmovl.u8        q15, d3

        /* dst: loaded without writeback so the store below can reuse
           r0.  Only the alpha plane (d3) is needed. */
        vld4.8          {d0, d1, d2, d3}, [r0]
        vmovl.u8        q11, d3                 /* q11 = dst.a */

        vmul.i16        q12, q12, q11
        vmul.i16        q13, q13, q11
        vmul.i16        q14, q14, q11
        vmul.i16        q15, q15, q11

        /* Narrow back to 8 bits, keeping the high byte of each product. */
        vshrn.i16       d0, q12, #8
        vshrn.i16       d1, q13, #8
        vshrn.i16       d2, q14, #8
        vshrn.i16       d3, q15, #8

        /* Re-interleave and write the 8 pixels back, advancing dst. */
        vst4.8          {d0, d1, d2, d3}, [r0]!

        subs            r2, r2, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicBlendSrcIn_K)
861fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        rsdIntrinsicBlendDstIn_K

        dst = dst * src.a

        Every channel is computed as (dst * src.a) >> 8 on unsigned 8-bit
        data widened to 16 bits, i.e. the scale factor is a/256.

        r0 = dst (read and rewritten in place)
        r1 = src
        r2 = length, counted in groups of 8 four-byte pixels
             (each loop iteration consumes 32 bytes of src and of dst)

        Clobbers r0-r2, q0-q1, q8-q11, q15 and the flags.  Only
        caller-saved registers are used, so nothing is spilled to the
        stack.
*/
ENTRY(rsdIntrinsicBlendDstIn_K)
        /* The loop is a do/while on r2: without this check a zero
           length would wrap around and run 2^32 iterations. */
        cmp             r2, #0
        beq             2f

1:
        /* src: de-interleave 8 pixels; only the alpha plane (d3) is
           needed. */
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vmovl.u8        q15, d3                 /* q15 = src.a */

        /* dst: loaded without writeback so the store below can reuse
           r0.  All four planes are widened to 16 bits. */
        vld4.8          {d0, d1, d2, d3}, [r0]
        vmovl.u8        q8, d0
        vmovl.u8        q9, d1
        vmovl.u8        q10, d2
        vmovl.u8        q11, d3

        vmul.i16        q8, q8, q15
        vmul.i16        q9, q9, q15
        vmul.i16        q10, q10, q15
        vmul.i16        q11, q11, q15

        /* Narrow back to 8 bits, keeping the high byte of each product. */
        vshrn.i16       d0, q8, #8
        vshrn.i16       d1, q9, #8
        vshrn.i16       d2, q10, #8
        vshrn.i16       d3, q11, #8

        /* Re-interleave and write the 8 pixels back, advancing dst. */
        vst4.8          {d0, d1, d2, d3}, [r0]!

        subs            r2, r2, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicBlendDstIn_K)
930fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
931fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
932fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        rsdIntrinsicBlendSrcOut_K

        dst = src * (1.0 - dst.a)

        In 8-bit fixed point every channel is (src * (255 - dst.a)) >> 8.

        r0 = dst (read and rewritten in place)
        r1 = src
        r2 = length, counted in groups of 8 four-byte pixels
             (each loop iteration consumes 32 bytes of src and of dst)

        Clobbers r0-r2, q0-q3, q11-q15 and the flags.  Only caller-saved
        registers are used, so nothing is spilled to the stack.
*/
ENTRY(rsdIntrinsicBlendSrcOut_K)
        /* The loop is a do/while on r2: without this check a zero
           length would wrap around and run 2^32 iterations. */
        cmp             r2, #0
        beq             2f

        vmov.i16        q2, #255                /* q2 = 255 in every 16-bit lane */

1:
        /* src: de-interleave 8 pixels into one plane per channel
           (d0..d3), then widen each plane to 16 bits. */
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2
        vmovl.u8        q15, d3

        /* dst: loaded without writeback so the store below can reuse
           r0.  Only the alpha plane (d3) is needed. */
        vld4.8          {d0, d1, d2, d3}, [r0]
        vmovl.u8        q11, d3                 /* q11 = dst.a */

        vsub.i16        q3, q2, q11             /* q3 = 255 - dst.a */
        vmul.i16        q12, q12, q3
        vmul.i16        q13, q13, q3
        vmul.i16        q14, q14, q3
        vmul.i16        q15, q15, q3

        /* Narrow back to 8 bits, keeping the high byte of each product. */
        vshrn.i16       d0, q12, #8
        vshrn.i16       d1, q13, #8
        vshrn.i16       d2, q14, #8
        vshrn.i16       d3, q15, #8

        /* Re-interleave and write the 8 pixels back, advancing dst. */
        vst4.8          {d0, d1, d2, d3}, [r0]!

        subs            r2, r2, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicBlendSrcOut_K)
1006fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1007fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        rsdIntrinsicBlendDstOut_K

        dst = dst * (1.0 - src.a)

        In 8-bit fixed point every channel is (dst * (255 - src.a)) >> 8.

        r0 = dst (read and rewritten in place)
        r1 = src
        r2 = length, counted in groups of 8 four-byte pixels
             (each loop iteration consumes 32 bytes of src and of dst)

        Clobbers r0-r2, q0-q3, q8-q15 and the flags.  Only caller-saved
        registers are used, so nothing is spilled to the stack.
*/
ENTRY(rsdIntrinsicBlendDstOut_K)
        /* The loop is a do/while on r2: without this check a zero
           length would wrap around and run 2^32 iterations. */
        cmp             r2, #0
        beq             2f

        vmov.i16        q2, #255                /* q2 = 255 in every 16-bit lane */

1:
        /* src: de-interleave 8 pixels; only the alpha plane (d3) is
           needed. */
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vmovl.u8        q15, d3                 /* q15 = src.a */

        /* dst: loaded without writeback so the store below can reuse
           r0.  All four planes are widened to 16 bits. */
        vld4.8          {d0, d1, d2, d3}, [r0]
        vmovl.u8        q8, d0
        vmovl.u8        q9, d1
        vmovl.u8        q10, d2
        vmovl.u8        q11, d3

        vsub.i16        q3, q2, q15             /* q3 = 255 - src.a */
        vmul.i16        q12, q8, q3
        vmul.i16        q13, q9, q3
        vmul.i16        q14, q10, q3
        vmul.i16        q15, q11, q3            /* src.a is dead once q3 is formed */

        /* Narrow back to 8 bits, keeping the high byte of each product. */
        vshrn.i16       d0, q12, #8
        vshrn.i16       d1, q13, #8
        vshrn.i16       d2, q14, #8
        vshrn.i16       d3, q15, #8

        /* Re-interleave and write the 8 pixels back, advancing dst. */
        vst4.8          {d0, d1, d2, d3}, [r0]!

        subs            r2, r2, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicBlendDstOut_K)
1081fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1082fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
        rsdIntrinsicBlendSrcAtop_K

        dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
        dst.a = dst.a

        In 8-bit fixed point each colour channel is
        (dst * (255 - src.a) + src * dst.a) >> 8, accumulated in 16-bit
        lanes.  NOTE(review): the 16-bit sum presumably relies on
        premultiplied inputs to stay in range - confirm with callers.

        r0 = dst (read and rewritten in place)
        r1 = src
        r2 = length, counted in groups of 8 four-byte pixels
             (each loop iteration consumes 32 bytes of src and of dst)

        Clobbers r0-r2, q0-q3, q8-q15 and the flags.  Only caller-saved
        registers are used, so nothing is spilled to the stack.
*/
ENTRY(rsdIntrinsicBlendSrcAtop_K)
        /* The loop is a do/while on r2: without this check a zero
           length would wrap around and run 2^32 iterations. */
        cmp             r2, #0
        beq             2f

        vmov.i16        q2, #255                /* q2 = 255 in every 16-bit lane */

1:
        /* src: de-interleave 8 pixels into one plane per channel
           (d0..d3), then widen each plane to 16 bits. */
        vld4.8          {d0, d1, d2, d3}, [r1]!
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2
        vmovl.u8        q15, d3                 /* q15 = src.a */

        /* dst: loaded without writeback so the store below can reuse
           r0.  All four planes are widened to 16 bits. */
        vld4.8          {d0, d1, d2, d3}, [r0]
        vmovl.u8        q8, d0
        vmovl.u8        q9, d1
        vmovl.u8        q10, d2
        vmovl.u8        q11, d3                 /* q11 = dst.a */

        vsub.i16        q3, q2, q15             /* q3 = 255 - src.a */
        vmul.i16        q8, q8, q3              /* dst.rgb * (255 - src.a) */
        vmul.i16        q9, q9, q3
        vmul.i16        q10, q10, q3

        vmla.i16        q8, q12, q11            /* += src.rgb * dst.a */
        vmla.i16        q9, q13, q11
        vmla.i16        q10, q14, q11

        /* Narrow the colour planes back to 8 bits.  d3 is left alone:
           it still holds the dst alpha bytes loaded above, so the
           store writes dst.a back unchanged. */
        vshrn.i16       d0, q8, #8
        vshrn.i16       d1, q9, #8
        vshrn.i16       d2, q10, #8

        /* Re-interleave and write the 8 pixels back, advancing dst. */
        vst4.8          {d0, d1, d2, d3}, [r0]!

        subs            r2, r2, #1
        bne             1b

2:
        bx              lr
END(rsdIntrinsicBlendSrcAtop_K)
1161fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1162fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams/*
1163fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        dst.rgb = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
1164fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        dst.a = src.a
1165fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1166fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        r0 = dst
1167fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        r1 = src
1168fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        r2 = length
1169fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams*/
1170fa17cda2d7e0948677035890e40498ad0b639c92Jason SamsENTRY(rsdIntrinsicBlendDstAtop_K)
1171fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        .save           {r4, lr}
1172fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        stmfd           sp!, {r4, lr}
1173fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vpush           {q4-q7}
1174fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1175fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        mov r4, #255
1176fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vdup.16 q7, r4
1177fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1178fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        mov r4, r0
1179fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams1:
1180fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1181fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        /* src */
1182fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
1183fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
1184fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
1185fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
1186fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
1187fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
1188fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
1189fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
1190fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q12, d0
1191fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q13, d1
1192fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q14, d2
1193fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q15, d3
1194fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1195fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        /* dst */
1196fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
1197fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
1198fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
1199fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
1200fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
1201fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
1202fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
1203fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
1204fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q8, d0
1205fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q9, d1
1206fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q10, d2
1207fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmovl.u8 q11, d3
1208fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1209fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1210fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
1211fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmul.i16 q12, q12, q6
1212fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmul.i16 q13, q13, q6
1213fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmul.i16 q14, q14, q6
1214fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1215fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmla.i16 q12, q8, q15
1216fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmla.i16 q13, q9, q15
1217fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vmla.i16 q14, q10, q15
1218fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1219fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1220fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vshrn.i16 d0, q12, #8
1221fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vshrn.i16 d1, q13, #8
1222fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vshrn.i16 d2, q14, #8
1223fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        //vshrn.i16 d3, q15, #8
1224fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
1225fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
1226fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
1227fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
1228fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
1229fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
1230fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
1231fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
1232fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1233fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        subs r2, r2, #1
1234fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        bne 1b
1235fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1236fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        vpop            {q4-q7}
1237fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        ldmfd           sp!, {r4, lr}
1238fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams        bx              lr
1239fa17cda2d7e0948677035890e40498ad0b639c92Jason SamsEND(rsdIntrinsicBlendDstAtop_K)
1240fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
 * rsdIntrinsicBlendXor_K
 *
 *      dst = dst ^ src         (bitwise, byte for byte)
 *
 *      r0 = dst     read and rewritten in place
 *      r1 = src     read only
 *      r2 = length  counted in 32-byte groups: every loop pass consumes
 *                   32 bytes of src and rewrites 32 bytes of dst
 *
 * Every byte is combined only with the byte at the same offset, so the
 * data is moved with plain vld1/vst1 and no de-interleaving is needed.
 *
 * Touches only r0-r2, q0-q3 and the flags, all of which are caller-saved
 * under the AAPCS, so nothing is pushed on the stack.
 * A length of zero returns immediately without touching memory.
 */
ENTRY(rsdIntrinsicBlendXor_K)
        cmp         r2, #0
        beq         2f                          // empty run: nothing to blend

1:
        vld1.8      {d4, d5, d6, d7}, [r1]!     // q2:q3 = next 32 bytes of src
        vld1.8      {d0, d1, d2, d3}, [r0]      // q0:q1 = matching 32 bytes of dst

        veor        q0, q0, q2
        veor        q1, q1, q3

        vst1.8      {d0, d1, d2, d3}, [r0]!     // store over the bytes just read, then advance

        subs        r2, r2, #1
        bne         1b

2:
        bx          lr
END(rsdIntrinsicBlendXor_K)
1304fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
 * rsdIntrinsicBlendMultiply_K
 *
 *      dst = (dst * src) >> 8          (per byte, unsigned)
 *
 *      r0 = dst     read and rewritten in place
 *      r1 = src     read only
 *      r2 = length  counted in 32-byte groups: every loop pass consumes
 *                   32 bytes of src and rewrites 32 bytes of dst
 *
 * The product is scaled by a plain shift (divide by 256, truncating),
 * so 255 * 255 yields 254 rather than 255.
 *
 * Every byte is combined only with the byte at the same offset, so the
 * data is moved with plain vld1/vst1 and no de-interleaving is needed.
 *
 * Touches only r0-r2, q0-q3, q8-q11 and the flags, all of which are
 * caller-saved under the AAPCS, so nothing is pushed on the stack.
 * A length of zero returns immediately without touching memory.
 */
ENTRY(rsdIntrinsicBlendMultiply_K)
        cmp         r2, #0
        beq         2f                          // empty run: nothing to blend

1:
        vld1.8      {d4, d5, d6, d7}, [r1]!     // q2:q3 = next 32 bytes of src
        vld1.8      {d0, d1, d2, d3}, [r0]      // q0:q1 = matching 32 bytes of dst

        /* 8-bit x 8-bit -> 16-bit products; 255 * 255 fits in 16 bits. */
        vmull.u8    q8,  d0, d4
        vmull.u8    q9,  d1, d5
        vmull.u8    q10, d2, d6
        vmull.u8    q11, d3, d7

        /* Keep the high byte of each product, i.e. product >> 8. */
        vshrn.i16   d0, q8,  #8
        vshrn.i16   d1, q9,  #8
        vshrn.i16   d2, q10, #8
        vshrn.i16   d3, q11, #8

        vst1.8      {d0, d1, d2, d3}, [r0]!     // store over the bytes just read, then advance

        subs        r2, r2, #1
        bne         1b

2:
        bx          lr
END(rsdIntrinsicBlendMultiply_K)
1377fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
 * rsdIntrinsicBlendAdd_K
 *
 *      dst = min(src + dst, 1.0)
 *
 * i.e. a per-byte unsigned add that saturates at 255.
 *
 *      r0 = dst     read and rewritten in place
 *      r1 = src     read only
 *      r2 = length  counted in 32-byte groups: every loop pass consumes
 *                   32 bytes of src and rewrites 32 bytes of dst
 *
 * Every byte is combined only with the byte at the same offset, so the
 * data is moved with plain vld1/vst1 and no de-interleaving is needed.
 *
 * Touches only r0-r2, q0-q3 and the flags, all of which are caller-saved
 * under the AAPCS, so nothing is pushed on the stack.
 * A length of zero returns immediately without touching memory.
 */
ENTRY(rsdIntrinsicBlendAdd_K)
        cmp         r2, #0
        beq         2f                          // empty run: nothing to blend

1:
        vld1.8      {d4, d5, d6, d7}, [r1]!     // q2:q3 = next 32 bytes of src
        vld1.8      {d0, d1, d2, d3}, [r0]      // q0:q1 = matching 32 bytes of dst

        /* Saturating byte add: sums above 255 clamp to 255. */
        vqadd.u8    q0, q0, q2
        vqadd.u8    q1, q1, q3

        vst1.8      {d0, d1, d2, d3}, [r0]!     // store over the bytes just read, then advance

        subs        r2, r2, #1
        bne         1b

2:
        bx          lr
END(rsdIntrinsicBlendAdd_K)
1450fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1451fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
/*
 * rsdIntrinsicBlendSub_K
 *
 *      dst = max(dst - src, 0.0)
 *
 * i.e. a per-byte unsigned subtract that saturates at 0.
 *
 *      r0 = dst     read and rewritten in place
 *      r1 = src     read only
 *      r2 = length  counted in 32-byte groups: every loop pass consumes
 *                   32 bytes of src and rewrites 32 bytes of dst
 *
 * Every byte is combined only with the byte at the same offset, so the
 * data is moved with plain vld1/vst1 and no de-interleaving is needed.
 *
 * Touches only r0-r2, q0-q3 and the flags, all of which are caller-saved
 * under the AAPCS, so nothing is pushed on the stack.
 * A length of zero returns immediately without touching memory.
 */
ENTRY(rsdIntrinsicBlendSub_K)
        cmp         r2, #0
        beq         2f                          // empty run: nothing to blend

1:
        vld1.8      {d4, d5, d6, d7}, [r1]!     // q2:q3 = next 32 bytes of src
        vld1.8      {d0, d1, d2, d3}, [r0]      // q0:q1 = matching 32 bytes of dst

        /* Saturating byte subtract (dst - src): results below 0 clamp to 0. */
        vqsub.u8    q0, q0, q2
        vqsub.u8    q1, q1, q3

        vst1.8      {d0, d1, d2, d3}, [r0]!     // store over the bytes just read, then advance

        subs        r2, r2, #1
        bne         1b

2:
        bx          lr
END(rsdIntrinsicBlendSub_K)
1524fa17cda2d7e0948677035890e40498ad0b639c92Jason Sams
1525