/* rsdIntrinsics_Convolve.S revision 49202fbfe57d2cc92b183baa8cbce3141e9a9ead */
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */



#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * TestConvolveK: 3x3 convolution over three source rows, 32-bit ARM + NEON.
 *
 * In:
 *      r0        = dst
 *      r1        = y0 base pointer (top row)
 *      r2        = y1 base pointer (middle row)
 *      r3        = y2 base pointer (bottom row)
 *      [sp]      = coeffs: signed 16-bit values, used in the order
 *                  row y0 taps 0..2, row y1 taps 3..5, row y2 taps 6..8.
 *                  Sixteen halfwords (32 bytes) are read although only the
 *                  first nine are used.
 *      [sp, #4]  = length / 2 (loop count; each pass stores 8 bytes)
 *
 * Each pass loads 16 bytes from every row but advances the row pointers by
 * only 8, so the final pass reads 8 bytes beyond the last bytes it consumes
 * for output.
 *
 * Sums are shifted right by 8 before narrowing, i.e. the coefficients act as
 * fixed-point values with 8 fractional bits.
 *
 * Clobbers: r0-r3, q0-q3, q8, q9, q13-q15, flags.
 * r4 and q4-q7 are used and are saved/restored here.
 */
ENTRY(TestConvolveK)
        .save           {r4, lr}
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        /* Stack arguments sit above the 8 bytes of {r4, lr} + 64 bytes of q4-q7. */
        ldr             r4, [sp, #8+64]         /* r4 = coeffs */
        vld1.16         {q0}, [r4]!             /* d0 = coeff 0..3, d1 = coeff 4..7 */
        vld1.16         {q1}, [r4]              /* d2[0] = coeff 8 */
        ldr             r4, [sp, #12+64]        /* r4 = loop count */

        /* A zero count would wrap in the subs/bne below; bail out early. */
        cmp             r4, #0
        beq             2f

1:
        vld1.8          {q13}, [r1]
        vld1.8          {q14}, [r2]
        vld1.8          {q15}, [r3]
        add             r1, r1, #8
        add             r2, r2, #8
        add             r3, r3, #8
        PLD             (r1, #8)
        PLD             (r2, #8)
        PLD             (r3, #8)

        /* Widen every byte to 16 bits so it can feed the s16 multiplies. */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

/*
        Widened source window; each d register holds four 16-bit lanes
        (one 4-byte source element):
            row y0:  d4,  d5,  d6,  d7
            row y1:  d8,  d9,  d10, d11
            row y2:  d12, d13, d14, d15
        q8 accumulates the first output element from columns 0..2 of the
        window, q9 the second from columns 1..3.
*/

        /* Row y0 */
        vmull.s16       q8, d4, d0[0]
        vmull.s16       q9, d5, d0[0]

        vmlal.s16       q8, d5, d0[1]
        vmlal.s16       q9, d6, d0[1]

        vmlal.s16       q8, d6, d0[2]
        vmlal.s16       q9, d7, d0[2]

        /* Row y1 (the first tap must read d8, the start of row y1, not d4) */
        vmlal.s16       q8, d8, d0[3]
        vmlal.s16       q9, d9, d0[3]

        vmlal.s16       q8, d9, d1[0]
        vmlal.s16       q9, d10, d1[0]

        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q9, d11, d1[1]

        /* Row y2 */
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q9, d13, d1[2]

        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q9, d14, d1[3]

        vmlal.s16       q8, d14, d2[0]
        vmlal.s16       q9, d15, d2[0]

        /* Drop the 8 fractional bits and narrow 32 -> 16 (not saturating). */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Saturate signed 16 -> unsigned 8 and store both output elements. */
        vqmovun.s16     d16, q8
        vst1.8          {d16}, [r0]!

        subs            r4, r4, #1
        bne             1b

2:
        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(TestConvolveK)


/*
 * rsdIntrinsicColorMatrix4x4K: multiply 4-channel byte pixels by a 4x4 matrix.
 *
 * In:
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix: 16 signed 16-bit coefficients. Coefficient [4*j + k] is
 *           the weight of input channel j in output channel k.
 *           NOTE(review): this is the column-major reading of the matrix;
 *           confirm against the code that builds the coefficient table.
 *      r3 = loop count; each pass converts 4 pixels (16 bytes).
 *
 * Sums are shifted right by 8 before narrowing, i.e. the coefficients act as
 * fixed-point values with 8 fractional bits. Results are saturated to 0..255.
 *
 * Clobbers: r0-r3, q0-q3, q8-q15, flags.
 * NOTE(review): r4 and q4-q7 are saved and restored but never touched by
 * this routine.
 */
ENTRY(rsdIntrinsicColorMatrix4x4K)
        .save           {r4, lr}
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        vld1.16         {q2}, [r2]!             /* d4 = coeff 0..3,  d5 = coeff 4..7   */
        vld1.16         {q3}, [r2]!             /* d6 = coeff 8..11, d7 = coeff 12..15 */

        /* A zero count would wrap in the subs/bne below; bail out early. */
        cmp             r3, #0
        beq             2f

1:
        /* De-interleave 4 pixels: d0..d3 lanes 0-3 = channel 0..3 of each pixel. */
        vld4.8          {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8          {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8          {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8          {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen to 16 bits; only the low halves (d24/d26/d28/d30) carry data. */
        vmovl.u8        q12, d0                 /* channel 0 */
        vmovl.u8        q13, d1                 /* channel 1 */
        vmovl.u8        q14, d2                 /* channel 2 */
        vmovl.u8        q15, d3                 /* channel 3 */

        /*
         * q8..q11 accumulate output channels 0..3. Every step adds ONE input
         * channel into ALL four outputs, so each output mixes all four inputs.
         */
        vmull.s16       q8,  d24, d4[0]
        vmull.s16       q9,  d24, d4[1]
        vmull.s16       q10, d24, d4[2]
        vmull.s16       q11, d24, d4[3]

        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q9,  d26, d5[1]
        vmlal.s16       q10, d26, d5[2]
        vmlal.s16       q11, d26, d5[3]

        vmlal.s16       q8,  d28, d6[0]
        vmlal.s16       q9,  d28, d6[1]
        vmlal.s16       q10, d28, d6[2]
        vmlal.s16       q11, d28, d6[3]

        vmlal.s16       q8,  d30, d7[0]
        vmlal.s16       q9,  d30, d7[1]
        vmlal.s16       q10, d30, d7[2]
        vmlal.s16       q11, d30, d7[3]

        /* Drop the 8 fractional bits and narrow 32 -> 16 (not saturating). */
        vshrn.i32       d24, q8, #8
        vshrn.i32       d26, q9, #8
        vshrn.i32       d28, q10, #8
        vshrn.i32       d30, q11, #8

        /* Saturate signed 16 -> unsigned 8; lanes 0-3 of d0..d3 hold results. */
        vqmovun.s16     d0, q12
        vqmovun.s16     d1, q13
        vqmovun.s16     d2, q14
        vqmovun.s16     d3, q15

        /* Re-interleave and store the 4 pixels. */
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

2:
        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrix4x4K)

/*
 * rsdIntrinsicColorMatrix3x3K: multiply the first three channels of 4-channel
 * byte pixels by a 3x3 matrix; the fourth channel is copied through unchanged.
 *
 * In:
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix: 16 signed 16-bit coefficients are read (same 4-wide
 *           stride as the 4x4 routine); only [4*j + k] for j, k in 0..2 are
 *           used. Coefficient [4*j + k] is the weight of input channel j in
 *           output channel k.
 *           NOTE(review): this is the column-major reading of the matrix;
 *           confirm against the code that builds the coefficient table.
 *      r3 = loop count; each pass converts 4 pixels (16 bytes).
 *
 * Sums are shifted right by 8 before narrowing, i.e. the coefficients act as
 * fixed-point values with 8 fractional bits. Results are saturated to 0..255.
 *
 * Clobbers: r0-r3, q0-q3, q8-q10, q12-q14, flags.
 * NOTE(review): r4 and q4-q7 are saved and restored but never touched by
 * this routine.
 */
ENTRY(rsdIntrinsicColorMatrix3x3K)
        .save           {r4, lr}
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        vld1.16         {q2}, [r2]!             /* d4 = coeff 0..3,  d5 = coeff 4..7   */
        vld1.16         {q3}, [r2]!             /* d6 = coeff 8..11, d7 = coeff 12..15 */

        /* A zero count would wrap in the subs/bne below; bail out early. */
        cmp             r3, #0
        beq             2f

1:
        /* De-interleave 4 pixels: d0..d3 lanes 0-3 = channel 0..3 of each pixel. */
        vld4.8          {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8          {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8          {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8          {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen channels 0..2; d3 (channel 3) is left as loaded. */
        vmovl.u8        q12, d0                 /* channel 0 */
        vmovl.u8        q13, d1                 /* channel 1 */
        vmovl.u8        q14, d2                 /* channel 2 */

        /*
         * q8..q10 accumulate output channels 0..2. Every step adds ONE input
         * channel into ALL three outputs, so each output mixes all three inputs.
         */
        vmull.s16       q8,  d24, d4[0]
        vmull.s16       q9,  d24, d4[1]
        vmull.s16       q10, d24, d4[2]

        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q9,  d26, d5[1]
        vmlal.s16       q10, d26, d5[2]

        vmlal.s16       q8,  d28, d6[0]
        vmlal.s16       q9,  d28, d6[1]
        vmlal.s16       q10, d28, d6[2]

        /* Drop the 8 fractional bits and narrow 32 -> 16 (not saturating). */
        vshrn.i32       d24, q8, #8
        vshrn.i32       d26, q9, #8
        vshrn.i32       d28, q10, #8

        /* Saturate signed 16 -> unsigned 8; lanes 0-3 of d0..d2 hold results. */
        vqmovun.s16     d0, q12
        vqmovun.s16     d1, q13
        vqmovun.s16     d2, q14

        /* Re-interleave and store; d3 still holds the original channel 3. */
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

2:
        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrix3x3K)
