/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * 32-bit ARM NEON convolution kernels, GNU as syntax, run through the C
 * preprocessor (ENTRY/END below are cpp macros).
 *
 * Both kernels take their first four arguments in r0-r3 and the remaining
 * ones on the stack, save/restore every callee-saved register they touch
 * (r4 and up, q4-q7) and return with "bx lr".
 */

/*
 * ENTRY(f): open global function f in .text and start its unwind region.
 *           On ARM, GNU as treats ".align 0" as a 4-byte alignment request.
 * END(f):   close the unwind region and record the size of f.
 */
#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * In:
 *      r0          = dst            (8 bytes stored per iteration)
 *      r1          = y0 row pointer (16 bytes read, advanced by 8, per iteration)
 *      r2          = y1 row pointer (same access pattern)
 *      r3          = y2 row pointer (same access pattern)
 *      [sp, #0]    = coeffs pointer; 16 halfwords (32 bytes) are loaded,
 *                    halfwords 0..8 are used as the 3x3 kernel, row by row
 *      [sp, #4]    = iteration count ("length / 2": every iteration emits
 *                    two 4-byte pixels)
 *      (stack offsets are relative to sp on entry)
 *
 * Each source byte is zero-extended to 16 bits, multiplied by the signed
 * 16-bit coefficients into 32-bit sums, shifted right by 8 (truncating),
 * then saturated to an unsigned byte. A coefficient of 256 is therefore a
 * weight of 1.
 *
 * A count of 0 returns without touching memory.
 *
 * Clobbers: r0-r3, q0-q3, q8, q9, q13-q15, flags.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        push            {r4-r8, r10, r11, lr}   /* 8 words    = 32 bytes */
        vpush           {q4-q7}                 /* 4 q-regs   = 64 bytes */

        /* First stack argument (coeffs) now sits 32 + 64 bytes above sp.
           Load the coefficients into q0, q1. */
        ldr             r4, [sp, #32+64]
        vld1.16         {q0, q1}, [r4]

        /* Second stack argument: the iteration count. */
        ldr             r4, [sp, #36+64]

        /* The loop below tests the count only after the first pass, so a
           count of 0 would wrap around; bail out early instead. */
        cmp             r4, #0
        beq             2f

        /* Post-increment / prefetch distance used throughout the loop. */
        mov             r5, #8

1:
        /* Load 16 bytes from each row and advance each pointer by r5 = 8. */
        vld1.8          {q13}, [r1], r5
        vld1.8          {q14}, [r2], r5
        vld1.8          {q15}, [r3], r5

        /* Prefetch hint for data used by a later iteration. */
        pld             [r1, r5]
        pld             [r2, r5]
        pld             [r3, r5]

        /* Widen the unsigned bytes to 16 bits. */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

        /*
         * Widened source window, one d register (4 x 16-bit channels) per
         * pixel:
         *      y0:  d4,  d5,  d6,  d7
         *      y1:  d8,  d9,  d10, d11
         *      y2:  d12, d13, d14, d15
         */

        /* First output pixel: columns 0..2 of each row. */
        vmull.s16       q8, d4, d0[0]
        vmlal.s16       q8, d5, d0[1]
        vmlal.s16       q8, d6, d0[2]
        vmlal.s16       q8, d8, d0[3]
        vmlal.s16       q8, d9, d1[0]
        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q8, d14, d2[0]

        /* Second output pixel: columns 1..3 of each row. */
        vmull.s16       q9, d5, d0[0]
        vmlal.s16       q9, d6, d0[1]
        vmlal.s16       q9, d7, d0[2]
        vmlal.s16       q9, d9, d0[3]
        vmlal.s16       q9, d10, d1[0]
        vmlal.s16       q9, d11, d1[1]
        vmlal.s16       q9, d13, d1[2]
        vmlal.s16       q9, d14, d1[3]
        vmlal.s16       q9, d15, d2[0]

        /* Narrow 32 -> 16 bit with a truncating shift by 8. d16/d17 are the
           two halves of q8; q9 is still intact when d17 is written. */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Saturate 16 -> 8 bit unsigned and store both pixels. */
        vqmovun.s16     d16, q8
        vst1.8          d16, [r0]!

        /* Are we done yet? */
        subs            r4, r4, #1
        bne             1b

2:
        /* We're done, bye! */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
 * rsdIntrinsicConvolve5x5_K
 *
 * In:
 *      r0          = dst            (8 bytes stored per iteration)
 *      r1          = y0 row pointer (y - 2)
 *      r2          = y1 row pointer (y - 1)
 *      r3          = y2 row pointer (y)
 *      [sp, #0]    = y3 row pointer (y + 1)
 *      [sp, #4]    = y4 row pointer (y + 2)
 *      [sp, #8]    = coeffs pointer; 28 halfwords (56 bytes) are loaded,
 *                    halfwords 0..24 are used as the 5x5 kernel, row by row
 *      [sp, #12]   = iteration count (every iteration emits two 4-byte
 *                    pixels)
 *      (stack offsets are relative to sp on entry)
 *
 * Every row pointer has 24 bytes read and is advanced by 8 per iteration.
 *
 * Register use inside the function:
 *      r4 = y3, r5 = y4
 *      r6 = coeffs pointer while loading, then the iteration count
 *      r7 = constant 8 (post-increment / prefetch distance)
 *      q0-q3 (d0-d6) = coefficients, q4/q5 = the two 32-bit accumulators,
 *      q9-q14 = widened source pixels, q15 = per-lane bias 0x7f
 *
 * A count of 0 returns without touching memory.
 *
 * Clobbers: r0-r3, q0-q3, q9-q15, flags.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        push            {r4-r7, lr}             /* 5 words    = 20 bytes */
        vpush           {q4-q7}                 /* 4 q-regs   = 64 bytes */

        /* load y3 in r4 (first stack argument) */
        ldr             r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr             r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr             r6, [sp, #28 + 64]

        /* Load the coefficients: 16 halfwords into d0-d3, 12 into d4-d6. */
        vld1.16         {d0, d1, d2, d3}, [r6]!
        vld1.16         {d4, d5, d6}, [r6]

        /* Bias added to every 32-bit sum before the narrowing shift. */
        vmov.u32        q15, #0x7f

        /* load the count (r6 is free again) */
        ldr             r6, [sp, #32 + 64]

        /* The loop below tests the count only after the first pass, so a
           count of 0 would wrap around; bail out early instead. */
        cmp             r6, #0
        beq             2f

        /* Post-increment / prefetch distance used throughout the loop. */
        mov             r7, #8

1:
        /* Rows y0 and y1: load 24 bytes each, advance each pointer by r7 = 8. */
        vld1.8          {d24, d25, d26}, [r1], r7      @ y0 ( y - 2 )
        vld1.8          {d27, d28, d29}, [r2], r7      @ y1 ( y - 1 )

        /* Prefetch hint for data used by a later iteration. */
        pld             [r1, r7]
        pld             [r2, r7]

        /* Promoting the 8bit channels to 16bit. The order matters: every
           destination overlaps only source registers already consumed. */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

        /*
         * Widened window, one d register (4 x 16-bit channels) per pixel:
         *      first row:   d18, d19, d20, d21, d22, d23
         *      second row:  d24, d25, d26, d27, d28, d29
         */

        /* First output pixel (q4): columns 0..4, coefficients 0..9. */
        vmull.s16       q4, d18, d0[0]
        vmlal.s16       q4, d19, d0[1]
        vmlal.s16       q4, d20, d0[2]
        vmlal.s16       q4, d21, d0[3]
        vmlal.s16       q4, d22, d1[0]

        vmlal.s16       q4, d24, d1[1]
        vmlal.s16       q4, d25, d1[2]
        vmlal.s16       q4, d26, d1[3]
        vmlal.s16       q4, d27, d2[0]
        vmlal.s16       q4, d28, d2[1]

        /* Second output pixel (q5): columns 1..5, same coefficients. */
        vmull.s16       q5, d19, d0[0]
        vmlal.s16       q5, d20, d0[1]
        vmlal.s16       q5, d21, d0[2]
        vmlal.s16       q5, d22, d0[3]
        vmlal.s16       q5, d23, d1[0]

        vmlal.s16       q5, d25, d1[1]
        vmlal.s16       q5, d26, d1[2]
        vmlal.s16       q5, d27, d1[3]
        vmlal.s16       q5, d28, d2[0]
        vmlal.s16       q5, d29, d2[1]


        /* Next 2 rows */
        /* Rows y2 and y3: load 24 bytes each, advance each pointer by r7 = 8. */
        vld1.8          {d24, d25, d26}, [r3], r7      @ y2 ( y )
        vld1.8          {d27, d28, d29}, [r4], r7      @ y3 ( y + 1 )

        /* Prefetch hint for data used by a later iteration. */
        pld             [r3, r7]
        pld             [r4, r7]

        /* Promoting the 8bit channels to 16bit (same ordering constraint). */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

        /*
         * Widened window:
         *      first row:   d18, d19, d20, d21, d22, d23
         *      second row:  d24, d25, d26, d27, d28, d29
         */

        /* First output pixel: coefficients 10..19. */
        vmlal.s16       q4, d18, d2[2]
        vmlal.s16       q4, d19, d2[3]
        vmlal.s16       q4, d20, d3[0]
        vmlal.s16       q4, d21, d3[1]
        vmlal.s16       q4, d22, d3[2]

        vmlal.s16       q4, d24, d3[3]
        vmlal.s16       q4, d25, d4[0]
        vmlal.s16       q4, d26, d4[1]
        vmlal.s16       q4, d27, d4[2]
        vmlal.s16       q4, d28, d4[3]

        /* Second output pixel: coefficients 10..19. */
        vmlal.s16       q5, d19, d2[2]
        vmlal.s16       q5, d20, d2[3]
        vmlal.s16       q5, d21, d3[0]
        vmlal.s16       q5, d22, d3[1]
        vmlal.s16       q5, d23, d3[2]

        vmlal.s16       q5, d25, d3[3]
        vmlal.s16       q5, d26, d4[0]
        vmlal.s16       q5, d27, d4[1]
        vmlal.s16       q5, d28, d4[2]
        vmlal.s16       q5, d29, d4[3]

        /* Last row */
        /* Row y4: load 24 bytes, advance the pointer by r7 = 8. */
        vld1.8          {d24, d25, d26}, [r5], r7      @ y4 ( y + 2 )

        /* Prefetch hint for data used by a later iteration. */
        pld             [r5, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26

        /*
         * Widened window:
         *      d18, d19, d20, d21, d22, d23
         */

        /* First output pixel: coefficients 20..24. */
        vmlal.s16       q4, d18, d5[0]
        vmlal.s16       q4, d19, d5[1]
        vmlal.s16       q4, d20, d5[2]
        vmlal.s16       q4, d21, d5[3]
        vmlal.s16       q4, d22, d6[0]

        /* Second output pixel: coefficients 20..24. */
        vmlal.s16       q5, d19, d5[0]
        vmlal.s16       q5, d20, d5[1]
        vmlal.s16       q5, d21, d5[2]
        vmlal.s16       q5, d22, d5[3]
        vmlal.s16       q5, d23, d6[0]

        /* Add the 0x7f bias to both sums.
           NOTE(review): the narrowing below is the rounding form (vrshrn),
           which adds another 0x80 before shifting, so the effective bias is
           0xff; the 3x3 kernel truncates instead. Confirm this is intended. */
        vadd.i32        q4, q4, q15
        vadd.i32        q5, q5, q15

        /* Narrow it to a d-reg 32 -> 16 bit. d8/d9 are the two halves of q4;
           q5 is still intact when d9 is written. */
        vrshrn.i32      d8, q4, #8
        vrshrn.i32      d9, q5, #8

        /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16     d8, q4

        vst1.8          d8, [r0]!       @ store both pixels and advance r0 by 8

        /* Are we done? */
        subs            r6, r6, #1
        bne             1b

2:
        /* Yup, bye */
        vpop            {q4-q7}
        pop             {r4-r7, lr}
        bx              lr

END(rsdIntrinsicConvolve5x5_K)