/*
 * Copyright (C) 2012,2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Target: AArch64 (A64 Advanced SIMD), GNU as syntax, run through the C
 * preprocessor. Register usage follows AAPCS64: v8-v15 are treated as
 * callee-saved (low 64 bits), everything else used here is caller-saved.
 */

/* ENTRY(f): open global function f in .text, 4-byte aligned (.align 2). */
#define ENTRY(f) .text; .align 2; .globl f; .type f, %function; f:
/* END(f): record the size of f for the symbol table. */
#define END(f)   .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * 3x3 convolution over three input rows. Every loop iteration reads 16 bytes
 * from each row, multiplies the widened bytes by nine signed 16-bit
 * coefficients, shifts the 32-bit sums right by 8 (truncating), saturates to
 * unsigned 8 bit and stores 8 bytes (two 4-channel pixels) to dst. All four
 * pointers then advance by 8 bytes.
 *
 *      x0 = dst
 *      x1 = y0 base pointer
 *      x2 = y1 base pointer
 *      x3 = y2 base pointer
 *      x4 = coeffs (16 halfwords are loaded, lanes 0..8 are used)
 *      x5 = length / 2 (number of loop iterations); 0 returns immediately
 *
 * Clobbers: x4, x6, v0-v7, flags. v8-v15 (low halves) are saved on the stack
 * and restored before returning.
 *
 * NOTE(review): the counter is consumed as a full 64-bit register. If the C
 * prototype declares it as a 32-bit type, the upper half of x5 is not
 * guaranteed to be zero under AAPCS64 - confirm against the caller.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        /*
         * The loop below is a do-while: entering it with a zero counter would
         * wrap the counter and overrun every buffer. Bail out before the
         * stack is touched so no restore is needed on this path.
         */
        cbz         x5, 2f

        /* Reserve 64 bytes and spill d8-d15 (callee-saved halves of v8-v15). */
        sub         x6, sp, #64
        sub         sp, sp, #64
        st1         {v8.1d-v11.1d}, [x6], #32
        st1         {v12.1d-v15.1d}, [x6]

        /* Coefficients: v0.h[0..7] and v1.h[0] hold the nine kernel taps. */
        ld1         {v0.8h, v1.8h}, [x4]

        /* x4 is free now; reuse it as the per-iteration source stride (8 bytes). */
        mov         x4, #8

1:
        /* Read 16 bytes from each row, then step each row pointer by x4 = 8. */
        ld1         {v13.16b}, [x1], x4
        ld1         {v14.16b}, [x2], x4
        ld1         {v15.16b}, [x3], x4

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x1, x4]      // TODO: test this
//      prfm        PLDL1KEEP,[x2, x4]      // TODO: test this
//      prfm        PLDL1KEEP,[x3, x4]      // TODO: test this

        /* Widen every byte to 16 bit so it can feed the signed multiplies. */
        uxtl        v2.8h, v13.8b
        uxtl2       v3.8h, v13.16b
        uxtl        v4.8h, v14.8b
        uxtl2       v5.8h, v14.16b
        uxtl        v6.8h, v15.8b
        uxtl2       v7.8h, v15.16b

        /*
         * Source pixels now sit in register halves, four per row:
         *      row y0:  v2lo, v2hi, v3lo, v3hi
         *      row y1:  v4lo, v4hi, v5lo, v5hi
         *      row y2:  v6lo, v6hi, v7lo, v7hi
         * v8 accumulates the first output pixel (source pixels 0..2 of each
         * row), v9 the second one (source pixels 1..3 of each row).
         */
        smull       v8.4s, v2.4h, v0.h[0]
        smull2      v9.4s, v2.8h, v0.h[0]
        smlal2      v8.4s, v2.8h, v0.h[1]
        smlal       v9.4s, v3.4h, v0.h[1]
        smlal       v8.4s, v3.4h, v0.h[2]
        smlal2      v9.4s, v3.8h, v0.h[2]
        smlal       v8.4s, v4.4h, v0.h[3]
        smlal2      v9.4s, v4.8h, v0.h[3]
        smlal2      v8.4s, v4.8h, v0.h[4]
        smlal       v9.4s, v5.4h, v0.h[4]
        smlal       v8.4s, v5.4h, v0.h[5]
        smlal2      v9.4s, v5.8h, v0.h[5]
        smlal       v8.4s, v6.4h, v0.h[6]
        smlal2      v9.4s, v6.8h, v0.h[6]
        smlal2      v8.4s, v6.8h, v0.h[7]
        smlal       v9.4s, v7.4h, v0.h[7]
        smlal       v8.4s, v7.4h, v1.h[0]
        smlal2      v9.4s, v7.8h, v1.h[0]

        /* Drop the low 8 bits of each sum (truncating) and narrow 32 -> 16 bit. */
        shrn        v8.4h, v8.4s, #8
        shrn2       v8.8h, v9.4s, #8

        /* Saturate signed 16 bit to unsigned 8 bit and store both pixels. */
        sqxtun      v8.8b, v8.8h
        st1         {v8.8b}, [x0], #8

        /* Are we done yet? */
        subs        x5, x5, #1
        bne         1b

        /* Restore d8-d15 and release the 64-byte spill area. */
        ld1         {v8.1d-v11.1d}, [sp], #32
        ld1         {v12.1d-v15.1d}, [sp], #32
2:
        ret
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
 * rsdIntrinsicConvolve5x5_K
 *
 * 5x5 convolution over five input rows. Every loop iteration reads 24 bytes
 * from each row, accumulates 25 taps per output pixel with signed 16-bit
 * coefficients, adds a bias of 0x7f, applies a rounding shift right by 8,
 * saturates to unsigned 8 bit and stores 8 bytes (two 4-channel pixels) to
 * dst. All six pointers then advance by 8 bytes.
 *
 *      x0 = dst
 *      x1 = y0 base pointer    ( y - 2 )
 *      x2 = y1 base pointer    ( y - 1 )
 *      x3 = y2 base pointer    ( y )
 *      x4 = y3 base pointer    ( y + 1 )
 *      x5 = y4 base pointer    ( y + 2 )
 *      x6 = coeffs (28 halfwords are loaded, lanes 0..24 are used)
 *      x7 = length, consumed as the number of loop iterations (8 output
 *           bytes each); 0 returns immediately
 *
 * Clobbers: x6, x8, v0-v5, flags. v8-v15 (low halves) are saved on the stack
 * and restored before returning.
 *
 * NOTE(review): rshrn already rounds, so together with the explicit 0x7f bias
 * the sum is biased by 0xff before the shift. Kept as is to preserve the
 * existing output - confirm that this is intended.
 * NOTE(review): the counter is consumed as a full 64-bit register; if the C
 * prototype declares a 32-bit type, confirm the caller zero-extends it.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        /* Same do-while hazard as the 3x3 kernel: never enter with x7 == 0. */
        cbz         x7, 2f

        /* Reserve 64 bytes and spill d8-d15 (callee-saved halves of v8-v15). */
        sub         x8, sp, #64
        sub         sp, sp, #64
        st1         {v8.1d-v11.1d}, [x8], #32
        st1         {v12.1d-v15.1d}, [x8]

        /* Coefficients: v0.h[0..7], v1.h[0..7], v2.h[0..7], v3.h[0] = taps 0..24. */
        ld1         {v0.8h-v2.8h}, [x6], #48
        ld1         {v3.4h}, [x6], #8

        /* Bias added to every 32-bit sum before the final narrowing shift. */
        movi        v15.4s, #0x7f

        /* x6 is free now; reuse it as the per-iteration source stride (8 bytes). */
        mov         x6, #8

1:
        /* Rows y0 and y1: read 24 bytes each, then step the pointers by x6 = 8. */
        ld1         {v9.8b-v11.8b}, [x1], x6        // y0 ( y - 2 )
        ld1         {v12.8b-v14.8b}, [x2], x6       // y1 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x1, x6]      // TODO: test this
//      prfm        PLDL1KEEP,[x2, x6]      // TODO: test this

        /* Promote the 8-bit channels to 16 bit, in place. */
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b

        /*
         * Six source pixels per row, one per register half:
         *      first row of the pair:   v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
         *      second row of the pair:  v12lo, v12hi, v13lo, v13hi, v14lo, v14hi
         * v4 accumulates the first output pixel (source pixels 0..4),
         * v5 the second one (source pixels 1..5).
         */
        smull       v4.4s, v9.4h, v0.h[0]
        smull2      v5.4s, v9.8h, v0.h[0]
        smlal2      v4.4s, v9.8h, v0.h[1]
        smlal       v5.4s, v10.4h, v0.h[1]
        smlal       v4.4s, v10.4h, v0.h[2]
        smlal2      v5.4s, v10.8h, v0.h[2]
        smlal2      v4.4s, v10.8h, v0.h[3]
        smlal       v5.4s, v11.4h, v0.h[3]
        smlal       v4.4s, v11.4h, v0.h[4]
        smlal2      v5.4s, v11.8h, v0.h[4]

        smlal       v4.4s, v12.4h, v0.h[5]
        smlal2      v5.4s, v12.8h, v0.h[5]
        smlal2      v4.4s, v12.8h, v0.h[6]
        smlal       v5.4s, v13.4h, v0.h[6]
        smlal       v4.4s, v13.4h, v0.h[7]
        smlal2      v5.4s, v13.8h, v0.h[7]
        smlal2      v4.4s, v13.8h, v1.h[0]
        smlal       v5.4s, v14.4h, v1.h[0]
        smlal       v4.4s, v14.4h, v1.h[1]
        smlal2      v5.4s, v14.8h, v1.h[1]

        /* Next 2 rows */
        /* Rows y2 and y3: read 24 bytes each, then step the pointers by x6 = 8. */
        ld1         {v9.8b-v11.8b}, [x3], x6        // y2 ( y )
        ld1         {v12.8b-v14.8b}, [x4], x6       // y3 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x3, x6]      // TODO: test this
//      prfm        PLDL1KEEP,[x4, x6]      // TODO: test this

        /* Promote the 8-bit channels to 16 bit, in place. */
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b

        /* Same register layout as the first pair of rows. */
        smlal       v4.4s, v9.4h, v1.h[2]
        smlal2      v5.4s, v9.8h, v1.h[2]
        smlal2      v4.4s, v9.8h, v1.h[3]
        smlal       v5.4s, v10.4h, v1.h[3]
        smlal       v4.4s, v10.4h, v1.h[4]
        smlal2      v5.4s, v10.8h, v1.h[4]
        smlal2      v4.4s, v10.8h, v1.h[5]
        smlal       v5.4s, v11.4h, v1.h[5]
        smlal       v4.4s, v11.4h, v1.h[6]
        smlal2      v5.4s, v11.8h, v1.h[6]

        smlal       v4.4s, v12.4h, v1.h[7]
        smlal2      v5.4s, v12.8h, v1.h[7]
        smlal2      v4.4s, v12.8h, v2.h[0]
        smlal       v5.4s, v13.4h, v2.h[0]
        smlal       v4.4s, v13.4h, v2.h[1]
        smlal2      v5.4s, v13.8h, v2.h[1]
        smlal2      v4.4s, v13.8h, v2.h[2]
        smlal       v5.4s, v14.4h, v2.h[2]
        smlal       v4.4s, v14.4h, v2.h[3]
        smlal2      v5.4s, v14.8h, v2.h[3]

        /* Last row */
        /* Row y4: read 24 bytes, then step the pointer by x6 = 8. */
        ld1         {v9.8b-v11.8b}, [x5], x6        // y4 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x5, x6]      // TODO: test this

        /* Promote the 8-bit channels to 16 bit, in place. */
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b

        /* Only v9-v11 carry data for this row: v9lo, v9hi, v10lo, v10hi, v11lo, v11hi. */
        smlal       v4.4s, v9.4h, v2.h[4]
        smlal2      v5.4s, v9.8h, v2.h[4]
        smlal2      v4.4s, v9.8h, v2.h[5]
        smlal       v5.4s, v10.4h, v2.h[5]
        smlal       v4.4s, v10.4h, v2.h[6]
        smlal2      v5.4s, v10.8h, v2.h[6]
        smlal2      v4.4s, v10.8h, v2.h[7]
        smlal       v5.4s, v11.4h, v2.h[7]
        smlal       v4.4s, v11.4h, v3.h[0]
        smlal2      v5.4s, v11.8h, v3.h[0]

        /* Apply the 0x7f bias to both accumulators. */
        add         v4.4s, v4.4s, v15.4s
        add         v5.4s, v5.4s, v15.4s

        /* Rounding shift right by 8 and narrow 32 -> 16 bit. */
        rshrn       v4.4h, v4.4s, #8
        rshrn2      v4.8h, v5.4s, #8

        /* Pack 16 -> 8 bit with unsigned saturation: two pixels in one D register. */
        sqxtun      v4.8b, v4.8h

        /* Store the two output pixels and advance dst by 8 bytes. */
        st1         {v4.8b}, [x0], #8

        /* Are we done? */
        subs        x7, x7, #1
        bne         1b

        /* Restore d8-d15 and release the 64-byte spill area. */
        ld1         {v8.1d-v11.1d}, [sp], #32
        ld1         {v12.1d-v15.1d}, [sp], #32
2:
        ret

END(rsdIntrinsicConvolve5x5_K)