/*
 * Copyright (C) 2012,2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
        Register arguments of rsdIntrinsicConvolve3x3_K:

        x0 = dst
        x1 = y0 base pointer
        x2 = y1 base pointer
        x3 = y2 base pointer
        x4 = coeffs
        x5 = length / 2
*/

/*
 * ENTRY(f): switch to .text, align to 2^2 bytes, export f as a global
 *           function-typed symbol and define its label.
 * END(f):   record the size of f as the distance from its label to here.
 *
 * The ';' separators are required: the preprocessor joins the continuation
 * lines back into a single assembler line.
 */
#define ENTRY(f) \
        .text; \
        .p2align 2; \
        .globl f; \
        .type f,#function; \
        f:
#define END(f) \
        .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * 3x3 convolution over three source rows.  Each group of 4 bytes is handled
 * as one pixel of four interleaved 8-bit channels; every loop iteration
 * produces two output pixels (8 bytes).
 *
 * In:
 *      x0 = dst          8 bytes are stored per iteration.
 *      x1 = row y0       16 bytes (4 pixels) are read per iteration, then the
 *      x2 = row y1       pointer advances by 8, so each row is read 8 bytes
 *      x3 = row y2       beyond the two pixels being produced.
 *      x4 = coeffs       signed 16-bit; 16 halfwords are loaded, the first 9
 *                        are used, three per row in the order y0, y1, y2.
 *      x5 = iteration count (two pixels each).  Zero returns immediately.
 *           NOTE(review): the count is consumed as a full 64-bit value;
 *           confirm the C prototype passes a 64-bit quantity.
 *
 * Result for each channel: saturate_u8((sum of pixel * coeff) >> 8), with a
 * truncating shift (no rounding is applied in this kernel).
 *
 * Clobbers: x0-x6, v0-v7, flags.  The low 64 bits of v8-v15 are saved on the
 * stack and restored before returning.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        /* A zero count would wrap in the subs/bne loop below and run ~2^64
         * iterations; leave before touching the stack. */
        cbz             x5, 2f

        /* Spill the low halves of v8-v15 into a 64-byte stack frame. */
        sub             x6, sp, #64
        sub             sp, sp, #64
        st1             {v8.1d-v11.1d}, [x6], #32
        st1             {v12.1d-v15.1d}, [x6]

        /* v0.h[0..7], v1.h[0] = the nine coefficients. */
        ld1             {v0.8h, v1.8h}, [x4]

        /* Row step in bytes.  It lives in a register because the immediate
         * post-index form of a one-register 16-byte ld1 only accepts #16. */
        mov             x4, #8

1:
        /* Fetch 4 pixels from each row and step each row pointer by 2 pixels. */
        ld1             {v13.16b}, [x1], x4
        ld1             {v14.16b}, [x2], x4
        ld1             {v15.16b}, [x3], x4

        /* Prefetch hints, currently disabled. */
//        prfm        PLDL1KEEP,[x1, x4] // TODO: test this
//        prfm        PLDL1KEEP,[x2, x4] // TODO: test this
//        prfm        PLDL1KEEP,[x3, x4] // TODO: test this

        /* Zero-extend every 8-bit channel to 16 bits. */
        uxtl            v2.8h, v13.8b
        uxtl2           v3.8h, v13.16b
        uxtl            v4.8h, v14.8b
        uxtl2           v5.8h, v14.16b
        uxtl            v6.8h, v15.8b
        uxtl2           v7.8h, v15.16b

/*
        Widened source pixels, one pixel per 64-bit half:
            y0:  v2lo, v2hi, v3lo, v3hi
            y1:  v4lo, v4hi, v5lo, v5hi
            y2:  v6lo, v6hi, v7lo, v7hi

        v8 accumulates the first output pixel (source pixels 0..2 of each row),
        v9 accumulates the second one (source pixels 1..3 of each row).
*/

        /* Row y0, coefficients v0.h[0..2] */
        smull           v8.4s, v2.4h, v0.h[0]
        smull2          v9.4s, v2.8h, v0.h[0]
        smlal2          v8.4s, v2.8h, v0.h[1]
        smlal           v9.4s, v3.4h, v0.h[1]
        smlal           v8.4s, v3.4h, v0.h[2]
        smlal2          v9.4s, v3.8h, v0.h[2]

        /* Row y1, coefficients v0.h[3..5] */
        smlal           v8.4s, v4.4h, v0.h[3]
        smlal2          v9.4s, v4.8h, v0.h[3]
        smlal2          v8.4s, v4.8h, v0.h[4]
        smlal           v9.4s, v5.4h, v0.h[4]
        smlal           v8.4s, v5.4h, v0.h[5]
        smlal2          v9.4s, v5.8h, v0.h[5]

        /* Row y2, coefficients v0.h[6], v0.h[7], v1.h[0] */
        smlal           v8.4s, v6.4h, v0.h[6]
        smlal2          v9.4s, v6.8h, v0.h[6]
        smlal2          v8.4s, v6.8h, v0.h[7]
        smlal           v9.4s, v7.4h, v0.h[7]
        smlal           v8.4s, v7.4h, v1.h[0]
        smlal2          v9.4s, v7.8h, v1.h[0]

        /* Drop the 8 fractional bits and narrow 32 -> 16 bit. */
        shrn            v8.4h, v8.4s, #8
        shrn2           v8.8h, v9.4s, #8

        /* Saturate to unsigned 8 bit and emit the two pixels. */
        sqxtun          v8.8b, v8.8h
        st1             {v8.8b}, [x0], #8

        /* Another pair of pixels to do? */
        subs            x5, x5, #1
        bne             1b

        /* Restore v8-v15 and release the stack frame. */
        ld1             {v8.1d-v11.1d}, [sp], #32
        ld1             {v12.1d-v15.1d}, [sp], #32
2:
        ret
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
 * rsdIntrinsicConvolve5x5_K
 *
 * 5x5 convolution over five source rows.  Each group of 4 bytes is handled
 * as one pixel of four interleaved 8-bit channels; every loop iteration
 * produces two output pixels (8 bytes).
 *
 * In:
 *      x0 = dst          8 bytes are stored per iteration.
 *      x1 = row y0       24 bytes (6 pixels) are read from each row per
 *      x2 = row y1       iteration, then the pointer advances by 8, so each
 *      x3 = row y2       row is read 16 bytes beyond the two pixels being
 *      x4 = row y3       produced.
 *      x5 = row y4
 *      x6 = coeffs       signed 16-bit; 28 halfwords (56 bytes) are loaded,
 *                        the first 25 are used, five per row in the order
 *                        y0..y4.
 *      x7 = iteration count (two pixels each).  Zero returns immediately.
 *           NOTE(review): the count is consumed as a full 64-bit value;
 *           confirm the C prototype passes a 64-bit quantity.
 *
 * Result for each channel: saturate_u8 of the rounding shift right by 8 of
 * (sum of pixel * coeff + 0x7f).
 * NOTE(review): the 0x7f bias is added on top of rshrn's own rounding
 * constant; confirm that this is the intended rounding.
 *
 * Clobbers: x0-x8, v0-v5, flags.  The low 64 bits of v8-v15 are saved on the
 * stack and restored before returning.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        /* A zero count would wrap in the subs/bne loop below and run ~2^64
         * iterations; leave before touching the stack. */
        cbz             x7, 2f

        /* Spill the low halves of v8-v15 into a 64-byte stack frame. */
        sub             x8, sp, #64
        sub             sp, sp, #64
        st1             {v8.1d-v11.1d}, [x8], #32
        st1             {v12.1d-v15.1d}, [x8]

        /* v0.h[0..7], v1.h[0..7], v2.h[0..7], v3.h[0] = the 25 coefficients.
         * No write-back on the last load: x6 is reused just below. */
        ld1             {v0.8h-v2.8h}, [x6], #48
        ld1             {v3.4h}, [x6]

        /* Bias added to every accumulator lane before the final narrowing. */
        movi            v15.4s, #0x7f

        /* Row step in bytes.  It lives in a register because the immediate
         * post-index form of a three-register 8-byte ld1 only accepts #24. */
        mov             x6, #8

1:
        /* Rows y0 and y1: fetch 6 pixels each, step each pointer by 2 pixels. */
        ld1             {v9.8b-v11.8b}, [x1], x6        // row y0 ( y - 2 )
        ld1             {v12.8b-v14.8b}, [x2], x6       // row y1 ( y - 1 )

        /* Prefetch hints, currently disabled. */
//        prfm        PLDL1KEEP,[x1, x6] // TODO: test this
//        prfm        PLDL1KEEP,[x2, x6] // TODO: test this

        /* Zero-extend every 8-bit channel to 16 bits. */
        uxtl            v9.8h,  v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b
        uxtl            v12.8h, v12.8b
        uxtl            v13.8h, v13.8b
        uxtl            v14.8h, v14.8b

/*
        Widened source pixels, one pixel per 64-bit half:
            y0:  v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
            y1:  v12lo, v12hi, v13lo, v13hi, v14lo, v14hi

        v4 accumulates the first output pixel (source pixels 0..4 of each row),
        v5 accumulates the second one (source pixels 1..5 of each row).
*/
        /* Row y0, coefficients v0.h[0..4] */
        smull           v4.4s, v9.4h, v0.h[0]
        smull2          v5.4s, v9.8h, v0.h[0]
        smlal2          v4.4s, v9.8h, v0.h[1]
        smlal           v5.4s, v10.4h, v0.h[1]
        smlal           v4.4s, v10.4h, v0.h[2]
        smlal2          v5.4s, v10.8h, v0.h[2]
        smlal2          v4.4s, v10.8h, v0.h[3]
        smlal           v5.4s, v11.4h, v0.h[3]
        smlal           v4.4s, v11.4h, v0.h[4]
        smlal2          v5.4s, v11.8h, v0.h[4]

        /* Row y1, coefficients v0.h[5..7], v1.h[0..1] */
        smlal           v4.4s, v12.4h, v0.h[5]
        smlal2          v5.4s, v12.8h, v0.h[5]
        smlal2          v4.4s, v12.8h, v0.h[6]
        smlal           v5.4s, v13.4h, v0.h[6]
        smlal           v4.4s, v13.4h, v0.h[7]
        smlal2          v5.4s, v13.8h, v0.h[7]
        smlal2          v4.4s, v13.8h, v1.h[0]
        smlal           v5.4s, v14.4h, v1.h[0]
        smlal           v4.4s, v14.4h, v1.h[1]
        smlal2          v5.4s, v14.8h, v1.h[1]

        /* Rows y2 and y3: same register layout as the first two rows. */
        ld1             {v9.8b-v11.8b}, [x3], x6        // row y2 ( y )
        ld1             {v12.8b-v14.8b}, [x4], x6       // row y3 ( y + 1 )

        /* Prefetch hints, currently disabled. */
//        prfm        PLDL1KEEP,[x3, x6] // TODO: test this
//        prfm        PLDL1KEEP,[x4, x6] // TODO: test this

        /* Zero-extend every 8-bit channel to 16 bits. */
        uxtl            v9.8h,  v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b
        uxtl            v12.8h, v12.8b
        uxtl            v13.8h, v13.8b
        uxtl            v14.8h, v14.8b

/*
            y2:  v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
            y3:  v12lo, v12hi, v13lo, v13hi, v14lo, v14hi
*/
        /* Row y2, coefficients v1.h[2..6] */
        smlal           v4.4s, v9.4h, v1.h[2]
        smlal2          v5.4s, v9.8h, v1.h[2]
        smlal2          v4.4s, v9.8h, v1.h[3]
        smlal           v5.4s, v10.4h, v1.h[3]
        smlal           v4.4s, v10.4h, v1.h[4]
        smlal2          v5.4s, v10.8h, v1.h[4]
        smlal2          v4.4s, v10.8h, v1.h[5]
        smlal           v5.4s, v11.4h, v1.h[5]
        smlal           v4.4s, v11.4h, v1.h[6]
        smlal2          v5.4s, v11.8h, v1.h[6]

        /* Row y3, coefficients v1.h[7], v2.h[0..3] */
        smlal           v4.4s, v12.4h, v1.h[7]
        smlal2          v5.4s, v12.8h, v1.h[7]
        smlal2          v4.4s, v12.8h, v2.h[0]
        smlal           v5.4s, v13.4h, v2.h[0]
        smlal           v4.4s, v13.4h, v2.h[1]
        smlal2          v5.4s, v13.8h, v2.h[1]
        smlal2          v4.4s, v13.8h, v2.h[2]
        smlal           v5.4s, v14.4h, v2.h[2]
        smlal           v4.4s, v14.4h, v2.h[3]
        smlal2          v5.4s, v14.8h, v2.h[3]

        /* Row y4: the last row, only v9-v11 are reloaded. */
        ld1             {v9.8b-v11.8b}, [x5], x6        // row y4 ( y + 2 )

        /* Prefetch hint, currently disabled. */
//        prfm        PLDL1KEEP,[x5, x6] // TODO: test this

        /* Zero-extend every 8-bit channel to 16 bits. */
        uxtl            v9.8h,  v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b

/*
            y4:  v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
*/

        /* Row y4, coefficients v2.h[4..7], v3.h[0] */
        smlal           v4.4s, v9.4h, v2.h[4]
        smlal2          v5.4s, v9.8h, v2.h[4]
        smlal2          v4.4s, v9.8h, v2.h[5]
        smlal           v5.4s, v10.4h, v2.h[5]
        smlal           v4.4s, v10.4h, v2.h[6]
        smlal2          v5.4s, v10.8h, v2.h[6]
        smlal2          v4.4s, v10.8h, v2.h[7]
        smlal           v5.4s, v11.4h, v2.h[7]
        smlal           v4.4s, v11.4h, v3.h[0]
        smlal2          v5.4s, v11.8h, v3.h[0]

        /* Apply the 0x7f bias to both accumulators. */
        add             v4.4s, v4.4s, v15.4s
        add             v5.4s, v5.4s, v15.4s

        /* Rounding shift right by the 8 fractional bits, narrow 32 -> 16 bit. */
        rshrn           v4.4h, v4.4s, #8
        rshrn2          v4.8h, v5.4s, #8

        /* Saturate to unsigned 8 bit: two pixels end up in the low 64 bits. */
        sqxtun          v4.8b, v4.8h

        /* Emit the two pixels and advance dst. */
        st1             {v4.8b}, [x0], #8

        /* Another pair of pixels to do? */
        subs            x7, x7, #1
        bne             1b

        /* Restore v8-v15 and release the stack frame. */
        ld1             {v8.1d-v11.1d}, [sp], #32
        ld1             {v12.1d-v15.1d}, [sp], #32
2:
        ret

END(rsdIntrinsicConvolve5x5_K)