/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ENTRY(f) opens a global function symbol f in .text, aligned with .align 4;
 * END(f) records its size.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into v8 and v9 respectively; the low halves of those registers
 * produce the *_lo results (v0-v2) and the high halves produce the *_hi
 * results (v4-v6).  U and V are in v10 and v11 by default, or in whichever
 * registers are named by the regu/regv macro arguments.  Working constants
 * are pre-loaded into v24-v31, and v3 and v7 are pre-loaded with a constant
 * 0xff alpha channel.
 *
 * On exit each of v0-v2 and v4-v6 holds 16 bytes of one colour channel with
 * the even and odd pixels re-interleaved.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
.macro yuvkern, regu=v10, regv=v11
        /* v0   out R_lo / even R_lo accumulator
         * v1   out G_lo / even G_lo accumulator
         * v2   out B_lo / even B_lo accumulator
         * v3   out A_lo / const 0xff
         * v4   out R_hi / even R_hi accumulator
         * v5   out G_hi / even G_hi accumulator
         * v6   out B_hi / even B_hi accumulator
         * v7   out A_hi / const 0xff
         * v8   even Y   / G_lo chroma tmp (u * 50 + v * 104)
         * v9   odd Y    / G_hi chroma tmp (u_hi * 50 + v_hi * 104)
         * \regu in U
         * \regv in V
         * v12  R_lo chroma tmp (v * 204)
         * v13  B_lo chroma tmp (u * 254)
         * v14  R_hi chroma tmp (v_hi * 204)
         * v15  B_hi chroma tmp (u_hi * 254)
         * v16  odd R_lo accumulator
         * v17  odd G_lo accumulator
         * v18  odd B_lo accumulator
         * v19  multiplier extra bits: (v >> 1), then (u << 2) low
         * v20  odd R_hi accumulator
         * v21  odd G_hi accumulator
         * v22  odd B_hi accumulator
         * v23  multiplier extra bits: (u << 2) high
         * v24  constant 149
         * v25  constant 50
         * v26  constant 104
         * v27  constant 204
         * v28  constant 254
         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */

        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149

        /* The Y inputs have been consumed, so v8/v9 are reused for the green
         * chroma term.
         */
        umull       v8.8h, \regu\().8b, v25.8b      // g2 = u * 50 + v * 104
        umlal       v8.8h, \regv\().8b, v26.8b
        umull2      v9.8h, \regu\().16b, v25.16b    // g2_hi = u_hi * 50 + v_hi * 104
        umlal2      v9.8h, \regv\().16b, v26.16b

        ushr        v19.16b, \regv\().16b, #1
        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)

        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)

        ushll       v19.8h, \regu\().8b,  #2
        ushll2      v23.8h, \regu\().16b, #2
        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)

        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)

        umull       v12.8h, \regv\().8b, v27.8b     // r2 = v * 204
        umull       v13.8h, \regu\().8b, v28.8b     // b2 = u * 254

        umull2      v14.8h, \regv\().16b, v27.16b   // r2_hi = v_hi * 204
        umull2      v15.8h, \regu\().16b, v28.16b   // b2_hi = u_hi * 254

        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1

        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        /* Narrow back to bytes with rounding and saturation.  R and B were
         * already halved by uhadd above, so they shift by one bit less than G.
         */
        uqrshrn     v0.8b,  v0.8h,  #6
        uqrshrn     v16.8b, v16.8h, #6
        uqrshrn     v1.8b,  v1.8h,  #7
        uqrshrn     v17.8b, v17.8h, #7
        uqrshrn     v2.8b,  v2.8h,  #6
        uqrshrn     v18.8b, v18.8h, #6

        uqrshrn     v4.8b,  v4.8h,  #6
        uqrshrn     v20.8b, v20.8h, #6
        uqrshrn     v5.8b,  v5.8h,  #7
        uqrshrn     v21.8b, v21.8h, #7
        uqrshrn     v6.8b,  v6.8h,  #6
        uqrshrn     v22.8b, v22.8h, #6

        /* Re-interleave the even and odd pixel results into pixel order. */
        zip1        v0.16b, v0.16b, v16.16b
        zip1        v1.16b, v1.16b, v17.16b
        zip1        v2.16b, v2.16b, v18.16b

        zip1        v4.16b, v4.16b, v20.16b
        zip1        v5.16b, v5.16b, v21.16b
        zip1        v6.16b, v6.16b, v22.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 *
 * Macro arguments:
 *   kernel       name of the conversion macro to invoke (e.g. yuvkern)
 *   interleaved  non-zero: chroma is read as byte pairs from x3 only;
 *                zero: chroma is read from two separate planes at x3 and x4
 *   swapuv       non-zero: invoke the kernel with regu=v11, regv=v10, i.e.
 *                swap which loaded register is treated as U and which as V
 *
 * Register usage on entry:
 *   x0  output pointer, advanced by four bytes per pixel
 *   x1  Y pointer, advanced by one byte per pixel
 *   x2  number of pixels to convert
 *   x3  first chroma pointer (or the interleaved chroma pointer)
 *   x4  second chroma pointer (only read when interleaved is zero)
 *   w5  scratch, used to build the 16-bit constants
 */
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi        v24.16b, #149
        movi        v25.16b, #50
        movi        v26.16b, #104
        movi        v27.16b, #204
        movi        v28.16b, #254
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v29.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v30.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v31.8h, w5

        movi        v3.16b, #0xff
        movi        v7.16b, #0xff

        /* x2 is kept biased by -32 while in the main loop so that the flags
         * from subs tell us directly whether another full group of 32 pixels
         * remains.
         */
        subs        x2, x2, #32
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.16b,v9.16b}, [x1], #32
  .if \interleaved
        ld2         {v10.16b,v11.16b}, [x3], #32
  .else
        ld1         {v10.16b}, [x3], #16
        ld1         {v11.16b}, [x4], #16
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        subs        x2, x2, #32

        st4         {v0.16b - v3.16b}, [x0], #64
        st4         {v4.16b - v7.16b}, [x0], #64

        bhs         1b

        /* Undo the bias; if nothing is left over then skip to the end. */
2:      adds        x2, x2, #32
        beq         2f

        /* To handle the tail portion of the data (something less than 32
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v10.8b, #0
        movi        v11.8b, #0

        tbz         x2, #4, 1f
        ld1         {v9.16b}, [x1], #16
  .if \interleaved
        ld1         {v11.16b}, [x3], #16
  .else
        ld1         {v10.d}[1], [x3], #8
        ld1         {v11.d}[1], [x4], #8
  .endif
1:      tbz         x2, #3, 1f
        ld1         {v8.d}[1], [x1], #8
  .if \interleaved
        ld1         {v10.d}[1], [x3], #8
  .else
        ld1         {v10.s}[1], [x3], #4
        ld1         {v11.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v10.s}[1], [x3], #4
  .else
        ld1         {v10.h}[1], [x3], #2
        ld1         {v11.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v10.h}[1], [x3], #2
  .else
        ld1         {v10.b}[1], [x3], #1
        ld1         {v11.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v10.h}[0], [x3], #2
  .else
        ld1         {v10.b}[0], [x3], #1
        ld1         {v11.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
1:      mov         v12.16b, v8.16b
        uzp1        v8.16b, v12.16b, v9.16b
        uzp2        v9.16b, v12.16b, v9.16b
  .if \interleaved
        mov         v12.16b, v10.16b
        uzp1        v10.16b, v12.16b, v11.16b
        uzp2        v11.16b, v12.16b, v11.16b
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        zip1        v16.16b, v0.16b, v2.16b
        zip2        v18.16b, v0.16b, v2.16b
        zip1        v17.16b, v1.16b, v3.16b
        zip2        v19.16b, v1.16b, v3.16b
        zip1        v0.16b, v16.16b, v17.16b
        zip2        v1.16b, v16.16b, v17.16b
        zip1        v2.16b, v18.16b, v19.16b
        zip2        v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be re-packed: they are only ever stored
         * as a complete set of four, which st4 can interleave directly.
         */

        /* Store the chunks in the same order, and from the same register
         * positions, as the loads above placed them.
         */
        tbz         x2, #4, 1f
        st4         {v4.16b - v7.16b}, [x0], #64
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 *
 * Converts a row with separate U and V planes.  The starting position is
 * rounded down to an even pixel so that the output, Y and chroma pointers
 * and the pixel count all describe the same span (matching the behaviour of
 * rsdIntrinsicYuv_K and rsdIntrinsicYuvR_K).  For an even xstart the offsets
 * are exactly xstart pixels.
 *
 * The low 64 bits of v8-v15 are saved on the stack around the conversion and
 * restored before returning.
 */
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1                  // x6 = index of first chroma sample
        add         x0, x0, x6, LSL #3          // out += (xstart & ~1) * 4
        add         x1, x1, x6, LSL #1          // yin += (xstart & ~1)
        add         x4, x3, x6                  // x4 = vin + chroma index
        add         x3, x2, x6                  // x3 = uin + chroma index
        sub         x2, x5, x6, LSL #1          // x2 = xend - (xstart & ~1)

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 *
 * Converts a row with interleaved chroma where the first byte of each chroma
 * pair is used as V and the second as U (wrap_line is invoked with swapuv
 * set).  The starting position is rounded down to an even pixel.
 *
 * The low 64 bits of v8-v15 are saved on the stack around the conversion and
 * restored before returning.
 */
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1                  // x5 = xstart & ~1
        add         x0, x0, x5, LSL #2          // out += x5 * 4
        add         x1, x1, x5                  // yin += x5
        add         x3, x2, x5                  // x3 = uvin + x5 (two chroma bytes per pixel pair)
        sub         x2, x4, x5                  // x2 = xend - x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 *
 * Converts a row with interleaved chroma where the first byte of each chroma
 * pair is used as U and the second as V (wrap_line is invoked without
 * swapuv).  The starting position is rounded down to an even pixel.
 *
 * The low 64 bits of v8-v15 are saved on the stack around the conversion and
 * restored before returning.
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1                  // x5 = xstart & ~1
        add         x0, x0, x5, LSL #2          // out += x5 * 4
        add         x1, x1, x5                  // yin += x5
        add         x3, x2, x5                  // x3 = uvin + x5 (two chroma bytes per pixel pair)
        sub         x2, x4, x5                  // x2 = xend - x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)