rsCpuIntrinsics_advsimd_YuvToRGB.S revision 9732e859ff5d1911915eb83411c9b1ae991c7523
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* AArch64 AdvSIMD (NEON) YUV -> RGBA8888 conversion kernels.
 * Assembled with GAS; comments use the C-style and `//` forms only
 * (`;` is a statement separator, not a comment, in GNU AArch64 asm).
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 *
 * Outputs 16 pixels: v0 = r, v1 = g, v2 = b, each with even pixels in the
 * low half and odd pixels in the high half, zipped back into pixel order
 * by the trailing zip1 instructions.
 * Clobbers: v0-v2, v4-v12 (v16/v17 inputs are consumed; v8/v9 overwritten).
 */
.macro  yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b     // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b     // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b    // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b     // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b     // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h     // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h     // b1 = y1 * 149 + (u << 2)

        movi        v7.16b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b   // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b  // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h    // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h    // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h    // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h    // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h    // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h    // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h    // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h    // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h     // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h     // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h    // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h    // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        /* Narrow with rounding back to 8 bits; the r/b channels carry one
         * fewer fractional bit because of the uhadd averaging above. */
        uqrshrn     v0.8b, v0.8h, #6
        uqrshrn     v4.8b, v4.8h, #6
        uqrshrn     v1.8b, v1.8h, #7
        uqrshrn     v5.8b, v5.8h, #7
        uqrshrn     v2.8b, v2.8h, #6
        uqrshrn     v6.8b, v6.8h, #6

        /* Re-interleave even (v0/v1/v2) and odd (v4/v5/v6) pixels. */
        zip1        v0.16b, v0.16b, v4.16b
        zip1        v1.16b, v1.16b, v5.16b
        zip1        v2.16b, v2.16b, v6.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 *
 * Register contract (established by the ENTRY stubs below):
 *   x0 = output (RGBA8888), x1 = Y plane, x2 = pixel count,
 *   x3 = U plane (planar) or interleaved UV/VU plane,
 *   x4 = V plane (planar only).
 * \interleaved selects NV12/NV21-style chroma; \swapuv selects VU (NV21)
 * byte order within the interleaved pairs.
 */
.macro wrap_line kernel, interleaved=0, swapuv=0

        /* Pre-compute the saturating bias constants used by \kernel. */
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5

        movi        v3.16b, #0xff           // constant alpha channel

        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm        PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        /* VU order: ld2 puts even bytes (V) in v17, odd bytes (U) in v18. */
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm        PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm        PLDL1STRM, [x3, #128]
//      prfm        PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.h}[0], [x3], #2
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         *
         * The interleaved chroma stream was loaded with its first eight bytes
         * in v17 and the remaining chunks packed into v16, so the unzips below
         * concatenate {v18 (copy of v16), v17} to keep chunk lanes aligned
         * with the Y lanes produced by the uzp of {v8, v9}.
         */
1:      mov         v18.8b, v8.8b
        uzp1        v8.8b, v18.8b, v9.8b
        uzp2        v9.8b, v18.8b, v9.8b
  .if \interleaved
        mov         v18.8b, v16.8b
    .if \swapuv
        /* VU (NV21) order: U occupies the odd byte positions, so select odd
         * bytes into v16 (u) and even bytes into v17 (v) while keeping the
         * same {v18, v17} source order as the non-swapped path.  (The old
         * code swapped the *source operands* of uzp1/uzp2 instead, which
         * swaps register halves but still picks even bytes — delivering V
         * data into the U register and in the wrong lanes for any NV21 row
         * tail shorter than 16 pixels.)
         */
        uzp2        v16.8b, v18.8b, v17.8b
        uzp1        v17.8b, v18.8b, v17.8b
    .else
        uzp1        v16.8b, v18.8b, v17.8b
        uzp2        v17.8b, v18.8b, v17.8b
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         * After the zips: v0 = pixels 0-3, v1 = 4-7, v2 = 8-11, v3 = 12-15.
         */
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b

        /* Store the same power-of-two chunks, from the lane positions the
         * corresponding loads targeted (8-chunk -> pixels 8-15, 4-chunk ->
         * pixels 4-7, 2-chunk -> pixels 2-3, 1-chunk -> pixel 1). */
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/* void rsdIntrinsicYuv2_K(
 *      void *out,          // x0
 *      void const *yin,    // x1
 *      void const *uin,    // x2
 *      void const *vin,    // x3
 *      size_t xstart,      // x4
 *      size_t xend);       // x5
 *
 * Planar (three-plane) variant.  The chroma planes are subsampled 2x
 * horizontally, hence the halved offset in x6.
 */
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1              // x6 = xstart / 2 (chroma offset)
        add         x0, x0, x4, LSL #2      // out += xstart * 4 (RGBA)
        add         x1, x1, x4              // yin += xstart
        add         x4, x3, x6              // x4 = vin + xstart / 2
        add         x3, x2, x6              // x3 = uin + xstart / 2
        sub         x2, x5, x6, LSL #1      // x2 = pixel count

        /* Save the low halves of v8-v15 (callee-saved per AAPCS64). */
        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line   yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *      void *out,          // x0
 *      void const *yin,    // x1
 *      void const *uvin,   // x2
 *      size_t xstart,      // x3
 *      size_t xend);       // x4
 *
 * Semi-planar variant with interleaved VU chroma (NV21 byte order).
 */
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1              // round xstart down to chroma pair
        add         x0, x0, x5, LSL #2      // out += xstart * 4 (RGBA)
        add         x1, x1, x5              // yin += xstart
        add         x3, x2, x5              // x3 = uvin + xstart
        sub         x2, x4, x5              // x2 = pixel count

        /* Save the low halves of v8-v15 (callee-saved per AAPCS64). */
        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line   yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *      void *out,          // x0
 *      void const *yin,    // x1
 *      void const *uvin,   // x2
 *      size_t xstart,      // x3
 *      size_t xend);       // x4
 *
 * Semi-planar variant with interleaved UV chroma (NV12 byte order).
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1              // round xstart down to chroma pair
        add         x0, x0, x5, LSL #2      // out += xstart * 4 (RGBA)
        add         x1, x1, x5              // yin += xstart
        add         x3, x2, x5              // x3 = uvin + xstart
        sub         x2, x4, x5              // x2 = pixel count

        /* Save the low halves of v8-v15 (callee-saved per AAPCS64). */
        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line   yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)