/* rsCpuIntrinsics_advsimd_YuvToRGB.S, revision ccd7a46d0c0052209bf3ab8657f40622065d1d1f */
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
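/* For reference, a plain C sketch of the per-pixel arithmetic the macro below
 * implements, as read from its constants (this sketch is not part of the
 * original build; satu8() and yuv_to_rgba() are invented names).  The
 * coefficients are the usual BT.601 limited-range ones in 7-bit fixed point:
 * 149 ~ 1.164*128, 204.5 ~ 1.596*128, 50 ~ 0.391*128, 104 ~ 0.813*128 and
 * 258 (= 4 + 254) ~ 2.018*128.
 *
 *     #include <stdint.h>
 *
 *     static uint8_t satu8(int x) { return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x; }
 *
 *     static void yuv_to_rgba(uint8_t y, uint8_t u, uint8_t v, uint8_t *rgba) {
 *         int yy = 149 * y;
 *         // 32-bit intermediates here; the vector code stays within 16 bits
 *         // by halving (uhadd) and saturating (uqadd/uqsub) at each step.
 *         int r = ((yy + (v >> 1) + 204 * v) >> 1) - ((16*149 + (128 >> 1) + 128*204) >> 1);
 *         int g =   yy + (-16*149 + 128*50 + 128*104) - (50 * u + 104 * v);
 *         int b = ((yy + (u << 2) + 254 * u) >> 1) - ((16*149 + (128 << 2) + 128*254) >> 1);
 *         rgba[0] = satu8((r + 32) >> 6);   // uqrshrn #6
 *         rgba[1] = satu8((g + 64) >> 7);   // uqrshrn #7
 *         rgba[2] = satu8((b + 32) >> 6);   // uqrshrn #6
 *         rgba[3] = 0xff;
 *     }
 */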
.macro yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)

        movi        v7.16b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b      // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b     // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b, v0.8h, #6
        uqrshrn     v4.8b, v4.8h, #6
        uqrshrn     v1.8b, v1.8h, #7
        uqrshrn     v5.8b, v5.8h, #7
        uqrshrn     v2.8b, v2.8h, #6
        uqrshrn     v6.8b, v6.8h, #6

        zip1        v0.16b, v0.16b, v4.16b
        zip1        v1.16b, v1.16b, v5.16b
        zip1        v2.16b, v2.16b, v6.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
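/* In outline, each wrapper generated from this macro behaves like the C
 * sketch below (illustrative only; convert_line() is an invented name and the
 * planar layout is shown -- the interleaved variants differ only in how the
 * chroma bytes are fetched).  yuv_to_rgba() is the sketch given above.
 *
 *     static void convert_line(uint8_t *out, const uint8_t *yin,
 *                              const uint8_t *uin, const uint8_t *vin,
 *                              size_t n) {          // n = pixel count, in x2
 *         for (size_t i = 0; i < n; i++)
 *             yuv_to_rgba(yin[i], uin[i / 2], vin[i / 2], out + 4 * i);
 *     }
 *
 * The assembly does this 16 pixels at a time, then makes one final padded
 * pass over the 1..15 leftover pixels and stores only the valid part.
 */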
.macro wrap_line kernel, interleaved=0, swapuv=0

        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5

        movi        v3.16b, #0xff

        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm PLDL1STRM, [x3, #128]
//      prfm PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
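        /* For example, a 13-pixel remainder (binary 1101) is gathered as an
         * 8-pixel, a 4-pixel and a 1-pixel load, selected by bits 3, 2 and 0
         * of x2; the store sequence at the end of the macro writes the same
         * chunk sizes back out:
         *
         *     for (int bit = 3; bit >= 0; bit--)           // illustrative C
         *         if (remainder & (1 << bit))
         *             load_chunk(1 << bit);                // hypothetical helper
         */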
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.b}[1], [x3], #1
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
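        /* uzp1 Vd.16b, Vn.16b, Vm.16b keeps the even-numbered bytes of its two
         * sources, Vd = { Vn[0], Vn[2], ..., Vn[14], Vm[0], Vm[2], ..., Vm[14] },
         * which recovers the deinterleaved view that ld2 would have produced for
         * these bytes on the full-width path above.
         */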
1:      uzp1        v8.16b, v8.16b, v9.16b
  .if \interleaved
    .if \swapuv
        uzp1        v16.16b, v17.16b, v16.16b
    .else
        uzp1        v16.16b, v16.16b, v17.16b
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
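        /* Each zip1/zip2 pair interleaves two registers, so the two rounds below
         * convert the planar R (v0), G (v1), B (v2) and A (v3) bytes into packed
         * RGBA, leaving pixels 0-3 in v0, 4-7 in v1, 8-11 in v2 and 12-15 in v3:
         * the same memory layout that st4 produces on the full-width path.
         */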
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b

1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
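/* Fully planar chroma: uin and vin are half the width of yin, so they advance
 * by xstart / 2 while yin advances by xstart and the RGBA output by 4 * xstart.
 */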
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1
        add         x0, x0, x4, LSL #2
        add         x1, x1, x4
        add         x4, x3, x6
        add         x3, x2, x6
        sub         x2, x5, x6, LSL #1

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
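/* Semi-planar chroma: uvin holds one interleaved V,U byte pair per two pixels
 * (this entry treats the first byte of each pair as V), so it advances by the
 * same even-aligned xstart offset as yin.
 */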
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
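/* Same as rsdIntrinsicYuv_K, but the first byte of each chroma pair is taken
 * as U rather than V.
 */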
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
293