// decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s

///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  - ihevcd_fmt_conv_420sp_to_rgba8888_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

    // Assembly-time constant. Nothing in this file reads it.
    // NOTE(review): confirm ihevc_neon_macros.s does not depend on it before
    // removing it.
    .equ DO1STROUNDING, 0

    // ARM
    //
    // PRESERVE8
    //
    // NOTE(review): the two keywords above look like leftovers from an
    // ARM-toolchain (armasm) version of this file; here they are plain comments
    // and have no effect.

// Code goes into the .text section, aligned to 2^2 = 4 bytes.
.text
.p2align 2

// Shared NEON helper macro definitions (contents not visible in this file).
.include "ihevc_neon_macros.s"


///*****************************************************************************
//*
//*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888_av8()
//*
//*  Description      : Converts a YUV 4:2:0 semi-planar image (one luma plane
//*                     plus one plane of interleaved chroma byte pairs, the
//*                     first byte of each pair being treated as U) into 32-bit
//*                     pixels. Two luma rows share one chroma row and are
//*                     converted together, 16 pixels per step.
//*
//*                     Per pixel, with signed 16-bit Q13 coefficients, an
//*                     arithmetic shift and saturation of the result to 0..255:
//*                       R = Y + (((V - 128) * 0x3311) >> 13)
//*                       G = Y + (((U - 128) * 0xF379 + (V - 128) * 0xE5F8) >> 13)
//*                       B = Y + (((U - 128) * 0x4092) >> 13)
//*                     Each pixel is stored as the four bytes B, G, R, 0
//*                     (in increasing address order).
//*
//*  Arguments        : x0   luma (Y) source pointer
//*                     x1   interleaved chroma (UV) source pointer
//*                     x2   32-bit pixel destination pointer
//*                     w3   width in pixels
//*                     w4   height in rows
//*                     w5   luma stride in bytes
//*                     w6   chroma stride in bytes
//*                     w7   destination stride in pixels (4 bytes each)
//*
//*                     w3-w7 are sign-extended to 64 bits on entry. This
//*                     assumes the C prototype declares them as 32-bit signed
//*                     integers, in which case the upper register halves are
//*                     unspecified on entry - TODO confirm against the
//*                     declaration.
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x0 - x8, x10, x11, x14, v0 - v7, v16 - v31
//*                     (no callee-saved register is touched)
//*
//*  Stack Usage      : None
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:     Assumed to be multiple of 16 and
//*                                      greater than or equal to 16
//*                     Image Height:    Assumed to be even.
//*                     If width < 16 or height < 2 the function returns
//*                     without writing anything.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*         05 08 2013   Naveen K P      Modified for HEVC
//*****************************************************************************/

//// Vector register roles shared by the macros and the function below:
////   v0        h[0..3] = C1, C2, C3, C4 (signed Q13 coefficients)
////   v1        sixteen... eight bytes of 128 (chroma bias)
////   v2, v3    8 x U, 8 x V for the current 16 pixels
////   v4, v6    U - 128, V - 128 widened to 16 bits
////   v5, v7    per-chroma-sample B and R offsets (8 x 16-bit)
////   v16       per-chroma-sample G offset (8 x 16-bit)
////   v17       all zero (alpha byte)
////   v18-v27   scratch for accumulation / packing
////   v28, v29  second luma row: even-column Y, odd-column Y
////   v30, v31  first luma row:  even-column Y, odd-column Y

//// Turns the chroma samples in v2 (U) / v3 (V) into the colour offsets
//// v5 (B), v7 (R) and v16 (G). Scratch: v4, v6, v18-v23.
.macro yuv420sp_rgba8888_chroma_weights
    usubl       v4.8h, v2.8b, v1.8b         //// U - 128 (wraps to the signed 16-bit value)
    usubl       v6.8h, v3.8b, v1.8b         //// V - 128

    smull       v18.4s, v4.4h, v0.h[3]      //// (U-128)*C4, samples 0-3
    smull2      v19.4s, v4.8h, v0.h[3]      //// (U-128)*C4, samples 4-7
    smull       v20.4s, v6.4h, v0.h[0]      //// (V-128)*C1, samples 0-3
    smull2      v21.4s, v6.8h, v0.h[0]      //// (V-128)*C1, samples 4-7
    smull       v22.4s, v4.4h, v0.h[1]      //// (U-128)*C2, samples 0-3
    smlal       v22.4s, v6.4h, v0.h[2]      ////   + (V-128)*C3
    smull2      v23.4s, v4.8h, v0.h[1]      //// (U-128)*C2, samples 4-7
    smlal2      v23.4s, v6.8h, v0.h[2]      ////   + (V-128)*C3

    //// drop the 13 fraction bits and narrow back to 16 bits
    sqshrn      v5.4h, v18.4s, #13
    sqshrn2     v5.8h, v19.4s, #13          //// v5  = B offsets
    sqshrn      v7.4h, v20.4s, #13
    sqshrn2     v7.8h, v21.4s, #13          //// v7  = R offsets
    sqshrn      v16.4h, v22.4s, #13
    sqshrn2     v16.8h, v23.4s, #13         //// v16 = G offsets
.endm

//// Adds one row of luma to the colour offsets. Each chroma sample serves one
//// even-column and one odd-column pixel, so both luma vectors use the same
//// offsets. Results (signed 16-bit, not yet clamped):
////   v18/v19/v20 = B/G/R of the even columns
////   v21/v22/v23 = B/G/R of the odd columns
.macro yuv420sp_rgba8888_add_luma yeven, yodd
    uaddw       v18.8h, v5.8h, \yeven       //// Y + B offset
    uaddw       v19.8h, v16.8h, \yeven      //// Y + G offset
    uaddw       v20.8h, v7.8h, \yeven       //// Y + R offset

    uaddw       v21.8h, v5.8h, \yodd        //// Y + B offset
    uaddw       v22.8h, v16.8h, \yodd       //// Y + G offset
    uaddw       v23.8h, v7.8h, \yodd        //// Y + R offset
.endm

//// Clamps the sums left in v18-v23 by yuv420sp_rgba8888_add_luma to 0..255,
//// interleaves them into 16 pixels of B,G,R,0 bytes in column order and
//// stores the 64 bytes at [dst], advancing dst by 64.
//// Expects v17 = 0. Scratch: v18-v27.
.macro yuv420sp_rgba8888_pack_store dst
    sqxtun      v18.8b, v18.8h              //// B, even columns
    sqxtun      v19.8b, v19.8h              //// G, even columns
    sqxtun      v20.8b, v20.8h              //// R, even columns
    sqxtun      v21.8b, v21.8h              //// B, odd columns
    sqxtun      v22.8b, v22.8h              //// G, odd columns
    sqxtun      v23.8b, v23.8h              //// R, odd columns

    //// bytes -> (B,G) and (R,0) byte pairs; zip1 on .16b interleaves the low
    //// 8 bytes of both sources into a full 16-byte result
    zip1        v18.16b, v18.16b, v19.16b   //// even columns: B,G pairs
    zip1        v20.16b, v20.16b, v17.16b   //// even columns: R,0 pairs
    zip1        v21.16b, v21.16b, v22.16b   //// odd columns:  B,G pairs
    zip1        v23.16b, v23.16b, v17.16b   //// odd columns:  R,0 pairs

    //// byte pairs -> complete 32-bit pixels
    zip1        v24.8h, v18.8h, v20.8h      //// even pixels 0,2,4,6
    zip2        v25.8h, v18.8h, v20.8h      //// even pixels 8,10,12,14
    zip1        v26.8h, v21.8h, v23.8h      //// odd pixels 1,3,5,7
    zip2        v27.8h, v21.8h, v23.8h      //// odd pixels 9,11,13,15

    //// merge even and odd columns back into column order
    zip1        v18.4s, v24.4s, v26.4s      //// pixels 0-3
    zip2        v19.4s, v24.4s, v26.4s      //// pixels 4-7
    zip1        v20.4s, v25.4s, v27.4s      //// pixels 8-11
    zip2        v21.4s, v25.4s, v27.4s      //// pixels 12-15

    st1         {v18.4s, v19.4s, v20.4s, v21.4s}, [\dst], #64
.endm

    .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
ihevcd_fmt_conv_420sp_to_rgba8888_av8:

    //// The size/stride arguments arrive in w3-w7; widen them before they take
    //// part in 64-bit address arithmetic (see the note in the header).
    sxtw        x3, w3                      //// width
    sxtw        x4, w4                      //// height
    sxtw        x5, w5                      //// luma stride
    sxtw        x6, w6                      //// chroma stride
    sxtw        x7, w7                      //// destination stride (pixels)

    //// Both loops below are "do ... while (--count != 0)", so a count of zero
    //// would wrap around. Nothing can be converted in that case: leave.
    cmp         x3, #16
    b.lt        LABEL_YUV420SP_TO_RGB8888_EXIT
    cmp         x4, #2
    b.lt        LABEL_YUV420SP_TO_RGB8888_EXIT

    //// v0.h[0..3] = C1, C2, C3, C4
    movz        x10, #0x3311                //// C1: V -> R
    movk        x10, #0xF379, lsl #16       //// C2: U -> G
    movk        x10, #0xE5F8, lsl #32       //// C3: V -> G
    movk        x10, #0x4092, lsl #48       //// C4: U -> B
    fmov        d0, x10

    movi        v1.8b, #128                 //// chroma bias
    movi        v17.16b, #0                 //// alpha byte written to every pixel

    //// gap between the end of one row and the start of the next
    sub         x10, x5, x3                 //// luma_offset   = luma_stride   - width (bytes)
    sub         x11, x6, x3                 //// chroma_offset = chroma_stride - width (bytes)
    sub         x14, x7, x3                 //// rgb_offset    = rgb_stride    - width (pixels)

    //// second-row pointers; x7 and x5 are recycled, so the order matters
    add         x8, x2, x7, lsl #2          //// rgb_next_row  = rgb  + rgb_stride * 4
    add         x7, x0, x5                  //// luma_next_row = luma + luma_stride

    lsr         x5, x4, #1                  //// height_cnt = height / 2 (row pairs)

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    //// prime the pipeline with the first 16 pixels of this row pair
    ld2         {v2.8b, v3.8b}, [x1], #16   //// v2 = 8 x U, v3 = 8 x V

    lsr         x6, x3, #4                  //// width_cnt = width / 16

    ld2         {v30.8b, v31.8b}, [x0], #16 //// row 1: v30 = Y0,Y2,..,Y14  v31 = Y1,Y3,..,Y15
    ld2         {v28.8b, v29.8b}, [x7], #16 //// row 2: v28 = Y0,Y2,..,Y14  v29 = Y1,Y3,..,Y15

    //// the last 16 pixels of a row are handled after the loop, without
    //// fetching any further input
    subs        x6, x6, #1
    b.eq        LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    yuv420sp_rgba8888_chroma_weights

    //// v2/v3 are consumed: fetch the chroma of the next 16 pixels
    ld2         {v2.8b, v3.8b}, [x1], #16
    prfm        pldl1keep, [x1]

    //// first row
    yuv420sp_rgba8888_add_luma v30.8b, v31.8b
    yuv420sp_rgba8888_pack_store x2

    //// second row
    yuv420sp_rgba8888_add_luma v28.8b, v29.8b

    //// both luma rows are consumed: fetch the next 16 pixels of each
    ld2         {v30.8b, v31.8b}, [x0], #16
    ld2         {v28.8b, v29.8b}, [x7], #16
    prfm        pldl1keep, [x0]
    prfm        pldl1keep, [x7]

    yuv420sp_rgba8888_pack_store x8

    subs        x6, x6, #1                  //// width_cnt -= 1
    b.ne        LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    //// last 16 pixels of the row pair: same work, no look-ahead loads
    yuv420sp_rgba8888_chroma_weights

    yuv420sp_rgba8888_add_luma v30.8b, v31.8b
    yuv420sp_rgba8888_pack_store x2

    yuv420sp_rgba8888_add_luma v28.8b, v29.8b
    yuv420sp_rgba8888_pack_store x8

    //// x7 / x8 now point just past the second row; step over the row gap to
    //// reach the first row of the next pair
    add         x0, x7, x10                 //// luma = luma_next_row + luma_offset
    add         x2, x8, x14, lsl #2         //// rgb  = rgb_next_row  + rgb_offset * 4
    add         x1, x1, x11                 //// chroma += chroma_offset

    //// second row of the next pair = first row + (width + offset) = + stride
    add         x7, x0, x3
    add         x7, x7, x10
    add         x8, x2, x3, lsl #2
    add         x8, x8, x14, lsl #2

    subs        x5, x5, #1                  //// height_cnt -= 1
    b.ne        LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

LABEL_YUV420SP_TO_RGB8888_EXIT:
    ret


    .section .note.GNU-stack,"",%progbits