//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//*******************************************************************************
//*
//* @brief
//*     Interprediction luma function for copy
//*
//* @par Description:
//*   Copies the array of width 'wd' and height 'ht' from the location pointed
//*   to by 'src' to the location pointed to by 'dst'
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_inter_pred_luma_copy (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd   )

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x7 =>  ht
//    x12 => wd

// Code section, 4-byte (2^2) alignment for A64 instructions, and the shared
// macro file that supplies push_v_regs / pop_v_regs used by the routines below.
.text
.p2align 2
.include "ih264_neon_macros.s"



798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_inter_pred_luma_copy_av8
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_inter_pred_luma_copy_av8:
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    push_v_regs
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stp       x19, x20, [sp, #-16]!
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       x12, x5
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       x7, x4
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    cmp       x7, #0                    //checks ht == 0
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ble       end_loops
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    tst       x12, #15                  //checks wd for multiples for 4 & 8
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       core_loop_wd_16
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    tst       x12, #7                   //checks wd for multiples for 4 & 8
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    beq       core_loop_wd_8
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x11, x12, #4
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Souter_loop_wd_4:
978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x4, x12, #0               //checks wd == 0
988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ble       end_inner_loop_wd_4
998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sinner_loop_wd_4:
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.s}[0], [x0]           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x5, x0, x2                //pu1_src_tmp += src_strd
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x6, x1, x3                //pu1_dst_tmp += dst_strd
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v0.s}[0], [x1]           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.s}[0], [x5], x2       //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x0, x0, #4                //pu1_src += 4
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v0.s}[0], [x6], x3       //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.s}[0], [x5], x2       //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x4, x4, #4                //(wd -4)
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v0.s}[0], [x6], x3       //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.s}[0], [x5], x2       //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x1, x1, #4                //pu1_dst += 4
1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v0.s}[0], [x6], x3       //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bgt       inner_loop_wd_4
1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Send_inner_loop_wd_4:
1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x7, x7, #4                //ht - 4
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x0, x5, x11               //pu1_src = pu1_src_tmp
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x1, x6, x11               //pu1_dst = pu1_dst_tmp
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bgt       outer_loop_wd_4
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Send_loops:
1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // LDMFD sp!,{x4-x12,x15}                  //Reload the registers from SP
1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldp       x19, x20, [sp], #16
1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pop_v_regs
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ret
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Score_loop_wd_8:
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x11, x12, #8
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Souter_loop_wd_8:
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x4, x12, #0               //checks wd
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ble       end_inner_loop_wd_8
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sinner_loop_wd_8:
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x5, x0, x2                //pu1_src_tmp += src_strd
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v0.8b}, [x0], #8         //vld1_u8(pu1_src_tmp)
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x6, x1, x3                //pu1_dst_tmp += dst_strd
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v0.8b}, [x1], #8         //vst1_u8(pu1_dst_tmp, tmp_src)
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v1.8b}, [x5], x2         //vld1_u8(pu1_src_tmp)
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v1.8b}, [x6], x3         //vst1_u8(pu1_dst_tmp, tmp_src)
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x4, x4, #8                //wd - 8(Loop condition)
1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.8b}, [x5], x2         //vld1_u8(pu1_src_tmp)
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v2.8b}, [x6], x3         //vst1_u8(pu1_dst_tmp, tmp_src)
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v3.8b}, [x5], x2         //vld1_u8(pu1_src_tmp)
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v3.8b}, [x6], x3         //vst1_u8(pu1_dst_tmp, tmp_src)
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bgt       inner_loop_wd_8
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Send_inner_loop_wd_8:
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x7, x7, #4                //ht -= 4
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x0, x5, x11               //pu1_src = pu1_src_tmp
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x1, x6, x11               //pu1_dst = pu1_dst_tmp
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bgt       outer_loop_wd_8
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // LDMFD sp!,{x4-x12,x15}                  //Reload the registers from SP
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldp       x19, x20, [sp], #16
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pop_v_regs
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ret
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Score_loop_wd_16:
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x11, x12, #16
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Souter_loop_wd_16:
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x4, x12, #0               //checks wd
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ble       end_inner_loop_wd_16
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sinner_loop_wd_16:
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x5, x0, x2                //pu1_src_tmp += src_strd
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       { v0.16b}, [x0], #16      //vld1_u8(pu1_src_tmp)
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    add       x6, x1, x3                //pu1_dst_tmp += dst_strd
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       { v0.16b}, [x1], #16      //vst1_u8(pu1_dst_tmp, tmp_src)
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       { v2.16b}, [x5], x2       //vld1_u8(pu1_src_tmp)
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       { v2.16b}, [x6], x3       //vst1_u8(pu1_dst_tmp, tmp_src)
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x4, x4, #16               //wd - 8(Loop condition)
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       { v4.16b}, [x5], x2       //vld1_u8(pu1_src_tmp)
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       { v4.16b}, [x6], x3       //vst1_u8(pu1_dst_tmp, tmp_src)
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       { v6.16b}, [x5], x2       //vld1_u8(pu1_src_tmp)
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       { v6.16b}, [x6], x3       //vst1_u8(pu1_dst_tmp, tmp_src)
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bgt       inner_loop_wd_16
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Send_inner_loop_wd_16:
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs      x7, x7, #4                //ht -= 4
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x0, x5, x11               //pu1_src = pu1_src_tmp
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub       x1, x6, x11               //pu1_dst = pu1_dst_tmp
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bgt       outer_loop_wd_16
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldp       x19, x20, [sp], #16
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pop_v_regs
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ret
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
// /*
// ********************************************************************************
// *
// * @brief This function copies a 4x4 block to destination
// *
// * @par Description:
// * Copies a 4x4 block to destination, where both src and dst are interleaved
// *
// * @param[in] pi2_src
// *  Source
// *
// * @param[in] pu1_out
// *  Output pointer
// *
// * @param[in] pred_strd
// *  Prediction buffer stride
// *
// * @param[in] out_strd
// *  Output buffer stride
// *
// * @returns none
// *
// * @remarks
// * Currently wd and ht are not used, i.e. a 4x4 block is always copied
// *
// *******************************************************************************
// */
// void ih264_interleave_copy(WORD16 *pi2_src,
//                            UWORD8 *pu1_out,
//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
//                            WORD32 ht)
// Register Usage
// x0 : pi2_src
// x1 : pu1_out
// x2 : pred_strd
// x3 : out_strd
// Neon registers v2-v5, v18-v21 and v30 are used
// No need for pushing arm and neon registers

2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_interleave_copy_av8
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_interleave_copy_av8:
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    push_v_regs
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v2.8b}, [x0], x2         //load src plane 1 => d2 &pred palne 2 => d3
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v3.8b}, [x0], x2
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v2.d[1], v3.d[0]
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v4.8b}, [x0], x2
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v5.8b}, [x0], x2
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v4.d[1], v5.d[0]
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       x0, x1
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v18.8b}, [x1], x3        //load out [8 bit size) -8 coeffs
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v19.8b}, [x1], x3
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v18.d[1], v19.d[0]
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    movi      v30.8h, #0x00ff
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v20.8b}, [x1], x3
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ld1       {v21.8b}, [x1], x3
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    mov       v20.d[1], v21.d[0]
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bit       v18.16b, v2.16b , v30.16b
2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bit       v20.16b, v4.16b , v30.16b
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v18.8b}, [x0], x3        //store  out
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v18.d}[1], [x0], x3
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v20.8b}, [x0], x3
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    st1       {v20.d}[1], [x0], x3
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pop_v_regs
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ret
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
268