1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12
13#include "vp8/common/blockd.h"
14
/*
 * Builds the 16x16 luma intra prediction of one macroblock with NEON.
 *
 * The mode is taken from x->mode_info_context->mbmi.mode:
 *   DC_PRED - every pixel is the rounded mean of the available above row
 *             and/or left column; 128 when neither edge is available.
 *   V_PRED  - every row is a copy of the 16 pixels at yabove_row.
 *   H_PRED  - each row is filled with that row's left-column pixel.
 *   TM_PRED - pixel(r, c) = saturate_u8(left[r] + above[c] - above[-1]).
 * Any other mode leaves the destination untouched.
 *
 * x           - macroblock descriptor; supplies the mode and the
 *               up_available / left_available flags (assumed to be 0 or 1,
 *               since their sum is used as a shift adjustment).
 * yabove_row  - 16 pixels above the block; yabove_row[-1] is also read by
 *               TM_PRED.
 * yleft       - top pixel of the left column; successive rows are
 *               left_stride bytes apart.
 * ypred_ptr   - destination: 16 rows of 16 bytes, y_stride bytes apart.
 */
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
                                           unsigned char * yabove_row,
                                           unsigned char * yleft,
                                           int left_stride,
                                           unsigned char * ypred_ptr,
                                           int y_stride) {
  const int mode = x->mode_info_context->mbmi.mode;
  int i;

  switch (mode) {
    case DC_PRED:
    {
      /* Number of usable edges; doubles as the extra shift for the mean. */
      int shift = x->up_available + x->left_available;
      uint8x16_t v_expected_dc = vdupq_n_u8(128);

      if (shift) {
        unsigned int sum = 0;
        int expected_dc;
        if (x->up_available) {
          /* Widening pairwise adds fold 16 bytes into two 64-bit sums. */
          const uint8x16_t v_above = vld1q_u8(yabove_row);
          const uint64x2_t v_sums =
              vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v_above)));
          sum = (unsigned int)(vgetq_lane_u64(v_sums, 0) +
                               vgetq_lane_u64(v_sums, 1));
        }
        if (x->left_available) {
          for (i = 0; i < 16; ++i) {
            sum += yleft[0];
            yleft += left_stride;
          }
        }
        /* One edge: 16 samples (>> 4). Two edges: 32 samples (>> 5). */
        shift += 3;
        expected_dc = (sum + (1 << (shift - 1))) >> shift;
        v_expected_dc = vdupq_n_u8((uint8_t)expected_dc);
      }
      for (i = 0; i < 16; ++i) {
        vst1q_u8(ypred_ptr, v_expected_dc);
        ypred_ptr += y_stride;
      }
    }
    break;
    case V_PRED:
    {
      const uint8x16_t v_above = vld1q_u8(yabove_row);
      for (i = 0; i < 16; ++i) {
        vst1q_u8(ypred_ptr, v_above);
        ypred_ptr += y_stride;
      }
    }
    break;
    case H_PRED:
    {
      for (i = 0; i < 16; ++i) {
        const uint8x16_t v_yleft = vdupq_n_u8(yleft[0]);
        yleft += left_stride;
        vst1q_u8(ypred_ptr, v_yleft);
        ypred_ptr += y_stride;
      }
    }
    break;
    case TM_PRED:
    {
      /* Pixels are unsigned; pass them to the unsigned intrinsics as-is. */
      const uint16x8_t v_ytop_left = vdupq_n_u16(yabove_row[-1]);
      const uint8x16_t v_above = vld1q_u8(yabove_row);
      for (i = 0; i < 16; ++i) {
        const uint8x8_t v_yleft = vdup_n_u8(yleft[0]);
        /* above + left, widened to 16 bits so the sum cannot wrap. */
        const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
        const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
        /* Subtract top-left as signed: the result may be negative. */
        const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
                                         vreinterpretq_s16_u16(v_ytop_left));
        const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
                                         vreinterpretq_s16_u16(v_ytop_left));
        /* Saturating narrow clamps each value to [0, 255]. */
        const uint8x8_t pred_lo = vqmovun_s16(b_lo);
        const uint8x8_t pred_hi = vqmovun_s16(b_hi);

        vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
        ypred_ptr += y_stride;
        yleft += left_stride;
      }
    }
    break;
    default:
      /* Modes not handled here leave the prediction buffer unchanged. */
      break;
  }
}
100
/*
 * Builds the 8x8 chroma (U and V) intra predictions of one macroblock with
 * NEON. Both planes share the mode x->mode_info_context->mbmi.uv_mode:
 *   DC_PRED - each plane is filled with the rounded mean of its available
 *             above row and/or left column; 128 when neither is available.
 *   V_PRED  - every row is a copy of the plane's 8 above pixels.
 *   H_PRED  - each row is filled with that row's left-column pixel.
 *   TM_PRED - pixel(r, c) = saturate_u8(left[r] + above[c] - above[-1]).
 * Any other mode leaves both destinations untouched.
 *
 * x                       - macroblock descriptor; supplies the mode and the
 *                           up_available / left_available flags (assumed to
 *                           be 0 or 1, since their sum adjusts a shift).
 * uabove_row, vabove_row  - 8 pixels above each plane; index -1 is also read
 *                           by TM_PRED.
 * uleft, vleft            - top pixel of each plane's left column; rows are
 *                           left_stride bytes apart.
 * upred_ptr, vpred_ptr    - destinations: 8 rows of 8 bytes each, rows
 *                           pred_stride bytes apart.
 */
void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
                                            unsigned char * uabove_row,
                                            unsigned char * vabove_row,
                                            unsigned char * uleft,
                                            unsigned char * vleft,
                                            int left_stride,
                                            unsigned char * upred_ptr,
                                            unsigned char * vpred_ptr,
                                            int pred_stride) {
  const int mode = x->mode_info_context->mbmi.uv_mode;
  int i;

  switch (mode) {
    case DC_PRED:
    {
      /* Number of usable edges; doubles as the extra shift for the mean. */
      int shift = x->up_available + x->left_available;
      uint8x8_t v_expected_udc = vdup_n_u8(128);
      uint8x8_t v_expected_vdc = vdup_n_u8(128);

      if (shift) {
        unsigned int sum_u = 0;
        unsigned int sum_v = 0;
        int expected_udc;
        int expected_vdc;
        if (x->up_available) {
          /*
           * Pack U into the low half and V into the high half so one chain
           * of widening pairwise adds yields both sums: lane 0 = U,
           * lane 1 = V.
           */
          const uint8x8_t v_uabove = vld1_u8(uabove_row);
          const uint8x8_t v_vabove = vld1_u8(vabove_row);
          const uint64x2_t v_sums = vpaddlq_u32(vpaddlq_u16(
              vpaddlq_u8(vcombine_u8(v_uabove, v_vabove))));
          sum_u = (unsigned int)vgetq_lane_u64(v_sums, 0);
          sum_v = (unsigned int)vgetq_lane_u64(v_sums, 1);
        }
        if (x->left_available) {
          for (i = 0; i < 8; ++i) {
            sum_u += uleft[0];
            uleft += left_stride;
            sum_v += vleft[0];
            vleft += left_stride;
          }
        }
        /* One edge: 8 samples (>> 3). Two edges: 16 samples (>> 4). */
        shift += 2;
        expected_udc = (sum_u + (1 << (shift - 1))) >> shift;
        expected_vdc = (sum_v + (1 << (shift - 1))) >> shift;
        v_expected_udc = vdup_n_u8((uint8_t)expected_udc);
        v_expected_vdc = vdup_n_u8((uint8_t)expected_vdc);
      }
      for (i = 0; i < 8; ++i) {
        vst1_u8(upred_ptr, v_expected_udc);
        upred_ptr += pred_stride;
        vst1_u8(vpred_ptr, v_expected_vdc);
        vpred_ptr += pred_stride;
      }
    }
    break;
    case V_PRED:
    {
      const uint8x8_t v_uabove = vld1_u8(uabove_row);
      const uint8x8_t v_vabove = vld1_u8(vabove_row);
      for (i = 0; i < 8; ++i) {
        vst1_u8(upred_ptr, v_uabove);
        upred_ptr += pred_stride;
        vst1_u8(vpred_ptr, v_vabove);
        vpred_ptr += pred_stride;
      }
    }
    break;
    case H_PRED:
    {
      for (i = 0; i < 8; ++i) {
        const uint8x8_t v_uleft = vdup_n_u8(uleft[0]);
        const uint8x8_t v_vleft = vdup_n_u8(vleft[0]);
        uleft += left_stride;
        vleft += left_stride;
        vst1_u8(upred_ptr, v_uleft);
        upred_ptr += pred_stride;
        vst1_u8(vpred_ptr, v_vleft);
        vpred_ptr += pred_stride;
      }
    }
    break;
    case TM_PRED:
    {
      /* Pixels are unsigned; pass them to the unsigned intrinsics as-is. */
      const uint16x8_t v_utop_left = vdupq_n_u16(uabove_row[-1]);
      const uint16x8_t v_vtop_left = vdupq_n_u16(vabove_row[-1]);
      const uint8x8_t v_uabove = vld1_u8(uabove_row);
      const uint8x8_t v_vabove = vld1_u8(vabove_row);
      for (i = 0; i < 8; ++i) {
        const uint8x8_t v_uleft = vdup_n_u8(uleft[0]);
        const uint8x8_t v_vleft = vdup_n_u8(vleft[0]);
        /* above + left, widened to 16 bits so the sum cannot wrap. */
        const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
        const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
        /* Subtract top-left as signed: the result may be negative. */
        const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
                                        vreinterpretq_s16_u16(v_utop_left));
        const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
                                        vreinterpretq_s16_u16(v_vtop_left));
        /* Saturating narrow clamps each value to [0, 255]. */
        const uint8x8_t pred_u = vqmovun_s16(b_u);
        const uint8x8_t pred_v = vqmovun_s16(b_v);

        vst1_u8(upred_ptr, pred_u);
        vst1_u8(vpred_ptr, pred_v);
        upred_ptr += pred_stride;
        vpred_ptr += pred_stride;
        uleft += left_stride;
        vleft += left_stride;
      }
    }
    break;
    default:
      /* Modes not handled here leave the prediction buffers unchanged. */
      break;
  }
}
211