1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vp8_rtcd.h" 12#include "vp8/common/mips/msa/vp8_macros_msa.h" 13 14static void temporal_filter_apply_16size_msa( 15 uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, 16 int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { 17 uint32_t row; 18 v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; 19 v16u8 frame_l, frame_h; 20 v16i8 zero = { 0 }; 21 v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; 22 v8i16 diff0, diff1, cnt0, cnt1; 23 v4i32 const3, const16, filter_wt, strength; 24 v4i32 mod0_w, mod1_w, mod2_w, mod3_w; 25 v4i32 diff0_r, diff0_l, diff1_r, diff1_l; 26 v4i32 frame2_0, frame2_1, frame2_2, frame2_3; 27 v4i32 acc0, acc1, acc2, acc3; 28 29 filter_wt = __msa_fill_w(filter_wt_in); 30 strength = __msa_fill_w(strength_in); 31 const3 = __msa_ldi_w(3); 32 const16 = __msa_ldi_w(16); 33 34 for (row = 8; row--;) { 35 frame1_0_b = LD_SB(frame1_ptr); 36 frame2_0_b = LD_SB(frame2_ptr); 37 frame1_ptr += stride; 38 frame2_ptr += 16; 39 frame1_1_b = LD_SB(frame1_ptr); 40 frame2_1_b = LD_SB(frame2_ptr); 41 LD_SW2(acc, 4, acc0, acc1); 42 LD_SW2(acc + 8, 4, acc2, acc3); 43 LD_SH2(cnt, 8, cnt0, cnt1); 44 ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); 45 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); 46 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 47 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 48 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 49 mod0_w, mod1_w, mod2_w, mod3_w); 50 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, 51 mod1_w, mod2_w, mod3_w); 52 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 53 diff0_r = (mod0_w < const16); 54 diff0_l = (mod1_w < const16); 55 diff1_r = (mod2_w < const16); 56 diff1_l = (mod3_w < const16); 57 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, 58 mod0_w, mod1_w, mod2_w, mod3_w); 59 mod0_w = diff0_r & mod0_w; 60 mod1_w = diff0_l & mod1_w; 61 mod2_w = diff1_r & mod2_w; 62 mod3_w = diff1_l & mod3_w; 63 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, 64 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); 65 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) 66 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 67 ST_SH2(mod0_h, mod1_h, cnt, 8); 68 cnt += 16; 69 ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); 70 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); 71 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); 72 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, 73 mod0_w, mod1_w, mod2_w, mod3_w); 74 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 75 mod2_w, mod3_w); 76 ST_SW2(mod0_w, mod1_w, acc, 4); 77 ST_SW2(mod2_w, mod3_w, acc + 8, 4); 78 acc += 16; 79 LD_SW2(acc, 4, acc0, acc1); 80 LD_SW2(acc + 8, 4, acc2, acc3); 81 LD_SH2(cnt, 8, cnt0, cnt1); 82 ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); 83 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); 84 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 85 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 86 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 87 mod0_w, mod1_w, mod2_w, mod3_w); 88 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, 89 mod1_w, mod2_w, mod3_w); 90 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 91 diff0_r = (mod0_w < const16); 92 diff0_l = (mod1_w < const16); 93 diff1_r = (mod2_w < const16); 94 diff1_l = (mod3_w < const16); 95 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, 96 mod0_w, mod1_w, mod2_w, mod3_w); 97 mod0_w = diff0_r & mod0_w; 98 mod1_w = diff0_l & mod1_w; 99 mod2_w = diff1_r & mod2_w; 100 mod3_w = diff1_l & mod3_w; 101 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, 102 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); 103 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 104 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 105 ST_SH2(mod0_h, mod1_h, cnt, 8); 106 cnt += 16; 107 108 UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); 109 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); 110 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); 111 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, 112 mod0_w, mod1_w, mod2_w, mod3_w); 113 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 114 mod2_w, mod3_w); 115 ST_SW2(mod0_w, mod1_w, acc, 4); 116 ST_SW2(mod2_w, mod3_w, acc + 8, 4); 117 acc += 16; 118 frame1_ptr += stride; 119 frame2_ptr += 16; 120 } 121} 122 123static void temporal_filter_apply_8size_msa( 124 uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, 125 int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { 126 uint32_t row; 127 uint64_t f0, f1, f2, f3, f4, f5, f6, f7; 128 v16i8 frame1 = { 0 }; 129 v16i8 frame2 = { 0 }; 130 v16i8 frame3 = { 0 }; 131 v16i8 frame4 = { 0 }; 132 v16u8 frame_l, frame_h; 133 v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; 134 v8i16 diff0, diff1, cnt0, cnt1; 135 v4i32 const3, const16; 136 v4i32 filter_wt, strength; 137 v4i32 mod0_w, mod1_w, mod2_w, mod3_w; 138 v4i32 diff0_r, diff0_l, diff1_r, diff1_l; 139 v4i32 frame2_0, frame2_1, frame2_2, frame2_3; 140 v4i32 acc0, acc1, acc2, acc3; 141 142 filter_wt = __msa_fill_w(filter_wt_in); 143 strength = __msa_fill_w(strength_in); 144 const3 = __msa_ldi_w(3); 145 const16 = __msa_ldi_w(16); 146 147 for (row = 2; row--;) { 148 LD2(frame1_ptr, stride, f0, f1); 149 frame1_ptr += (2 * stride); 150 LD2(frame2_ptr, 8, f2, f3); 151 frame2_ptr += 16; 152 LD2(frame1_ptr, stride, f4, f5); 153 frame1_ptr += (2 * stride); 154 LD2(frame2_ptr, 8, f6, f7); 155 frame2_ptr += 16; 156 157 LD_SW2(acc, 4, acc0, acc1); 158 LD_SW2(acc + 8, 4, acc2, acc3); 159 LD_SH2(cnt, 8, cnt0, cnt1); 160 INSERT_D2_SB(f0, f1, frame1); 161 INSERT_D2_SB(f2, f3, frame2); 162 INSERT_D2_SB(f4, f5, frame3); 163 INSERT_D2_SB(f6, f7, frame4); 164 ILVRL_B2_UB(frame1, frame2, frame_l, frame_h); 165 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); 166 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 167 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 168 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 169 mod0_w, mod1_w, mod2_w, mod3_w); 170 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, 171 mod1_w, mod2_w, mod3_w); 172 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 173 diff0_r = (mod0_w < const16); 174 diff0_l = (mod1_w < const16); 175 diff1_r = (mod2_w < const16); 176 diff1_l = (mod3_w < const16); 177 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, 178 mod0_w, mod1_w, mod2_w, mod3_w); 179 mod0_w = diff0_r & mod0_w; 180 mod1_w = diff0_l & mod1_w; 181 mod2_w = diff1_r & mod2_w; 182 mod3_w = diff1_l & mod3_w; 183 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, 184 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); 185 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 186 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 187 ST_SH2(mod0_h, mod1_h, cnt, 8); 188 cnt += 16; 189 190 UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h); 191 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); 192 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); 193 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, 194 mod0_w, mod1_w, mod2_w, mod3_w); 195 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 196 mod2_w, mod3_w); 197 ST_SW2(mod0_w, mod1_w, acc, 4); 198 ST_SW2(mod2_w, mod3_w, acc + 8, 4); 199 acc += 16; 200 201 LD_SW2(acc, 4, acc0, acc1); 202 LD_SW2(acc + 8, 4, acc2, acc3); 203 LD_SH2(cnt, 8, cnt0, cnt1); 204 ILVRL_B2_UB(frame3, frame4, frame_l, frame_h); 205 HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); 206 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 207 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 208 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 209 mod0_w, mod1_w, mod2_w, mod3_w); 210 MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, 211 mod1_w, mod2_w, mod3_w); 212 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 213 diff0_r = (mod0_w < const16); 214 diff0_l = (mod1_w < const16); 215 diff1_r = (mod2_w < const16); 216 diff1_l = (mod3_w < const16); 217 SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, 218 mod0_w, mod1_w, mod2_w, mod3_w); 219 mod0_w = diff0_r & mod0_w; 220 mod1_w = diff0_l & mod1_w; 221 mod2_w = diff1_r & mod2_w; 222 mod3_w = diff1_l & mod3_w; 223 MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, 224 filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); 225 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 226 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 227 ST_SH2(mod0_h, mod1_h, cnt, 8); 228 cnt += 16; 229 230 UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h); 231 UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); 232 UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); 233 MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, 234 mod0_w, mod1_w, mod2_w, mod3_w); 235 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 236 mod2_w, mod3_w); 237 ST_SW2(mod0_w, mod1_w, acc, 4); 238 ST_SW2(mod2_w, mod3_w, acc + 8, 4); 239 acc += 16; 240 } 241} 242 243void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride, 244 uint8_t *frame2, uint32_t block_size, 245 int32_t strength, int32_t filter_weight, 246 uint32_t *accumulator, uint16_t *count) { 247 if (8 == block_size) { 248 temporal_filter_apply_8size_msa(frame1, stride, frame2, strength, 249 filter_weight, accumulator, count); 250 } else if (16 == block_size) { 251 temporal_filter_apply_16size_msa(frame1, stride, frame2, strength, 252 filter_weight, accumulator, count); 253 } else { 254 uint32_t i, j, k; 255 int32_t modifier; 256 int32_t byte = 0; 257 const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0; 258 259 for (i = 0, k = 0; i < block_size; ++i) { 260 for (j = 0; j < block_size; ++j, ++k) { 261 int src_byte = frame1[byte]; 262 int pixel_value = *frame2++; 263 264 modifier = src_byte - pixel_value; 265 modifier *= modifier; 266 modifier *= 3; 267 modifier += rounding; 268 modifier >>= strength; 269 270 if (modifier > 16) modifier = 16; 271 272 modifier = 16 - modifier; 273 modifier *= filter_weight; 274 275 count[k] += modifier; 276 accumulator[k] += modifier * pixel_value; 277 278 byte++; 279 } 280 281 byte += stride - block_size; 282 } 283 } 284} 285