/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>
#include "vpx_dsp/mips/macros_msa.h"

/* Copy an 8-pixel-wide block using MIPS MSA.
 *
 * Rows are loaded as 16-byte vectors (LD_UB*), but only the low 8 bytes of
 * each are extracted with __msa_copy_u_d and stored as 64-bit doublewords
 * (SD/SD4). The unroll factor is chosen by the first divisor of height that
 * matches: 12, 8, 4, then 2. A height not divisible by 2 is silently ignored
 * by this routine -- NOTE(review): presumably callers only pass even heights;
 * verify against vpx_convolve_copy_msa's dispatch.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    /* 12 rows per iteration: an 8-row batch followed by a 4-row batch. */
    for (cnt = (height / 12); cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      /* Extract the low doubleword (8 pixels) of each row vector. */
      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);

      /* Remaining 4 rows of the 12-row group. */
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    /* 8 rows per iteration. */
    for (cnt = height >> 3; cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);

      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);
      out4 = __msa_copy_u_d((v2i64)src4, 0);
      out5 = __msa_copy_u_d((v2i64)src5, 0);
      out6 = __msa_copy_u_d((v2i64)src6, 0);
      out7 = __msa_copy_u_d((v2i64)src7, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
      SD4(out4, out5, out6, out7, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 4) {
    /* 4 rows per iteration. */
    for (cnt = (height / 4); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);
      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);
      out2 = __msa_copy_u_d((v2i64)src2, 0);
      out3 = __msa_copy_u_d((v2i64)src3, 0);

      SD4(out0, out1, out2, out3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 2) {
    /* 2 rows per iteration. */
    for (cnt = (height / 2); cnt--;) {
      LD_UB2(src, src_stride, src0, src1);
      src += (2 * src_stride);
      out0 = __msa_copy_u_d((v2i64)src0, 0);
      out1 = __msa_copy_u_d((v2i64)src1, 0);

      SD(out0, dst);
      dst += dst_stride;
      SD(out1, dst);
      dst += dst_stride;
    }
  }
}

/* Copy a block whose width is a multiple of 16 and height a multiple of 8.
 *
 * The block is processed as 16-byte-wide column tiles (outer loop over
 * width >> 4); each tile is copied 8 full vectors at a time (inner loop over
 * height >> 3). Any width/height remainder is not handled here -- callers are
 * responsible for passing multiples of 16 and 8 respectively.
 */
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width) {
  int32_t cnt, loop_cnt;
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  for (cnt = (width >> 4); cnt--;) {
    /* Walk each 16-byte column tile top to bottom with its own pointers. */
    src_tmp = src;
    dst_tmp = dst;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
      LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
             src7);
      src_tmp += (8 * src_stride);

      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
             dst_stride);
      dst_tmp += (8 * dst_stride);
    }

    /* Advance to the next 16-byte column tile. */
    src += 16;
    dst += 16;
  }
}

/* Copy a 16-pixel-wide block: each row is exactly one 16-byte vector.
 * Dispatches on height divisibility (12, 8, 4); heights divisible by 8 reuse
 * the generic tile copier with width 16.
 */
static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    /* 12 rows per iteration: 8-row batch then 4-row batch. */
    for (cnt = (height / 12); cnt--;) {
      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
      src += (8 * src_stride);
      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
      dst += (8 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
  } else if (0 == height % 4) {
    for (cnt = (height >> 2); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      src += (4 * src_stride);

      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      dst += (4 * dst_stride);
    }
  }
}

/* Copy a 32-pixel-wide block: each row is two 16-byte vectors (offset 0 and
 * offset 16). Dispatch mirrors copy_width16_msa: 12-row unroll, generic tile
 * copier for height % 8 == 0, 4-row loop otherwise.
 */
static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
  int32_t cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

  if (0 == height % 12) {
    /* 12 rows per iteration, done as three 4-row passes over both halves. */
    for (cnt = (height / 12); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);

      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);
    }
  } else if (0 == height % 8) {
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
  } else if (0 == height % 4) {
    for (cnt = (height >> 2); cnt--;) {
      LD_UB4(src, src_stride, src0, src1, src2, src3);
      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
      src += (4 * src_stride);
      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
      dst += (4 * dst_stride);
    }
  }
}

/* Copy a 64-pixel-wide block: pure delegation to the generic 16xN tile
 * copier (four 16-byte tiles per row). Assumes height % 8 == 0 -- TODO
 * confirm callers only request such heights for w == 64.
 */
static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}

/* MSA-accelerated "copy" convolution predictor.
 *
 * The filter arguments exist only to match the vpx convolve function
 * signature; a copy predictor applies no filtering, so they are explicitly
 * ignored. Dispatches on block width w to a specialized copier; w == 4 uses
 * 32-bit word load/store per row, and any other width falls back to a
 * per-row memcpy. h is taken on faith to be positive.
 */
void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int32_t filter_x_stride,
                           const int16_t *filter_y, int32_t filter_y_stride,
                           int32_t w, int32_t h) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;

  switch (w) {
    case 4: {
      uint32_t cnt, tmp;
      /* 1 word (32-bit) load/store per row. */
      for (cnt = h; cnt--;) {
        tmp = LW(src);
        SW(tmp, dst);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 8: {
      copy_width8_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 16: {
      copy_width16_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_width32_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_width64_msa(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      /* Scalar fallback for non-power-of-two / unexpected widths. */
      uint32_t cnt;
      for (cnt = h; cnt--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}