1/****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19*/ 20 21/** 22 ******************************************************************************* 23 * @file 24 * impeg2_inter_pred_sse42_intr.c 25 * 26 * @brief 27 * Contains Motion compensation function definitions for MPEG2 decoder 28 * 29 * @author 30 * Mohit [100664] 31 * 32 * - impeg2_copy_mb_sse42() 33 * - impeg2_interpolate_sse42() 34 * - impeg2_mc_halfx_halfy_8x8_sse42() 35 * - impeg2_mc_halfx_fully_8x8_sse42() 36 * - impeg2_mc_fullx_halfy_8x8_sse42() 37 * - impeg2_mc_fullx_fully_8x8_sse42() 38 * 39 * @remarks 40 * None 41 * 42 ******************************************************************************* 43 */ 44#include <stdio.h> 45#include <string.h> 46#include "iv_datatypedef.h" 47#include "impeg2_macros.h" 48#include "impeg2_defs.h" 49#include "impeg2_inter_pred.h" 50 51#include <immintrin.h> 52#include <emmintrin.h> 53#include <smmintrin.h> 54#include <tmmintrin.h> 55 56/******************************************************************************* 57* Function Name : impeg2_copy_mb 58* 59* Description : copies 3 components to the frame from mc_buf 60* 61* Arguments : 62* src_buf : Source Buffer 63* dst_buf : Destination Buffer 64* src_wd : Source Width 65* dst_wd : destination Width 66* 67* Values Returned : None 68*******************************************************************************/ 69void impeg2_copy_mb_sse42(yuv_buf_t *src_buf, 70 yuv_buf_t *dst_buf, 71 UWORD32 src_wd, 72 UWORD32 dst_wd) 73{ 74 UWORD8 *src; 75 UWORD8 *dst; 76 __m128i src_r0, src_r1, src_r2, src_r3; 77 78 /*******************************************************/ 79 /* copy Y */ 80 /*******************************************************/ 81 src = src_buf->pu1_y; 82 dst = dst_buf->pu1_y; 83 // Row 0-3 84 src_r0 = _mm_loadu_si128((__m128i *) (src)); 85 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 86 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 87 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 88 89 _mm_storeu_si128((__m128i *) dst, src_r0); 90 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 91 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 92 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 93 94 // Row 4-7 95 src += 4 * src_wd; 96 dst += 4 * dst_wd; 97 src_r0 = _mm_loadu_si128((__m128i *) (src)); 98 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 99 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 100 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 101 102 _mm_storeu_si128((__m128i *) dst, src_r0); 103 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 104 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 105 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 106 107 // Row 8-11 108 src += 4 * src_wd; 109 dst += 4 * dst_wd; 110 src_r0 = _mm_loadu_si128((__m128i *) (src)); 111 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 112 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 113 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 114 115 _mm_storeu_si128((__m128i *) dst, src_r0); 116 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 117 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 118 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 119 120 // Row 12-15 121 src += 4 * src_wd; 122 dst += 4 * dst_wd; 123 src_r0 = _mm_loadu_si128((__m128i *) (src)); 124 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 125 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 126 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 127 128 _mm_storeu_si128((__m128i *) dst, src_r0); 129 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 130 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 131 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 132 133 src_wd >>= 1; 134 dst_wd >>= 1; 135 136 /*******************************************************/ 137 /* copy U */ 138 /*******************************************************/ 139 src = src_buf->pu1_u; 140 dst = dst_buf->pu1_u; 141 142 // Row 0-3 143 src_r0 = _mm_loadl_epi64((__m128i *)src); 144 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 145 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 146 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 147 148 _mm_storel_epi64((__m128i *)dst, src_r0); 149 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 150 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 151 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 152 153 // Row 4-7 154 src += 4 * src_wd; 155 dst += 4 * dst_wd; 156 157 src_r0 = _mm_loadl_epi64((__m128i *)src); 158 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 159 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 160 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 161 162 _mm_storel_epi64((__m128i *)dst, src_r0); 163 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 164 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 165 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 166 167 /*******************************************************/ 168 /* copy V */ 169 /*******************************************************/ 170 src = src_buf->pu1_v; 171 dst = dst_buf->pu1_v; 172 // Row 0-3 173 src_r0 = _mm_loadl_epi64((__m128i *)src); 174 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 175 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 176 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 177 178 _mm_storel_epi64((__m128i *)dst, src_r0); 179 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 180 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 181 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 182 183 // Row 4-7 184 src += 4 * src_wd; 185 dst += 4 * dst_wd; 186 187 src_r0 = _mm_loadl_epi64((__m128i *)src); 188 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 189 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 190 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 191 192 _mm_storel_epi64((__m128i *)dst, src_r0); 193 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 194 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 195 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 196} 197 198/*****************************************************************************/ 199/* */ 200/* Function Name : impeg2_interpolate */ 201/* */ 202/* Description : averages the contents of buf_src1 and buf_src2 and stores*/ 203/* result in buf_dst */ 204/* */ 205/* Inputs : buf_src1 - First Source */ 206/* buf_src2 - Second Source */ 207/* */ 208/* Globals : None */ 209/* */ 210/* Processing : Avg the values from two sources and store the result in */ 211/* destination buffer */ 212/* */ 213/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ 214/* */ 215/* Returns : None */ 216/* */ 217/* Issues : Assumes that all 3 buffers are of same size */ 218/* */ 219/*****************************************************************************/ 220void impeg2_interpolate_sse42(yuv_buf_t *buf_src1, 221 yuv_buf_t *buf_src2, 222 yuv_buf_t *buf_dst, 223 UWORD32 stride) 224{ 225 UWORD8 *src1, *src2; 226 UWORD8 *dst; 227 __m128i src1_r0, src1_r1, src1_r2, src1_r3; 228 __m128i src2_r0, src2_r1, src2_r2, src2_r3; 229 230 /*******************************************************/ 231 /* interpolate Y */ 232 /*******************************************************/ 233 src1 = buf_src1->pu1_y; 234 src2 = buf_src2->pu1_y; 235 dst = buf_dst->pu1_y; 236 // Row 0-3 237 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 238 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 239 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 240 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 241 242 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 243 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 244 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 245 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 246 247 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 248 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 249 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 250 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 251 252 _mm_storeu_si128((__m128i *) dst, src1_r0); 253 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 254 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 255 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 256 257 // Row 4-7 258 src1 += 4 * 16; 259 src2 += 4 * 16; 260 dst += 4 * stride; 261 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 262 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 263 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 264 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 265 266 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 267 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 268 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 269 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 270 271 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 272 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 273 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 274 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 275 276 _mm_storeu_si128((__m128i *) dst, src1_r0); 277 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 278 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 279 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 280 281 // Row 8-11 282 src1 += 4 * 16; 283 src2 += 4 * 16; 284 dst += 4 * stride; 285 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 286 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 287 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 288 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 289 290 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 291 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 292 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 293 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 294 295 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 296 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 297 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 298 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 299 300 _mm_storeu_si128((__m128i *) dst, src1_r0); 301 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 302 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 303 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 304 305 // Row 12-15 306 src1 += 4 * 16; 307 src2 += 4 * 16; 308 dst += 4 * stride; 309 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 310 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 311 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 312 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 313 314 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 315 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 316 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 317 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 318 319 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 320 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 321 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 322 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 323 324 _mm_storeu_si128((__m128i *) dst, src1_r0); 325 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 326 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 327 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 328 329 stride >>= 1; 330 331 /*******************************************************/ 332 /* interpolate U */ 333 /*******************************************************/ 334 src1 = buf_src1->pu1_u; 335 src2 = buf_src2->pu1_u; 336 dst = buf_dst->pu1_u; 337 // Row 0-3 338 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 339 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 340 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 341 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 342 343 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 344 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 345 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 346 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 347 348 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 349 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 350 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 351 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 352 353 _mm_storel_epi64((__m128i *) dst, src1_r0); 354 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 355 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 356 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 357 358 // Row 4-7 359 src1 += 4 * 8; 360 src2 += 4 * 8; 361 dst += 4 * stride; 362 363 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 364 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 365 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 366 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 367 368 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 369 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 370 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 371 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 372 373 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 374 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 375 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 376 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 377 378 _mm_storel_epi64((__m128i *) dst, src1_r0); 379 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 380 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 381 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 382 383 /*******************************************************/ 384 /* interpolate V */ 385 /*******************************************************/ 386 src1 = buf_src1->pu1_v; 387 src2 = buf_src2->pu1_v; 388 dst = buf_dst->pu1_v; 389 390 // Row 0-3 391 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 392 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 393 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 394 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 395 396 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 397 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 398 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 399 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 400 401 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 402 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 403 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 404 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 405 406 _mm_storel_epi64((__m128i *) dst, src1_r0); 407 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 408 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 409 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 410 411 // Row 4-7 412 src1 += 4 * 8; 413 src2 += 4 * 8; 414 dst += 4 * stride; 415 416 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 417 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 418 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 419 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 420 421 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 422 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 423 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 424 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 425 426 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 427 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 428 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 429 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 430 431 _mm_storel_epi64((__m128i *) dst, src1_r0); 432 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 433 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 434 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 435} 436 437/*****************************************************************************/ 438/* */ 439/* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */ 440/* */ 441/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ 442/* and the above block of size 8 x 8 will be placed as a */ 443/* block from the current position of out_buf */ 444/* */ 445/* Inputs : ref - Reference frame from which the block will be */ 446/* block will be extracted. */ 447/* ref_wid - WIdth of reference frame */ 448/* out_wid - WIdth of the output frame */ 449/* blk_width - width of the block */ 450/* blk_width - height of the block */ 451/* */ 452/* Globals : None */ 453/* */ 454/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ 455/* the ref frame.Interpolate these four values to get the */ 456/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ 457/* using 9 x 9 block from reference frame */ 458/* */ 459/* Outputs : out - Output containing the extracted block */ 460/* */ 461/* Returns : None */ 462/* */ 463/* Issues : None */ 464/* */ 465/*****************************************************************************/ 466void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out, 467 UWORD8 *ref, 468 UWORD32 ref_wid, 469 UWORD32 out_wid) 470{ 471 UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3; 472 /* P0-P3 are the pixels in the reference frame and Q is the value being */ 473 /* estimated */ 474 /* 475 P0 P1 476 Q 477 P2 P3 478 */ 479 __m128i src_r0, src_r0_1, src_r1, src_r1_1; 480 __m128i tmp0, tmp1; 481 __m128i value_2 = _mm_set1_epi16(2); 482 483 ref_p0 = ref; 484 ref_p1 = ref + 1; 485 ref_p2 = ref + ref_wid; 486 ref_p3 = ref + ref_wid + 1; 487 488 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 489 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 490 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1 491 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 492 493 src_r0 = _mm_cvtepu8_epi16(src_r0); 494 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 495 src_r1 = _mm_cvtepu8_epi16(src_r1); 496 src_r1_1 = _mm_cvtepu8_epi16(src_r1_1); 497 498 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation 499 tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation 500 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation 501 tmp0 = _mm_add_epi16(tmp0, value_2); 502 tmp0 = _mm_srli_epi16(tmp0, 2); 503 tmp0 = _mm_packus_epi16(tmp0, value_2); 504 505 _mm_storel_epi64((__m128i *)out, tmp0); 506 507 //Row 1 508 ref_p2 += ref_wid; 509 ref_p3 += ref_wid; 510 out += out_wid; 511 512 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 2 513 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 514 515 src_r0 = _mm_cvtepu8_epi16(src_r0); 516 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 517 518 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation 519 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation 520 tmp1 = _mm_add_epi16(tmp1, value_2); 521 tmp1 = _mm_srli_epi16(tmp1, 2); 522 tmp1 = _mm_packus_epi16(tmp1, value_2); 523 524 _mm_storel_epi64((__m128i *)out, tmp1); 525 526 //Row 2 527 ref_p2 += ref_wid; 528 ref_p3 += ref_wid; 529 out += out_wid; 530 531 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3 532 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 533 534 src_r0 = _mm_cvtepu8_epi16(src_r0); 535 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 536 537 tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation 538 539 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation 540 tmp0 = _mm_add_epi16(tmp0, value_2); 541 tmp0 = _mm_srli_epi16(tmp0, 2); 542 tmp0 = _mm_packus_epi16(tmp0, value_2); 543 544 _mm_storel_epi64((__m128i *)out, tmp0); 545 546 //Row 3 547 ref_p2 += ref_wid; 548 ref_p3 += ref_wid; 549 out += out_wid; 550 551 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4 552 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 553 554 src_r0 = _mm_cvtepu8_epi16(src_r0); 555 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 556 557 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation 558 559 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation 560 tmp1 = _mm_add_epi16(tmp1, value_2); 561 tmp1 = _mm_srli_epi16(tmp1, 2); 562 tmp1 = _mm_packus_epi16(tmp1, value_2); 563 564 _mm_storel_epi64((__m128i *)out, tmp1); 565 566 //Row 4 567 ref_p2 += ref_wid; 568 ref_p3 += ref_wid; 569 out += out_wid; 570 571 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5 572 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 573 574 src_r0 = _mm_cvtepu8_epi16(src_r0); 575 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 576 577 tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation 578 579 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical interpolation 580 tmp0 = _mm_add_epi16(tmp0, value_2); 581 tmp0 = _mm_srli_epi16(tmp0, 2); 582 tmp0 = _mm_packus_epi16(tmp0, value_2); 583 584 _mm_storel_epi64((__m128i *)out, tmp0); 585 586 //Row 5 587 ref_p2 += ref_wid; 588 ref_p3 += ref_wid; 589 out += out_wid; 590 591 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6 592 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 593 594 src_r0 = _mm_cvtepu8_epi16(src_r0); 595 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 596 597 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation 598 599 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation 600 tmp1 = _mm_add_epi16(tmp1, value_2); 601 tmp1 = _mm_srli_epi16(tmp1, 2); 602 tmp1 = _mm_packus_epi16(tmp1, value_2); 603 604 _mm_storel_epi64((__m128i *)out, tmp1); 605 606 //Row 6 607 ref_p2 += ref_wid; 608 ref_p3 += ref_wid; 609 out += out_wid; 610 611 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7 612 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 613 614 src_r0 = _mm_cvtepu8_epi16(src_r0); 615 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 616 617 tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation 618 619 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation 620 tmp0 = _mm_add_epi16(tmp0, value_2); 621 tmp0 = _mm_srli_epi16(tmp0, 2); 622 tmp0 = _mm_packus_epi16(tmp0, value_2); 623 624 _mm_storel_epi64((__m128i *)out, tmp0); 625 626 //Row 7 627 ref_p2 += ref_wid; 628 ref_p3 += ref_wid; 629 out += out_wid; 630 631 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8 632 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 633 634 src_r0 = _mm_cvtepu8_epi16(src_r0); 635 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 636 637 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation 638 639 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation 640 tmp1 = _mm_add_epi16(tmp1, value_2); 641 tmp1 = _mm_srli_epi16(tmp1, 2); 642 tmp1 = _mm_packus_epi16(tmp1, value_2); 643 644 _mm_storel_epi64((__m128i *)out, tmp1); 645 646 return; 647} 648 649/*****************************************************************************/ 650/* */ 651/* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */ 652/* */ 653/* Description : Gets the buffer from (0.5,0) to (8.5,8) */ 654/* and the above block of size 8 x 8 will be placed as a */ 655/* block from the current position of out_buf */ 656/* */ 657/* Inputs : ref - Reference frame from which the block will be */ 658/* block will be extracted. */ 659/* ref_wid - WIdth of reference frame */ 660/* out_wid - WIdth of the output frame */ 661/* blk_width - width of the block */ 662/* blk_width - height of the block */ 663/* */ 664/* Globals : None */ 665/* */ 666/* Processing : Point to the (0,0) and (1,0) position in the ref frame */ 667/* Interpolate these two values to get the value at(0.5,0) */ 668/* Repeat this to get an 8 x 8 block using 9 x 8 block from */ 669/* reference frame */ 670/* */ 671/* Outputs : out - Output containing the extracted block */ 672/* */ 673/* Returns : None */ 674/* */ 675/* Issues : None */ 676/* */ 677/*****************************************************************************/ 678void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out, 679 UWORD8 *ref, 680 UWORD32 ref_wid, 681 UWORD32 out_wid) 682{ 683 UWORD8 *ref_p0,*ref_p1; 684 __m128i src_r0, src_r0_1, src_r1, src_r1_1; 685 /* P0-P3 are the pixels in the reference frame and Q is the value being */ 686 /* estimated */ 687 /* 688 P0 Q P1 689 */ 690 691 ref_p0 = ref; 692 ref_p1 = ref + 1; 693 694 // Row 0 and 1 695 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 696 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 697 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1 698 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 699 700 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 701 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 702 703 _mm_storel_epi64((__m128i *)out, src_r0); 704 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 705 706 // Row 2 and 3 707 ref_p0 += 2*ref_wid; 708 ref_p1 += 2*ref_wid; 709 out += 2*out_wid; 710 711 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2 712 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 713 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 3 714 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 715 716 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 717 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 718 719 _mm_storel_epi64((__m128i *)out, src_r0); 720 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 721 722 // Row 4 and 5 723 ref_p0 += 2*ref_wid; 724 ref_p1 += 2*ref_wid; 725 out += 2*out_wid; 726 727 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4 728 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 729 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5 730 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 731 732 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 733 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 734 735 _mm_storel_epi64((__m128i *)out, src_r0); 736 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 737 738 // Row 6 and 7 739 ref_p0 += 2*ref_wid; 740 ref_p1 += 2*ref_wid; 741 out += 2*out_wid; 742 743 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6 744 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 745 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7 746 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 747 748 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 749 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 750 751 _mm_storel_epi64((__m128i *)out, src_r0); 752 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 753 754 return; 755} 756 757 758/*****************************************************************************/ 759/* */ 760/* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */ 761/* */ 762/* Description : Gets the buffer from (0,0.5) to (8,8.5) */ 763/* and the above block of size 8 x 8 will be placed as a */ 764/* block from the current position of out_buf */ 765/* */ 766/* Inputs : ref - Reference frame from which the block will be */ 767/* block will be extracted. */ 768/* ref_wid - WIdth of reference frame */ 769/* out_wid - WIdth of the output frame */ 770/* blk_width - width of the block */ 771/* blk_width - height of the block */ 772/* */ 773/* Globals : None */ 774/* */ 775/* Processing : Point to the (0,0) and (0,1) position in the ref frame */ 776/* Interpolate these two values to get the value at(0,0.5) */ 777/* Repeat this to get an 8 x 8 block using 8 x 9 block from */ 778/* reference frame */ 779/* */ 780/* Outputs : out - Output containing the extracted block */ 781/* */ 782/* Returns : None */ 783/* */ 784/* Issues : None */ 785/* */ 786/*****************************************************************************/ 787void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out, 788 UWORD8 *ref, 789 UWORD32 ref_wid, 790 UWORD32 out_wid) 791{ 792 __m128i src_r0, src_r1, src_r2, temp0, temp1; 793 /* P0-P3 are the pixels in the reference frame and Q is the value being */ 794 /* estimated */ 795 /* 796 P0 797 x 798 P1 799 */ 800 src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0 801 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1 802 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2 803 temp0 = _mm_avg_epu8(src_r0, src_r1); 804 temp1 = _mm_avg_epu8(src_r1, src_r2); 805 _mm_storel_epi64((__m128i *)out, temp0); //Row 0 806 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1 807 808 ref+= 3*ref_wid; 809 out+= 2*out_wid; 810 811 src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3 812 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4 813 temp0 = _mm_avg_epu8(src_r2, src_r0); 814 temp1 = _mm_avg_epu8(src_r0, src_r1); 815 _mm_storel_epi64((__m128i *)out, temp0); //Row 2 816 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3 817 818 ref += 2*ref_wid; 819 out+= 2*out_wid; 820 821 src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5 822 src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6 823 temp0 = _mm_avg_epu8(src_r1, src_r2); 824 temp1 = _mm_avg_epu8(src_r2, src_r0); 825 _mm_storel_epi64((__m128i *)out, temp0); //Row 4 826 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5 827 828 ref += 2*ref_wid; 829 out+= 2*out_wid; 830 831 src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7 832 src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8 833 temp0 = _mm_avg_epu8(src_r0, src_r1); 834 temp1 = _mm_avg_epu8(src_r1, src_r2); 835 _mm_storel_epi64((__m128i *)out, temp0); //Row 6 836 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7 837 838 return; 839} 840 841/*****************************************************************************/ 842/* */ 843/* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */ 844/* */ 845/* Description : Gets the buffer from (x,y) to (x+8,y+8) */ 846/* and the above block of size 8 x 8 will be placed as a */ 847/* block from the current position of out_buf */ 848/* */ 849/* Inputs : ref - Reference frame from which the block will be */ 850/* block will be extracted. */ 851/* ref_wid - WIdth of reference frame */ 852/* out_wid - WIdth of the output frame */ 853/* blk_width - width of the block */ 854/* blk_width - height of the block */ 855/* */ 856/* Globals : None */ 857/* */ 858/* Processing : Point to the (0,0) position in the ref frame */ 859/* Get an 8 x 8 block from reference frame */ 860/* */ 861/* Outputs : out - Output containing the extracted block */ 862/* */ 863/* Returns : None */ 864/* */ 865/* Issues : None */ 866/* */ 867/*****************************************************************************/ 868void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out, 869 UWORD8 *ref, 870 UWORD32 ref_wid, 871 UWORD32 out_wid) 872{ 873 __m128i src_r0, src_r1, src_r2, src_r3; 874 // Row 0-3 875 src_r0 = _mm_loadl_epi64((__m128i *)ref); 876 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); 877 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); 878 src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); 879 880 _mm_storel_epi64((__m128i *)out, src_r0); 881 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 882 _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); 883 _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); 884 885 // Row 4-7 886 ref += 4 * ref_wid; 887 out += 4 * out_wid; 888 889 src_r0 = _mm_loadl_epi64((__m128i *)ref); 890 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); 891 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); 892 src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); 893 894 _mm_storel_epi64((__m128i *)out, src_r0); 895 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 896 _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); 897 _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); 898 return; 899} 900