/* loopfilter_avx2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

/*
 * Wide ("mb") horizontal edge loop filter applied to 8 columns.
 *
 * s       : pointer to the first row on the q side of the edge (row q0);
 *           rows p7..p0 live at s - 8*p .. s - 1*p and q0..q7 at
 *           s + 0*p .. s + 7*p (8 bytes per row are read/written).
 * p       : row stride in bytes.
 * _blimit : pointer to a single byte, broadcast as the blimit threshold.
 * _limit  : pointer to a single byte, broadcast as the limit threshold.
 * _thresh : pointer to a single byte, broadcast as the hev threshold.
 *
 * Each __m128i register named qXpX holds row pX in its low 64 bits and
 * row qX in its high 64 bits, so one 128-bit op processes both sides of
 * the edge at once.
 */
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
    const unsigned char *_blimit, const unsigned char *_limit,
    const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  /* Broadcast each single-byte threshold to all 16 lanes. */
  const __m128i thresh = _mm_broadcastb_epi8(
      _mm_cvtsi32_si128((int) _thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(
      _mm_cvtsi32_si128((int) _limit[0]));
  const __m128i blimit = _mm_broadcastb_epi8(
      _mm_cvtsi32_si128((int) _blimit[0]));

  /* Load row pX into the low half, row qX into the high half. */
  q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
  /* imm 78 == 0x4E selects dwords (2,3,0,1): swaps the two 64-bit halves,
   * giving the mirrored p|q ordering. */
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  /* Build the filter mask and the high-edge-variance (hev) mask. */
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
        _mm_subs_epu8(q0p0, q1p1));    /* saturating-sub pair == abs diff */
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);   /* all ones */
    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
        _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
        _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
            _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
            _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    /* Fold the q-side half (high 64 bits) into the p-side half so both
     * contribute to the final per-column mask. */
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    /* XOR with 0x80 converts unsigned pixels to signed for the filter. */
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Unpacking against zero places each byte in the high half of a
     * 16-bit lane; an arithmetic shift by 0xB (8 + 3) then yields the
     * sign-extended value of (filter >> 3). */
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(
        _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    /* Build the flat (p3..q3) and flat2 (p7..q7) smoothness masks. */
    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
              _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
              _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5),
              (__m64 *) (s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6),
              (__m64 *) (s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
              _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
              _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7),
              (__m64 *) (s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
              _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
              _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      /* Widen u8 rows to u16 so the running filter sums cannot overflow. */
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
          _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
          _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
          _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
          _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      /* pixelFilter_p now holds the symmetric sum p6..q6 + 8 (rounding),
       * which is why it also feeds res_q below. */
      pixelFilter_p = _mm_add_epi16(eight,
          _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(four,
          _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
          4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0,
              _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0,
              _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      /* From here the p and q running sums diverge: each step drops the
       * outermost opposite-side tap and adds one more copy of p7/q7. */
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
          4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0,
              _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0,
              _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
          4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0,
              _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0,
              _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
          4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
          4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
          4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
          4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
          4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    /* imm 68 == 0x44 selects dwords (0,1,0,1): replicates the low 64-bit
     * mask half into both halves so it applies to p and q rows alike. */
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    /* Blend: where flat, take the 7-tap result; else the original/
     * narrow-filter pixels.  Then blend again with flat2 for the 15-tap
     * result, and store each row back (low half to pX, high half to qX). */
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

/* Per-128-bit-lane shuffle control for _mm256_shuffle_epi8: byte index k
 * followed by 128 (0x80, which writes zero).  Because each source row is
 * broadcast to both 128-bit lanes before the shuffle, this zero-extends
 * all 16 u8 pixels of a row into 16 u16 lanes across the 256-bit register. */
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

/*
 * Wide ("mb") horizontal edge loop filter applied to 16 columns.
 *
 * Same parameters as the 8-column variant, but each row is 16 bytes and
 * is loaded with _mm256_broadcast_pd (the 128-bit row duplicated into
 * both lanes of a __m256i) so the u8->u16 widening shuffle above and the
 * 256-bit arithmetic can process a whole row per instruction.
 */
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
    const unsigned char *_blimit, const unsigned char *_limit,
    const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
      q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
      p256_0, q256_0;

  const __m128i thresh = _mm_broadcastb_epi8(
      _mm_cvtsi32_si128((int) _thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(
      _mm_cvtsi32_si128((int) _limit[0]));
  const __m128i blimit = _mm_broadcastb_epi8(
      _mm_cvtsi32_si128((int) _blimit[0]));

  /* Each 16-byte row is loaded once, duplicated into both 256-bit lanes. */
  p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s - 5 * p)));
  p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s - 4 * p)));
  p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s - 3 * p)));
  p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s - 2 * p)));
  p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s - 1 * p)));
  q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s - 0 * p)));
  q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s + 1 * p)));
  q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s + 2 * p)));
  q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s + 3 * p)));
  q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
      (__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  /* Build the filter mask and the high-edge-variance (hev) mask. */
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
        _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
        _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
        _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
        _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    /* XOR with 0x80 converts unsigned pixels to signed for the filter. */
    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
        flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
        flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3: emulate a signed byte shift with a 16-bit logical
     * shift plus explicit sign-bit repair via te0/t1f masks. */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    /* Build the flat (p3..q3) and flat2 (p7..q7) smoothness masks. */
    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      /* The p4/q4 work below deliberately does NOT feed flat (flat only
       * spans p3..q3); it is folded into flat2 after the p5/q5 pass. */
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
          (__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
          (__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
          (__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
          (__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
          (__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
          (__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
          res_q;

      /* Zero-extend each broadcast row from 16 x u8 to 16 x u16 (see the
       * filt_loopfilter_avx2 table comment). */
      const __m256i filter = _mm256_load_si256(
          (__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
          _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
          _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
          _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
          _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      /* pixelFilter_p now holds the symmetric sum p6..q6 + 8 (rounding),
       * which is why it also feeds res_q for the center taps below. */
      pixelFilter_p = _mm256_add_epi16(eight,
          _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(four,
          _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      /* imm 168 == 0xA8 selects qwords (0,2,2,2): gathers the low qword
       * of each 128-bit lane of the packed result into the low 128 bits,
       * undoing the per-lane split of _mm256_packus_epi16. */
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(p256_7, p256_0)), 4);

      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(q256_7, q256_0)), 4);

      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
              _mm256_add_epi16(p256_3, p256_0)), 3);

      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
              _mm256_add_epi16(q256_3, q256_0)), 3);

      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);

      sum_q7 = _mm256_add_epi16(q256_7, q256_7);

      sum_p3 = _mm256_add_epi16(p256_3, p256_3);

      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      /* From here the p and q running sums diverge: each step drops the
       * outermost opposite-side tap and adds one more copy of p7/q7. */
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(sum_p7, p256_1)), 4);

      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q,
              _mm256_add_epi16(sum_q7, q256_1)), 4);

      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
              _mm256_add_epi16(sum_p3, p256_1)), 3);

      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_q2q1q0,
              _mm256_add_epi16(sum_q3, q256_1)), 3);

      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(sum_p7, p256_2)), 4);

      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q,
              _mm256_add_epi16(sum_q7, q256_2)), 4);

      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_p2p1p0,
              _mm256_add_epi16(sum_p3, p256_2)), 3);

      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixetFilter_q2q1q0,
              _mm256_add_epi16(sum_q3, q256_2)), 3);

      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(sum_p7, p256_3)), 4);

      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q,
              _mm256_add_epi16(sum_q7, q256_3)), 4);

      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(sum_p7, p256_4)), 4);

      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q,
              _mm256_add_epi16(sum_q7, q256_4)), 4);

      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(sum_p7, p256_5)), 4);

      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q,
              _mm256_add_epi16(sum_q7, q256_5)), 4);

      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p,
              _mm256_add_epi16(sum_p7, p256_6)), 4);

      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
              168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q,
              _mm256_add_epi16(sum_q7, q256_6)), 4);

      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
              168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    /* Blend narrow-filter output with 7-tap (flat) and 15-tap (flat2)
     * results, then store each 16-byte row. */
    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *) (s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *) (s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *) (s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *) (s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *) (s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *) (s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *) (s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *) (s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *) (s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *) (s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *) (s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *) (s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *) (s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
  }
}

/*
 * Public entry point: wide horizontal edge filter.
 * count == 1 selects the 8-column variant; any other value the 16-column
 * variant.  Remaining parameters are forwarded unchanged.
 */
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
    const unsigned char *_blimit, const unsigned char *_limit,
    const unsigned char *_thresh, int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
}