/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  /* SSE2 */
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
                                       (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
                                       (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
                                       (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
                                       (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
                                       (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                            _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
                            _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
                            _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
                                     _mm_subs_epu8(q1p1, q2p2)),
                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
                                     _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
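    /* This block is the standard 4-tap filter, done in the signed domain:
     * XOR with 0x80 maps [0, 255] to [-128, 127] so saturating signed adds
     * implement the spec's clamping.  In scalar terms, roughly:
     *   filt = hev ? clamp(ps1 - qs1) : 0;
     *   filt = clamp(filt + 3 * (qs0 - ps0)) & mask;
     *   Filter1 = clamp(filt + 4) >> 3;   qs0' = qs0 - Filter1;
     *   Filter2 = clamp(filt + 3) >> 3;   ps0' = ps0 + Filter2;
     *   filt = (Filter1 + 1) >> 1;        p1/q1 adjusted only where !hev.
     * Each register holds the p row in its low 8 bytes and the mirrored q
     * row in its high 8 bytes, so one pass updates both sides of the edge.
     * The unpack-with-zero / arithmetic-shift-by-11 pairs below are a
     * byte-wise signed >> 3: each byte sits in the high half of a 16-bit
     * lane, so >> 11 is >> 3 with sign extension. */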
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
                                       _mm_subs_epu8(q0p0, q2p2)),
                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
                                       _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
                                           (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
                                           (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
                                        _mm_subs_epu8(q0p0, q4p4)),
                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
                                        _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
                                           (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
                                       _mm_subs_epu8(q0p0, q6p6)),
                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
                                       _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
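
      /* Running-sum scheme: pixelFilter_p will hold
       *   8 + (p6 + p5 + p4 + p3 + p2 + p1 + p0) + (q0 + ... + q6),
       * so the first wide output is just (sum + p7 + p0) >> 4 on the p side
       * and (sum + q7 + q0) >> 4 on the q side; every later tap subtracts
       * the sample leaving the 15-wide window and adds the one entering it.
       * pixetFilter_p2p1p0/q2q1q0 do the same for the 7-tap filter, with a
       * rounding bias of 4 and a shift of 3. */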

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
                                                         pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(four,
                                         _mm_add_epi16(pixetFilter_p2p1p0,
                                                       pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
                                           _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
                                           _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
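
    /* Remaining rows: q2p2/q1p1/q0p0 were already blended with the 7-tap
     * output under `flat` above; here the 15-tap output overrides wherever
     * `flat2` is also set.  Each row is written back with one 64-bit store
     * per side of the edge. */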

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);

  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);

  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

  _mm_store_si128((__m128i *)ap[4], p4);
  _mm_store_si128((__m128i *)ap[3], p3);
  _mm_store_si128((__m128i *)ap[2], p2);
  _mm_store_si128((__m128i *)ap[1], p1);
  _mm_store_si128((__m128i *)ap[0], p0);
  _mm_store_si128((__m128i *)aq[4], q4);
  _mm_store_si128((__m128i *)aq[3], q3);
  _mm_store_si128((__m128i *)aq[2], q2);
  _mm_store_si128((__m128i *)aq[1], q1);
  _mm_store_si128((__m128i *)aq[0], q0);

  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
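    /* SSE2 has no unsigned byte compare, so a > b is computed as a
     * saturating subtract followed by a compare against zero: (a - b)
     * saturates to 0 iff a <= b, and the XOR with all-ones inverts the
     * resulting equality mask. */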
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                       _mm_subs_epu8(p0, p2)),
                          _mm_or_si128(_mm_subs_epu8(q2, q0),
                                       _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                       _mm_subs_epu8(p0, p3)),
                          _mm_or_si128(_mm_subs_epu8(q3, q0),
                                       _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                       _mm_subs_epu8(p0, p4)),
                          _mm_or_si128(_mm_subs_epu8(q4, q0),
                                       _mm_subs_epu8(q0, q4)));
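
      /* `flat` ends up 0xff only where |p1..p3 - p0| and |q1..q3 - q0| are
       * all <= 1, enabling the 7-tap filter; the p4/q4 `work` value just
       * computed feeds the wide-filter mask `flat2` below instead. */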
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
                                        _mm_subs_epu8(p0, p5)),
                           _mm_or_si128(_mm_subs_epu8(q5, q0),
                                        _mm_subs_epu8(q0, q5)));
      _mm_store_si128((__m128i *)ap[5], p5);
      _mm_store_si128((__m128i *)aq[5], q5);
      flat2 = _mm_max_epu8(work, flat2);
      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
                                       _mm_subs_epu8(p0, p6)),
                          _mm_or_si128(_mm_subs_epu8(q6, q0),
                                       _mm_subs_epu8(q0, q6)));
      _mm_store_si128((__m128i *)ap[6], p6);
      _mm_store_si128((__m128i *)aq[6], q6);
      flat2 = _mm_max_epu8(work, flat2);

      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
                                       _mm_subs_epu8(p0, p7)),
                          _mm_or_si128(_mm_subs_epu8(q7, q0),
                                       _mm_subs_epu8(q0, q7)));
      _mm_store_si128((__m128i *)ap[7], p7);
      _mm_store_si128((__m128i *)aq[7], q7);
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i temp_flat2 = flat2;
      unsigned char *src = s;
      int i = 0;
      do {
        __m128i workp_shft;
        __m128i a, b, c;

        unsigned int off = i * 8;
        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);

        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));

        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
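        /* At this point a + c == p7*7 + p6*2 + p5 + p4 + p3 + p2 + p1 + p0
         * + q0 + 8, so the shift by 4 yields op6, the outermost tap of the
         * 15-tap wide filter. */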
        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q1, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q2, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q3, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));

        c = _mm_add_epi16(q4, c);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
                                          b));
        a = _mm_add_epi16(q5, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q6, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
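        /* Past the edge the recurrence mirrors: each step re-adds the
         * replicated outer tap q7, drops one more p sample and pulls in the
         * next q sample; this store and the ones below produce oq3..oq6. */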
        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
        src += 8;
      } while (++i < 2);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    work_a = _mm_load_si128((__m128i *)ap[2]);
    p2 = _mm_load_si128((__m128i *)flat_op[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_store_si128((__m128i *)flat_op[2], p2);

    p1 = _mm_load_si128((__m128i *)flat_op[1]);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_store_si128((__m128i *)flat_op[1], p1);

    p0 = _mm_load_si128((__m128i *)flat_op[0]);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_store_si128((__m128i *)flat_op[0], p0);

    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_store_si128((__m128i *)flat_oq[0], q0);

    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_store_si128((__m128i *)flat_oq[1], q1);

    work_a = _mm_load_si128((__m128i *)aq[2]);
    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_store_si128((__m128i *)flat_oq[2], q2);

    // write out op6 - op3
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)ap[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }

    work_a = _mm_load_si128((__m128i *)flat_op[2]);
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    work_a = _mm_load_si128((__m128i *)flat_op[1]);
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    work_a = _mm_load_si128((__m128i *)flat_op[0]);
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq7
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)aq[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}

void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh,
                                       int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
}

void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh,
                                            int count) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  __m128i mask, hev, flat;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit = _limit[0] * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  (void)count;
  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
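    /* 0xfe clears the low bit of each byte so the 16-bit halving shift on
     * abs(p1 - q1) cannot leak a bit across byte lanes; `one` is the
     * flatness threshold. */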
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
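      /* The 7-tap window now slides onto the q side; the store below is
       *   oq0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3,
       * with oq1 and oq2 following by the same drop-oldest/add-newest
       * update. */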
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}

static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh,
                                          int count) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  (void)count;
  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
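  /* Columns 0..7 (right of the edge) land in rows 8..15 of t_dst, so the
   * vertical edge becomes a horizontal one at t_dst + 8 * 16. */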
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, 1);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 16;

  src[0] = s - 8;
  src[1] = s - 8 + 8;

  /* Transpose 16x16 */
  transpose(src, p, dst, 16, 2);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh, 1);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;

  dst[0] = s - 8;
  dst[1] = s - 8 + 8;

  transpose(src, 16, dst, p, 2);
}
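
/* Illustrative usage sketch (an assumption for documentation purposes, not
 * part of the library; these entry points are normally selected through the
 * library's runtime CPU dispatch).  The threshold values below are arbitrary
 * examples; in real use they are derived from the filter level.  Each
 * pointer refers to a single threshold byte, as the functions expect.
 * Guarded out so it does not affect the build. */
#if 0
static void example_filter_edges(unsigned char *pixels, int stride) {
  /* Example thresholds only. */
  static const unsigned char blimit = 58, limit = 14, thresh = 4;

  /* Horizontal edge between the row above `pixels` and the row at `pixels`,
   * 16 pixels wide (count == 2 selects the 16-wide path). */
  vp9_mb_lpf_horizontal_edge_w_sse2(pixels, stride, &blimit, &limit,
                                    &thresh, 2);

  /* Vertical edge at the column of `pixels`: the code transposes 16 columns
   * into t_dst, filters them as a horizontal edge, and transposes back. */
  vp9_mb_lpf_vertical_edge_w_sse2(pixels, stride, &blimit, &limit, &thresh);
}
#endif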