;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

; This file provides SSSE3 versions of the inverse 8x8 transform. Some of
; the functions were originally derived from the ffmpeg project.
; Note that the current version applies to x86 64-bit only.
;
; Syntax: NASM/Yasm, written against the x86inc.asm abstraction layer
; (cglobal, mN register names, SWAP, three-operand instruction forms).

SECTION_RODATA

; 23170 = 2 * 11585. pmulhrsw computes (a * b + 2^14) >> 15, so multiplying
; by this constant yields (a * 11585 + 2^13) >> 14.
pw_11585x2: times 8 dw 23170
; 2^13: rounding term added before the arithmetic shift by 14 in MUL_ADD_2X.
pd_8192:    times 4 dd 8192
; Rounding term added before the arithmetic shift by 5 in ADD_STORE_8P_2X.
pw_16:      times 8 dw 16

; Emits the two interleaved coefficient vectors consumed by BUTTERFLY_4X
; through pmaddwd:
;   pw_<a>_<b>  = ( a, b) repeated four times
;   pw_m<b>_<a> = (-b, a) repeated four times
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
%endmacro

TRANSFORM_COEFFS    6270, 15137
TRANSFORM_COEFFS    3196, 16069
TRANSFORM_COEFFS   13623,  9102

; The PAIR_* macros emit one 8-word vector whose low four words hold the
; first value and whose high four words hold the second value; the PP/MP/MM
; suffix gives the sign (plus/minus) applied to each of the two values.
; Every value instantiated below is twice one of the TRANSFORM_COEFFS
; coefficients, which is the form pmulhrsw needs (see pw_11585x2).
%macro PAIR_PP_COEFFS 2
dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
%endmacro

%macro PAIR_MP_COEFFS 2
dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
%endmacro

%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
%endmacro

; Mixed pairs, used by the first pass of idct8x8_12_add.
PAIR_PP_COEFFS     30274, 12540
PAIR_PP_COEFFS      6392, 32138
PAIR_MP_COEFFS     18204, 27246

; Uniform pairs, used by the second pass of idct8x8_12_add.
PAIR_PP_COEFFS     12540, 12540
PAIR_PP_COEFFS     30274, 30274
PAIR_PP_COEFFS      6392,  6392
PAIR_PP_COEFFS     32138, 32138
PAIR_MM_COEFFS     18204, 18204
PAIR_PP_COEFFS     27246, 27246

SECTION .text

%if ARCH_X86_64
; SUM_SUB a, b, tmp
; On exit register name a holds a + b and register name b holds a - b
; (16-bit lanes, wrapping). The name tmp is clobbered: after the SWAP it
; refers to the physical register that held the original b.
%macro SUM_SUB 3
  psubw           m%3, m%1, m%2
  paddw           m%1, m%2
  SWAP             %2, %3
%endmacro

; butterfly operation
; MUL_ADD_2X dst1, dst2, src, round, coefs1, coefs2
;   dst1 = (pmaddwd(src, coefs1) + round) >> 14
;   dst2 = (pmaddwd(src, coefs2) + round) >> 14
; dst1 is written before src is read for the second product, so dst1 must
; not alias src; dst2 may alias src.
%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
  pmaddwd         m%1, m%3, %5
  pmaddwd         m%2, m%3, %6
  paddd           m%1, %4
  paddd           m%2, %4
  psrad           m%1, 14
  psrad           m%2, 14
%endmacro

; BUTTERFLY_4X dst1, dst2, coef1, coef2, round, tmp1, tmp2
; With a = dst1 and b = dst2 on entry, computes per 16-bit lane:
;   dst1 = (a * coef1 - b * coef2 + round) >> 14
;   dst2 = (a * coef2 + b * coef1 + round) >> 14
; The products are formed in 32 bits and packed back to words with signed
; saturation. round is a full operand (for example m8); tmp1 and tmp2 are
; register numbers and are clobbered. Requires a matching
; TRANSFORM_COEFFS coef1, coef2 instantiation.
%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
  ; high four lanes: interleave (b, a) word pairs and do both dot products
  punpckhwd       m%6, m%2, m%1
  MUL_ADD_2X       %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
  ; low four lanes, same computation, results land in dst1 and dst2
  punpcklwd       m%2, m%1
  MUL_ADD_2X       %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
  ; recombine low and high halves into eight words per output
  packssdw        m%1, m%7
  packssdw        m%2, m%6
%endmacro

; matrix transpose
; INTERLEAVE_2X size, a, b, tmp
; On exit name a holds the low interleave (punpckl<size>) of a and b, and
; name b holds the high interleave (punpckh<size>). tmp is clobbered.
%macro INTERLEAVE_2X 4
  punpckh%1       m%4, m%2, m%3
  punpckl%1       m%2, m%3
  SWAP             %3, %4
%endmacro

; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, tmp
; Transposes an 8x8 matrix of 16-bit elements held one row per register,
; using three interleave stages (words, dwords, qwords) followed by two
; renames that put the rows back in order. tmp is clobbered.
%macro TRANSPOSE8X8 9
  INTERLEAVE_2X    wd,  %1,  %2,  %9
  INTERLEAVE_2X    wd,  %3,  %4,  %9
  INTERLEAVE_2X    wd,  %5,  %6,  %9
  INTERLEAVE_2X    wd,  %7,  %8,  %9

  INTERLEAVE_2X    dq,  %1,  %3,  %9
  INTERLEAVE_2X    dq,  %2,  %4,  %9
  INTERLEAVE_2X    dq,  %5,  %7,  %9
  INTERLEAVE_2X    dq,  %6,  %8,  %9

  INTERLEAVE_2X   qdq,  %1,  %5,  %9
  INTERLEAVE_2X   qdq,  %3,  %7,  %9
  INTERLEAVE_2X   qdq,  %2,  %6,  %9
  INTERLEAVE_2X   qdq,  %4,  %8,  %9

  SWAP             %2,  %5
  SWAP             %4,  %7
%endmacro

; IDCT8_1D
; One 1-D 8-point inverse DCT pass applied lane-wise to m0..m7 (input k in
; mk, output k in mk).
; Expects m8 = pd_8192 and m12 = pw_11585x2. Clobbers m9 and m10.
%macro IDCT8_1D 0
  ; even half: inputs 0/4 (sum, difference, then scale by 11585 / 2^14)
  ; and the rotation of inputs 2/6
  SUM_SUB           0,   4,   9
  BUTTERFLY_4X      2,   6,  6270, 15137,  m8,  9,  10
  pmulhrsw         m0, m12
  pmulhrsw         m4, m12
  ; odd half: rotations of inputs 1/7 and 5/3
  BUTTERFLY_4X      1,   7,  3196, 16069,  m8,  9,  10
  BUTTERFLY_4X      5,   3, 13623,  9102,  m8,  9,  10

  SUM_SUB           1,   5,   9
  SUM_SUB           7,   3,   9
  SUM_SUB           0,   6,   9
  SUM_SUB           4,   2,   9
  SUM_SUB           3,   5,   9
  pmulhrsw         m3, m12
  pmulhrsw         m5, m12

  ; final stage: combine even and odd halves
  SUM_SUB           0,   7,   9
  SUM_SUB           4,   3,   9
  SUM_SUB           2,   5,   9
  SUM_SUB           6,   1,   9

  ; rename so that m0..m7 hold outputs 0..7 in order
  SWAP              3,   6
  SWAP              1,   4
%endmacro

; This macro handles 8 pixels per line
; ADD_STORE_8P_2X src1, src2, tmp1, tmp2, zero
; For two consecutive rows at [outputq] and [outputq + strideq]:
; rounds the residuals (add m11, arithmetic shift right by 5), widens the
; 8 destination bytes to words, adds the residual, packs back to bytes with
; unsigned saturation and stores 8 bytes.
; Implicitly reads m11 (expected to hold pw_16), outputq and strideq.
; The zero argument must name an all-zero register. Modifies src1, src2,
; tmp1 and tmp2.
%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
  paddw           m%1, m11
  paddw           m%2, m11
  psraw           m%1, 5
  psraw           m%2, 5

  movh            m%3, [outputq]
  movh            m%4, [outputq + strideq]
  punpcklbw       m%3, m%5
  punpcklbw       m%4, m%5
  paddw           m%3, m%1
  paddw           m%4, m%2
  packuswb        m%3, m%5
  packuswb        m%4, m%5
  movh               [outputq], m%3
  movh     [outputq + strideq], m%4
%endmacro

INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
;
; Arguments (as named in the cglobal line): input, output, stride.
;   input  - 64 16-bit coefficients, read as eight 16-byte rows with mova,
;            so the buffer must be 16-byte aligned.
;   output - destination pixels, 8 bytes per row; the result is added to
;            the existing contents.
;   stride - byte distance between destination rows.
; Registers: m0..m7 data rows, m8 = pd_8192, m9/m10 scratch, m11 = pw_16,
;            m12 = pw_11585x2 during the transform and zero afterwards,
;            r3 = 2 * stride.
; NOTE(review): strideq is used as a full 64-bit register without sign
; extension. If the C prototype declares stride as a 32-bit int (not
; visible here, TODO confirm), the upper half is not guaranteed by the ABI.
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
  mova     m8, [pd_8192]
  mova    m11, [pw_16]
  mova    m12, [pw_11585x2]

  lea      r3, [2 * strideq]

  mova     m0, [inputq +   0]
  mova     m1, [inputq +  16]
  mova     m2, [inputq +  32]
  mova     m3, [inputq +  48]
  mova     m4, [inputq +  64]
  mova     m5, [inputq +  80]
  mova     m6, [inputq +  96]
  mova     m7, [inputq + 112]

  ; two passes of transpose followed by the 1-D transform
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D

  ; m12 is no longer needed as a multiplier; reuse it as the zero register
  pxor    m12, m12
  ADD_STORE_8P_2X  0, 1, 9, 10, 12
  lea              outputq, [outputq + r3]
  ADD_STORE_8P_2X  2, 3, 9, 10, 12
  lea              outputq, [outputq + r3]
  ADD_STORE_8P_2X  4, 5, 9, 10, 12
  lea              outputq, [outputq + r3]
  ADD_STORE_8P_2X  6, 7, 9, 10, 12

  RET

; inverse 8x8 2D-DCT transform for sparse input
;
; Only the first four coefficients of each of the first four input rows
; (the upper-left 4x4 block) are read; everything else is treated as zero.
; NOTE(review): the original comment said "only first 10 coeffs non-zero"
; while the function name says 12 - confirm the intended limit against the
; callers.
; Arguments, register roles and the stride caveat are the same as for
; idct8x8_64_add.
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
  mova       m8, [pd_8192]
  mova      m11, [pw_16]
  mova      m12, [pw_11585x2]

  lea        r3, [2 * strideq]

  mova       m0, [inputq +  0]
  mova       m1, [inputq + 16]
  mova       m2, [inputq + 32]
  mova       m3, [inputq + 48]

  ; transpose the 4x4 block formed by the low four words of rows 0..3
  punpcklwd  m0, m1
  punpcklwd  m2, m3
  punpckhdq  m9, m0, m2
  punpckldq  m0, m2
  SWAP       2, 9

  ; duplicate each 4-word column into both 64-bit halves of a register
  ; m0 -> [0], [0]
  ; m1 -> [1], [1]
  ; m2 -> [2], [2]
  ; m3 -> [3], [3]
  punpckhqdq m10, m0, m0
  punpcklqdq m0, m0
  punpckhqdq m9, m2, m2
  punpcklqdq m2, m2
  SWAP       1, 10
  SWAP       3, 9

  ; first pass: the mixed dpw_* constants apply a different coefficient to
  ; each 64-bit half, so one pmulhrsw produces two intermediate terms
  pmulhrsw   m0, m12
  pmulhrsw   m2, [dpw_30274_12540]
  pmulhrsw   m1, [dpw_6392_32138]
  pmulhrsw   m3, [dpw_m18204_27246]

  SUM_SUB    0, 2, 9
  SUM_SUB    1, 3, 9

  ; m5 = m3 with its two 64-bit halves exchanged
  punpcklqdq m9, m3, m3
  punpckhqdq m5, m3, m9

  SUM_SUB    3, 5, 9
  punpckhqdq m5, m3
  pmulhrsw   m5, m12

  punpckhqdq m9, m1, m5
  punpcklqdq m1, m5
  SWAP       5, 9

  SUM_SUB    0, 5, 9
  SUM_SUB    2, 1, 9

  ; regroup the packed halves and transpose for the second pass
  punpckhqdq m3, m0, m0
  punpckhqdq m4, m1, m1
  punpckhqdq m6, m5, m5
  punpckhqdq m7, m2, m2

  punpcklwd  m0, m3
  punpcklwd  m7, m2
  punpcklwd  m1, m4
  punpcklwd  m6, m5

  punpckhdq  m4, m0, m7
  punpckldq  m0, m7
  punpckhdq  m10, m1, m6
  punpckldq  m5, m1, m6

  punpckhqdq m1, m0, m5
  punpcklqdq m0, m5
  punpckhqdq m3, m4, m10
  punpcklqdq m2, m4, m10

  ; second pass on full 8-lane registers. This appears to be IDCT8_1D
  ; specialised for inputs 4..7 being zero: each two-input rotation
  ; collapses to a pair of single pmulhrsw multiplies by doubled
  ; coefficients.
  pmulhrsw   m0, m12
  pmulhrsw   m6, m2, [dpw_30274_30274]
  pmulhrsw   m4, m2, [dpw_12540_12540]

  pmulhrsw   m7, m1, [dpw_32138_32138]
  pmulhrsw   m1, [dpw_6392_6392]
  pmulhrsw   m5, m3, [dpw_m18204_m18204]
  pmulhrsw   m3, [dpw_27246_27246]

  mova       m2, m0
  SUM_SUB    0, 6, 9
  SUM_SUB    2, 4, 9
  SUM_SUB    1, 5, 9
  SUM_SUB    7, 3, 9

  SUM_SUB    3, 5, 9
  pmulhrsw   m3, m12
  pmulhrsw   m5, m12

  SUM_SUB    0, 7, 9
  SUM_SUB    2, 3, 9
  SUM_SUB    4, 5, 9
  SUM_SUB    6, 1, 9

  ; rename registers into the row order expected by the store sequence
  SWAP       3, 6
  SWAP       1, 2
  SWAP       2, 4

  ; m12 is no longer needed as a multiplier; reuse it as the zero register
  pxor    m12, m12
  ADD_STORE_8P_2X  0, 1, 9, 10, 12
  lea              outputq, [outputq + r3]
  ADD_STORE_8P_2X  2, 3, 9, 10, 12
  lea              outputq, [outputq + r3]
  ADD_STORE_8P_2X  4, 5, 9, 10, 12
  lea              outputq, [outputq + r3]
  ADD_STORE_8P_2X  6, 7, 9, 10, 12

  RET

%endif