;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

; Fixed-point precision of the transform coefficients: every constant below
; is a trigonometric value scaled by 2^14, and every 32-bit product is
; rounded and shifted back down by this many bits.
%define FDCT8_SHIFT 14

SECTION_RODATA

; 2 * 11585, for use with pmulhrsw: (x * 23170 + 2^14) >> 15 is the same as
; (x * 11585 + 2^13) >> 14, i.e. a rounded multiply by ~cos(pi / 4).
pw_11585x2: times 8 dw 2 * 11585
; Rounding term added to the pmaddwd results before the shift by FDCT8_SHIFT.
pd_8192:    times 4 dd 1 << (FDCT8_SHIFT - 1)

; Emits the two interleaved word tables needed for one rotation:
;   pw_<a>_<b>  = { a,  b } x 4
;   pw_<b>_m<a> = { b, -a } x 4
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   times 4 dw %1,  %2
pw_%2_m%1:  times 4 dw %2, -%1
%endmacro

TRANSFORM_COEFFS 11585, 11585   ; ~cos(pi / 4) * 2^14 (both entries)
TRANSFORM_COEFFS 15137,  6270   ; ~cos(pi / 8),      ~sin(pi / 8)
TRANSFORM_COEFFS 16069,  3196   ; ~cos(pi / 16),     ~sin(pi / 16)
TRANSFORM_COEFFS  9102, 13623   ; ~sin(3 * pi / 16), ~cos(3 * pi / 16)

SECTION .text

;-----------------------------------------------------------------------------
; Local helper macros. All register arguments are bare x86inc register
; numbers (N means mN).
;-----------------------------------------------------------------------------

; FDCT8_LOAD_PAIR a, b
;   Loads two consecutive rows: ma from [inputq], mb from [inputq + r3].
%macro FDCT8_LOAD_PAIR 2
    mova            m%1, [inputq]
    mova            m%2, [inputq + r3]
%endmacro

; FDCT8_SUM_DIFF sum, a, b
;   msum = ma + mb;  ma = ma - mb   (16-bit lanes, mb unchanged)
%macro FDCT8_SUM_DIFF 3
    paddw           m%1, m%2, m%3
    psubw           m%2, m%3
%endmacro

; FDCT8_ROTATE out0, out1, a, b, c0, c1
;   Per 16-bit lane, with rounding constant in m8:
;     mout0 = sat16((a * c0 + b * c1 + 2^13) >> 14)
;     mout1 = sat16((a * c1 - b * c0 + 2^13) >> 14)
;   ma and mb are clobbered. Requires tables pw_<c0>_<c1> and pw_<c1>_m<c0>.
%macro FDCT8_ROTATE 6
    punpcklwd       m%2, m%3, m%4
    punpckhwd       m%3, m%4
    pmaddwd         m%1, m%2, [GLOBAL(pw_%5_%6)]
    pmaddwd         m%2, [GLOBAL(pw_%6_m%5)]
    pmaddwd         m%4, m%3, [GLOBAL(pw_%5_%6)]
    pmaddwd         m%3, [GLOBAL(pw_%6_m%5)]
    paddd           m%1, m8
    paddd           m%2, m8
    paddd           m%4, m8
    paddd           m%3, m8
    psrad           m%1, FDCT8_SHIFT
    psrad           m%2, FDCT8_SHIFT
    psrad           m%4, FDCT8_SHIFT
    psrad           m%3, FDCT8_SHIFT
    packssdw        m%1, m%4
    packssdw        m%2, m%3
%endmacro

; FDCT8_UNPACK size, lo, a, b      (size is wd, dq or qdq)
;   mlo = unpack-low(ma, mb);  ma = unpack-high(ma, mb)
%macro FDCT8_UNPACK 4
    punpckl%1       m%2, m%3, m%4
    punpckh%1       m%3, m%4
%endmacro

; FDCT8_HALVE x, tmp
;   mx = (mx - (mx >> 15)) >> 1 per 16-bit lane, i.e. divide by two rounding
;   toward zero. mtmp is left holding the sign mask of the original mx.
%macro FDCT8_HALVE 2
    psraw           m%2, m%1, 15
    psubw           m%1, m%2
    psraw           m%1, 1
%endmacro

;-----------------------------------------------------------------------------
; fdct8x8(input, output, stride) -- 8x8 forward DCT, SSSE3, x86-64 only.
;
; In:    inputq  = first of 8 rows of eight 16-bit values; row i is read from
;                  byte offset i * 2 * stride (so stride presumably counts
;                  16-bit elements -- confirm against the C prototype).
;        outputq = destination; 8 rows of eight 16-bit coefficients are
;                  written contiguously (128 bytes).
;        strideq = row pitch, see above.
; Uses:  r3, r4, m0-m12. inputq is advanced while loading.
; Notes: all loads and stores use mova (the aligned move), so the input rows
;        and the output buffer are expected to be 16-byte aligned.
;        NOTE(review): strideq is consumed as a full 64-bit register without
;        sign extension; if the C side declares stride as a 32-bit int,
;        confirm the upper half is well defined on all targeted ABIs.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride

    ; Constants kept live for the whole transform.
    mova            m8,  [GLOBAL(pd_8192)]      ; rounding term for >> 14
    mova            m12, [GLOBAL(pw_11585x2)]   ; pmulhrsw factor, ~cos(pi / 4)

    ; r3 = byte distance of one row, r4 = byte distance of two rows.
    lea             r3, [2 * strideq]
    lea             r4, [4 * strideq]

    ; Load rows 0..7 into m0..m7, two rows per step.
    FDCT8_LOAD_PAIR 0, 1
    lea             inputq, [inputq + r4]
    FDCT8_LOAD_PAIR 2, 3
    lea             inputq, [inputq + r4]
    FDCT8_LOAD_PAIR 4, 5
    lea             inputq, [inputq + r4]
    FDCT8_LOAD_PAIR 6, 7

    ; left shift by 2 to increase forward transformation precision
    psllw           m0, 2
    psllw           m1, 2
    psllw           m2, 2
    psllw           m3, 2
    psllw           m4, 2
    psllw           m5, 2
    psllw           m6, 2
    psllw           m7, 2

    ;-------------------------------------------------------------------------
    ; column transform
    ;-------------------------------------------------------------------------
    ; stage 1
    FDCT8_SUM_DIFF  10, 0, 7
    FDCT8_SUM_DIFF  9,  1, 6
    FDCT8_SUM_DIFF  7,  2, 5
    FDCT8_SUM_DIFF  6,  3, 4

    ; stage 2
    FDCT8_SUM_DIFF  5, 9,  7
    FDCT8_SUM_DIFF  4, 10, 6
    FDCT8_SUM_DIFF  7, 1,  2

    ; stage 3
    FDCT8_SUM_DIFF  6, 4, 5

    pmulhrsw        m1, m12
    pmulhrsw        m7, m12

    ; sin(pi / 8), cos(pi / 8)
    FDCT8_ROTATE    5, 2, 10, 9, 15137, 6270

    pmulhrsw        m6, m12
    pmulhrsw        m4, m12

    FDCT8_SUM_DIFF  9,  3, 1
    FDCT8_SUM_DIFF  10, 0, 7

    ; stage 4
    ; sin(pi / 16), cos(pi / 16)
    FDCT8_ROTATE    7, 1, 10, 9, 16069, 3196

    ; sin(3 * pi / 16), cos(3 * pi / 16)
    FDCT8_ROTATE    9, 11, 0, 3, 9102, 13623

    ;-------------------------------------------------------------------------
    ; transpose
    ;-------------------------------------------------------------------------
    ; stage 1: interleave 16-bit words
    FDCT8_UNPACK    wd, 0,  6, 7
    FDCT8_UNPACK    wd, 3,  5, 11
    FDCT8_UNPACK    wd, 7,  4, 9
    FDCT8_UNPACK    wd, 10, 2, 1

    ; stage 2: interleave 32-bit dwords
    FDCT8_UNPACK    dq, 9, 0, 3
    FDCT8_UNPACK    dq, 1, 6, 5
    FDCT8_UNPACK    dq, 3, 7, 10
    FDCT8_UNPACK    dq, 5, 4, 2

    ; stage 3: interleave 64-bit qwords
    FDCT8_UNPACK    qdq, 10, 9, 3
    FDCT8_UNPACK    qdq, 2,  0, 7
    FDCT8_UNPACK    qdq, 3,  1, 5
    FDCT8_UNPACK    qdq, 7,  6, 4

    ;-------------------------------------------------------------------------
    ; row transform
    ;-------------------------------------------------------------------------
    ; stage 1
    FDCT8_SUM_DIFF  5, 10, 6
    FDCT8_SUM_DIFF  4, 9,  7
    FDCT8_SUM_DIFF  6, 2,  1
    FDCT8_SUM_DIFF  7, 0,  3

    ; stage 2
    FDCT8_SUM_DIFF  1, 5, 7
    FDCT8_SUM_DIFF  3, 4, 6
    FDCT8_SUM_DIFF  7, 9, 2

    ; stage 3
    ; Here the cos(pi / 4) scaling of the sum/difference pair goes through the
    ; pmaddwd path instead of pmulhrsw.
    FDCT8_ROTATE    2, 6, 1, 3, 11585, 11585

    pmulhrsw        m7, m12
    pmulhrsw        m9, m12

    ; sin(pi / 8), cos(pi / 8)
    FDCT8_ROTATE    1, 3, 5, 4, 15137, 6270

    FDCT8_SUM_DIFF  4, 0,  9
    FDCT8_SUM_DIFF  5, 10, 7

    ; stage 4
    ; sin(pi / 16), cos(pi / 16)
    FDCT8_ROTATE    7, 9, 5, 4, 16069, 3196

    ; sin(3 * pi / 16), cos(3 * pi / 16)
    FDCT8_ROTATE    5, 4, 10, 0, 9102, 13623

    ;-------------------------------------------------------------------------
    ; transpose
    ;-------------------------------------------------------------------------
    ; stage 1: interleave 16-bit words
    FDCT8_UNPACK    wd, 0,  2, 7
    FDCT8_UNPACK    wd, 10, 1, 4
    FDCT8_UNPACK    wd, 7,  6, 5
    FDCT8_UNPACK    wd, 4,  3, 9

    ; stage 2: interleave 32-bit dwords
    FDCT8_UNPACK    dq, 5,  0, 10
    FDCT8_UNPACK    dq, 9,  2, 1
    FDCT8_UNPACK    dq, 10, 7, 4
    FDCT8_UNPACK    dq, 1,  6, 3

    ; stage 3: interleave 64-bit qwords
    FDCT8_UNPACK    qdq, 4,  5, 10
    FDCT8_UNPACK    qdq, 3,  0, 7
    FDCT8_UNPACK    qdq, 10, 9, 1
    FDCT8_UNPACK    qdq, 7,  2, 6

    ;-------------------------------------------------------------------------
    ; Final scaling: divide every coefficient by 2, rounding toward zero.
    ; m8 (the rounding constant) is no longer needed and is reused as scratch.
    ;-------------------------------------------------------------------------
    FDCT8_HALVE     4,  1
    FDCT8_HALVE     5,  6
    FDCT8_HALVE     3,  8
    FDCT8_HALVE     0,  11
    FDCT8_HALVE     10, 1
    FDCT8_HALVE     9,  6
    FDCT8_HALVE     7,  8
    FDCT8_HALVE     2,  11

    ; Store the eight result rows back to back (16 bytes each).
    mova            [outputq + 16 * 0], m4
    mova            [outputq + 16 * 1], m5
    mova            [outputq + 16 * 2], m3
    mova            [outputq + 16 * 3], m0
    mova            [outputq + 16 * 4], m10
    mova            [outputq + 16 * 5], m9
    mova            [outputq + 16 * 6], m7
    mova            [outputq + 16 * 7], m2

    RET
%endif