;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;-----------------------------------------------------------------------------
;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
;               short *qcoeff_ptr,short *dequant_ptr,
;               short *round_ptr,
;               short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes one block of 16 signed 16-bit coefficients and writes both the
; quantized and the dequantized values. Per coefficient z:
;
;     sz      = z >> 15                       (0 or -1)
;     x       = abs(z) + round                (16-bit wrapping add)
;     y       = (x * quant) >> 16             (signed high word, pmulhw)
;     qcoeff  = (y ^ sz) - sz                 (sign of z restored)
;     dqcoeff = low 16 bits of qcoeff * dequant
;
; Returns (eax): eob = 1 + index of the last non-zero qcoeff, where the index
;     is counted in zz_shuf (zig-zag scan) order; 0 if every qcoeff is zero.
;
; Requirements:
;   - All six pointers are accessed with movdqa at offsets 0 and 16, so each
;     must be 16-byte aligned and reference at least 32 bytes.
;   - Needs SSSE3 (pabsw, pshufb).
;
; Clobbers: rax, rcx, rdx, xmm0-xmm5, flags. rsi/rdi are saved and restored
;     here; the GOT register is handled by GET_GOT/RESTORE_GOT.
;-----------------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_ssse3)
sym(vp8_fast_quantize_b_impl_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rdi, arg(3)                 ;round_ptr
    mov         rsi, arg(4)                 ;quant_ptr

    movdqa      xmm0, [rdx]                 ;z, coefficients 0..7
    movdqa      xmm4, [rdx + 16]            ;z, coefficients 8..15

    movdqa      xmm2, [rdi]                 ;round lo
    movdqa      xmm3, [rdi + 16]            ;round hi

    movdqa      xmm1, xmm0                  ;keep a copy of z for abs()
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pabsw       xmm1, xmm1                  ;x = abs(z)
    pabsw       xmm5, xmm5

    paddw       xmm1, xmm2                  ;x += round
    paddw       xmm5, xmm3

    pmulhw      xmm1, [rsi]                 ;y = (x * quant) >> 16
    pmulhw      xmm5, [rsi + 16]

    ; rdi/rsi are no longer needed as round/quant pointers; reuse them.
    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rsi, arg(5)                 ;dqcoeff_ptr

    pxor        xmm1, xmm0                  ;(y ^ sz) - sz: negate y where
    pxor        xmm5, xmm4                  ;z was negative
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      [rdi], xmm1                 ;store qcoeff
    movdqa      [rdi + 16], xmm5            ;store qcoeff

    movdqa      xmm2, [rcx]                 ;dequant lo
    movdqa      xmm3, [rcx + 16]            ;dequant hi

    pxor        xmm4, xmm4                  ;xmm4 = 0 (sz hi no longer needed)
    pmullw      xmm2, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm3, xmm5

    pcmpeqw     xmm1, xmm4                  ;0xFFFF where qcoeff == 0
    pcmpeqw     xmm5, xmm4                  ;0xFFFF where qcoeff == 0
    packsswb    xmm1, xmm5                  ;16 words -> 16 bytes (0x00/0xFF)
    pshufb      xmm1, [ GLOBAL(zz_shuf)]    ;byte i = flag of coeff zz_shuf[i]

    pmovmskb    edx, xmm1                   ;bit i set => scan position i is 0

    ; Branch-free end-of-block search: invert the mask so set bits mark
    ; non-zero coefficients, then bsr yields the highest such scan position.
    ; bsr's result is not defined for a zero source, so the final value is
    ; masked with edi, which is 0 exactly when no coefficient is non-zero.
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ;flip the low 16 bits for bsr:
                                            ;bit i set => position i non-zero
    bsr         eax, edx                    ;eax = last non-zero position

    movdqa      [rsi], xmm2                 ;store dqcoeff
    movdqa      [rsi + 16], xmm3            ;store dqcoeff

    sub         edi, edx                    ;edi = -mask: negative iff any
                                            ;coefficient is non-zero
    sar         edi, 31                     ;0 or -1
    add         eax, 1                      ;position -> count
    and         eax, edi                    ;if the bit mask was all zero,
                                            ;then eob = 0
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
; pshufb control: output byte i takes input byte zz_shuf[i], i.e. the raster
; index of the coefficient at scan position i.
zz_shuf:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
115