11b362b15af34006e6a11974088a46d42b903418eJohann;
21b362b15af34006e6a11974088a46d42b903418eJohann;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
31b362b15af34006e6a11974088a46d42b903418eJohann;
41b362b15af34006e6a11974088a46d42b903418eJohann;  Use of this source code is governed by a BSD-style license
51b362b15af34006e6a11974088a46d42b903418eJohann;  that can be found in the LICENSE file in the root of the source
61b362b15af34006e6a11974088a46d42b903418eJohann;  tree. An additional intellectual property rights grant can be found
71b362b15af34006e6a11974088a46d42b903418eJohann;  in the file PATENTS.  All contributing project authors may
81b362b15af34006e6a11974088a46d42b903418eJohann;  be found in the AUTHORS file in the root of the source tree.
91b362b15af34006e6a11974088a46d42b903418eJohann;
101b362b15af34006e6a11974088a46d42b903418eJohann
111b362b15af34006e6a11974088a46d42b903418eJohann
121b362b15af34006e6a11974088a46d42b903418eJohann%include "vpx_ports/x86_abi_support.asm"
131b362b15af34006e6a11974088a46d42b903418eJohann
141b362b15af34006e6a11974088a46d42b903418eJohann; /****************************************************************************
151b362b15af34006e6a11974088a46d42b903418eJohann; * Notes:
161b362b15af34006e6a11974088a46d42b903418eJohann; *
171b362b15af34006e6a11974088a46d42b903418eJohann; * This implementation makes use of 16-bit fixed-point versions of two
181b362b15af34006e6a11974088a46d42b903418eJohann; * multiplication constants:
191b362b15af34006e6a11974088a46d42b903418eJohann; *        1.   sqrt(2) * cos (pi/8)
201b362b15af34006e6a11974088a46d42b903418eJohann; *        2.   sqrt(2) * sin (pi/8)
211b362b15af34006e6a11974088a46d42b903418eJohann; * Because the first constant is bigger than 1, to maintain the same 16 bit
221b362b15af34006e6a11974088a46d42b903418eJohann; * fixed point precision as the second one, we use a trick of
231b362b15af34006e6a11974088a46d42b903418eJohann; *        x * a = x + x*(a-1)
241b362b15af34006e6a11974088a46d42b903418eJohann; * so
251b362b15af34006e6a11974088a46d42b903418eJohann; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
261b362b15af34006e6a11974088a46d42b903418eJohann; *
271b362b15af34006e6a11974088a46d42b903418eJohann; * For the second constant, the 16-bit version is 35468, which is bigger
281b362b15af34006e6a11974088a46d42b903418eJohann; * than 32767 (the largest signed 16-bit value), so in a signed 16-bit
291b362b15af34006e6a11974088a46d42b903418eJohann; * multiply it is treated as a negative number. Adding x back corrects this:
301b362b15af34006e6a11974088a46d42b903418eJohann; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
311b362b15af34006e6a11974088a46d42b903418eJohann; *
321b362b15af34006e6a11974088a46d42b903418eJohann; **************************************************************************/
331b362b15af34006e6a11974088a46d42b903418eJohann
341b362b15af34006e6a11974088a46d42b903418eJohann
;-----------------------------------------------------------------------------
; vp8_short_idct4x4llm_mmx
;
; Runs a two-pass 4x4 inverse transform over the sixteen 16-bit coefficients
; read from `input`, adds the result to a 4x4 block of prediction bytes read
; from `pred` (rows `pitch` bytes apart), clamps every sum to 0..255 and
; writes four bytes per row to `dest` (rows `stride` bytes apart).
;
; Register roles after the prolog:
;   rax = input, later reused for pitch      rsi = pred
;   rdx = dest                               rdi = stride
;   mm0-mm7 are all used as scratch.
;
; The active code never writes to `input`; the block that would clear it is
; compiled out with %if 0.
;
; arg(n), GLOBAL(), sym(), SHADOW_ARGS_TO_STACK, GET_GOT, RESTORE_GOT and
; UNSHADOW_ARGS are not defined in this file; presumably they come from the
; included vpx_ports/x86_abi_support.asm -- their expansions are not visible
; here.
; NOTE(review): no emms is executed in this routine -- presumably the caller
; is responsible for leaving MMX state; confirm.
;-----------------------------------------------------------------------------
351b362b15af34006e6a11974088a46d42b903418eJohann;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
361b362b15af34006e6a11974088a46d42b903418eJohann;int pitch, unsigned char *dest,int stride)
371b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_short_idct4x4llm_mmx) PRIVATE
381b362b15af34006e6a11974088a46d42b903418eJohannsym(vp8_short_idct4x4llm_mmx):
391b362b15af34006e6a11974088a46d42b903418eJohann    push        rbp
401b362b15af34006e6a11974088a46d42b903418eJohann    mov         rbp, rsp
411b362b15af34006e6a11974088a46d42b903418eJohann    SHADOW_ARGS_TO_STACK 5
421b362b15af34006e6a11974088a46d42b903418eJohann    GET_GOT     rbx
431b362b15af34006e6a11974088a46d42b903418eJohann    push        rsi                         ; rsi is used below for pred
441b362b15af34006e6a11974088a46d42b903418eJohann    push        rdi                         ; rdi is used below for stride
451b362b15af34006e6a11974088a46d42b903418eJohann    ; end prolog
461b362b15af34006e6a11974088a46d42b903418eJohann
471b362b15af34006e6a11974088a46d42b903418eJohann    mov         rax,    arg(0)              ;input
481b362b15af34006e6a11974088a46d42b903418eJohann    mov         rsi,    arg(1)              ;pred
491b362b15af34006e6a11974088a46d42b903418eJohann
    ; Load all 16 coefficients: four 16-bit values per MMX register.
501b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm0,    [rax   ]            ; ip0 = input shorts  0..3
511b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm1,    [rax+ 8]            ; ip1 = input shorts  4..7
521b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm2,    [rax+16]            ; ip2 = input shorts  8..11
531b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm3,    [rax+24]            ; ip3 = input shorts 12..15
541b362b15af34006e6a11974088a46d42b903418eJohann
    ; Disabled code: would zero the 16 input coefficients after loading them.
551b362b15af34006e6a11974088a46d42b903418eJohann%if 0
561b362b15af34006e6a11974088a46d42b903418eJohann    pxor        mm7,    mm7
571b362b15af34006e6a11974088a46d42b903418eJohann    movq        [rax],   mm7
581b362b15af34006e6a11974088a46d42b903418eJohann    movq        [rax+8], mm7
591b362b15af34006e6a11974088a46d42b903418eJohann    movq        [rax+16],mm7
601b362b15af34006e6a11974088a46d42b903418eJohann    movq        [rax+24],mm7
611b362b15af34006e6a11974088a46d42b903418eJohann%endif
    ; rax is no longer needed as the input pointer; reuse it for pitch.
621b362b15af34006e6a11974088a46d42b903418eJohann    movsxd      rax,    dword ptr arg(2)    ;pitch
631b362b15af34006e6a11974088a46d42b903418eJohann    mov         rdx,    arg(3)              ;dest
641b362b15af34006e6a11974088a46d42b903418eJohann    movsxd      rdi,    dword ptr arg(4)    ;stride
651b362b15af34006e6a11974088a46d42b903418eJohann
661b362b15af34006e6a11974088a46d42b903418eJohann
    ; ---- Pass 1: butterfly across ip0..ip3, four lanes in parallel ----
671b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm0,            mm2             ; b1 = ip0 - ip2
681b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            mm2             ; mm2 = 2 * ip2
691b362b15af34006e6a11974088a46d42b903418eJohann
701b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm5,            mm1
711b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            mm0             ; a1 = ip0 + ip2 (= 2*ip2 + b1)
721b362b15af34006e6a11974088a46d42b903418eJohann
731b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; high word of signed ip1 * constant
741b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm5,            mm1             ; + ip1 => ip1 * sin(pi/8) * sqrt(2)
751b362b15af34006e6a11974088a46d42b903418eJohann
761b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm7,            mm3             ; mm7 = ip3
771b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; ip3 * (sqrt(2)*cos(pi/8) - 1)
781b362b15af34006e6a11974088a46d42b903418eJohann
791b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm7,            mm3             ; + ip3 => ip3 * cos(pi/8) * sqrt(2)
801b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm7,            mm5             ; c1 = (ip3 cos term) - (ip1 sin term)
811b362b15af34006e6a11974088a46d42b903418eJohann
821b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm5,            mm1
831b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm4,            mm3             ; keep ip3 for the add-back below
841b362b15af34006e6a11974088a46d42b903418eJohann
851b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
861b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
871b362b15af34006e6a11974088a46d42b903418eJohann
881b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
891b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
901b362b15af34006e6a11974088a46d42b903418eJohann
911b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm3,            mm5             ; d1
921b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm6,            mm2             ; a1
931b362b15af34006e6a11974088a46d42b903418eJohann
941b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm4,            mm0             ; b1
951b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            mm3             ; a1 + d1 -> row 0
961b362b15af34006e6a11974088a46d42b903418eJohann
    ; Note the register/row pairing: the transpose below consumes mm0 as
    ; row 1 and mm4 as row 2.
971b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm4,            mm7             ; b1 + c1 -> row 2
981b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm0,            mm7             ; b1 - c1 -> row 1
991b362b15af34006e6a11974088a46d42b903418eJohann
1001b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm6,            mm3             ; a1 - d1 -> row 3
1011b362b15af34006e6a11974088a46d42b903418eJohann
    ; ---- Transpose the 4x4 block of words ----
    ; The comments list the four words from high to low; digits are row,column.
1021b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm1,            mm2             ; 03 02 01 00
1031b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm3,            mm4             ; 23 22 21 20
1041b362b15af34006e6a11974088a46d42b903418eJohann
1051b362b15af34006e6a11974088a46d42b903418eJohann    punpcklwd   mm1,            mm0             ; 11 01 10 00
1061b362b15af34006e6a11974088a46d42b903418eJohann    punpckhwd   mm2,            mm0             ; 13 03 12 02
1071b362b15af34006e6a11974088a46d42b903418eJohann
1081b362b15af34006e6a11974088a46d42b903418eJohann    punpcklwd   mm3,            mm6             ; 31 21 30 20
1091b362b15af34006e6a11974088a46d42b903418eJohann    punpckhwd   mm4,            mm6             ; 33 23 32 22
1101b362b15af34006e6a11974088a46d42b903418eJohann
1111b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm0,            mm1             ; 11 01 10 00
1121b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm5,            mm2             ; 13 03 12 02
1131b362b15af34006e6a11974088a46d42b903418eJohann
1141b362b15af34006e6a11974088a46d42b903418eJohann    punpckldq   mm0,            mm3             ; 30 20 10 00
1151b362b15af34006e6a11974088a46d42b903418eJohann    punpckhdq   mm1,            mm3             ; 31 21 11 01
1161b362b15af34006e6a11974088a46d42b903418eJohann
1171b362b15af34006e6a11974088a46d42b903418eJohann    punpckldq   mm2,            mm4             ; 32 22 12 02
1181b362b15af34006e6a11974088a46d42b903418eJohann    punpckhdq   mm5,            mm4             ; 33 23 13 03
1191b362b15af34006e6a11974088a46d42b903418eJohann
1201b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm3,            mm5             ; 33 23 13 03
1211b362b15af34006e6a11974088a46d42b903418eJohann
    ; ---- Pass 2: same butterfly on the transposed data in mm0..mm3 ----
1221b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm0,            mm2             ; b1 = mm0 - mm2
1231b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            mm2             ; mm2 = 2 * (old mm2)
1241b362b15af34006e6a11974088a46d42b903418eJohann
1251b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm5,            mm1
1261b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            mm0             ; a1 = (old mm0) + (old mm2)
1271b362b15af34006e6a11974088a46d42b903418eJohann
1281b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; high word of signed mm1 * constant
1291b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm5,            mm1             ; + mm1 => mm1 * sin(pi/8) * sqrt(2)
1301b362b15af34006e6a11974088a46d42b903418eJohann
1311b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm7,            mm3             ; mm7 = mm3
1321b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; mm3 * (sqrt(2)*cos(pi/8) - 1)
1331b362b15af34006e6a11974088a46d42b903418eJohann
1341b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm7,            mm3             ; + mm3 => mm3 * cos(pi/8) * sqrt(2)
1351b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm7,            mm5             ; c1 = (mm3 cos term) - (mm1 sin term)
1361b362b15af34006e6a11974088a46d42b903418eJohann
1371b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm5,            mm1
1381b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm4,            mm3             ; keep mm3 for the add-back below
1391b362b15af34006e6a11974088a46d42b903418eJohann
1401b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
1411b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm5,            mm1             ; mm1 * cos(pi/8) * sqrt(2)
1421b362b15af34006e6a11974088a46d42b903418eJohann
1431b362b15af34006e6a11974088a46d42b903418eJohann    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
1441b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm3,            mm4             ; mm3 * sin(pi/8) * sqrt(2)
1451b362b15af34006e6a11974088a46d42b903418eJohann
1461b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm3,            mm5             ; d1
    ; Every output below is a1 +/- d1 or b1 +/- c1, so biasing a1 and b1 by 4
    ; adds the rounding term to all four results before the >> 3.
1471b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm0,            [GLOBAL(fours)] ; b1 += 4
1481b362b15af34006e6a11974088a46d42b903418eJohann
1491b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            [GLOBAL(fours)] ; a1 += 4
1501b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm6,            mm2             ; a1
1511b362b15af34006e6a11974088a46d42b903418eJohann
1521b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm4,            mm0             ; b1
1531b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm2,            mm3             ; a1 + d1 -> index 0
1541b362b15af34006e6a11974088a46d42b903418eJohann
1551b362b15af34006e6a11974088a46d42b903418eJohann    paddw       mm4,            mm7             ; b1 + c1 -> index 2
1561b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm0,            mm7             ; b1 - c1 -> index 1
1571b362b15af34006e6a11974088a46d42b903418eJohann
1581b362b15af34006e6a11974088a46d42b903418eJohann    psubw       mm6,            mm3             ; a1 - d1 -> index 3
1591b362b15af34006e6a11974088a46d42b903418eJohann    psraw       mm2,            3               ; arithmetic >> 3 on each word
1601b362b15af34006e6a11974088a46d42b903418eJohann
1611b362b15af34006e6a11974088a46d42b903418eJohann    psraw       mm0,            3
1621b362b15af34006e6a11974088a46d42b903418eJohann    psraw       mm4,            3
1631b362b15af34006e6a11974088a46d42b903418eJohann
1641b362b15af34006e6a11974088a46d42b903418eJohann    psraw       mm6,            3
1651b362b15af34006e6a11974088a46d42b903418eJohann
    ; ---- Transpose again so each register holds the four words of one row ----
1661b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm1,            mm2             ; 03 02 01 00
1671b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm3,            mm4             ; 23 22 21 20
1681b362b15af34006e6a11974088a46d42b903418eJohann
1691b362b15af34006e6a11974088a46d42b903418eJohann    punpcklwd   mm1,            mm0             ; 11 01 10 00
1701b362b15af34006e6a11974088a46d42b903418eJohann    punpckhwd   mm2,            mm0             ; 13 03 12 02
1711b362b15af34006e6a11974088a46d42b903418eJohann
1721b362b15af34006e6a11974088a46d42b903418eJohann    punpcklwd   mm3,            mm6             ; 31 21 30 20
1731b362b15af34006e6a11974088a46d42b903418eJohann    punpckhwd   mm4,            mm6             ; 33 23 32 22
1741b362b15af34006e6a11974088a46d42b903418eJohann
1751b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm0,            mm1             ; 11 01 10 00
1761b362b15af34006e6a11974088a46d42b903418eJohann    movq        mm5,            mm2             ; 13 03 12 02
1771b362b15af34006e6a11974088a46d42b903418eJohann
1781b362b15af34006e6a11974088a46d42b903418eJohann    punpckldq   mm0,            mm3             ; 30 20 10 00
1791b362b15af34006e6a11974088a46d42b903418eJohann    punpckhdq   mm1,            mm3             ; 31 21 11 01
1801b362b15af34006e6a11974088a46d42b903418eJohann
1811b362b15af34006e6a11974088a46d42b903418eJohann    punpckldq   mm2,            mm4             ; 32 22 12 02
1821b362b15af34006e6a11974088a46d42b903418eJohann    punpckhdq   mm5,            mm4             ; 33 23 13 03
1831b362b15af34006e6a11974088a46d42b903418eJohann
    ; ---- Add prediction, clamp to bytes, store: mm0, mm1, mm2, mm5 -> rows 0..3 ----
1841b362b15af34006e6a11974088a46d42b903418eJohann    pxor        mm7,            mm7             ; zero, used for unpack and pack
1851b362b15af34006e6a11974088a46d42b903418eJohann
1861b362b15af34006e6a11974088a46d42b903418eJohann    movd        mm4,            [rsi]           ; 4 prediction bytes, row 0
1871b362b15af34006e6a11974088a46d42b903418eJohann    punpcklbw   mm4,            mm7             ; zero-extend bytes to words
1881b362b15af34006e6a11974088a46d42b903418eJohann    paddsw      mm0,            mm4             ; residual + prediction (signed saturating)
1891b362b15af34006e6a11974088a46d42b903418eJohann    packuswb    mm0,            mm7             ; clamp to 0..255, pack to bytes
1901b362b15af34006e6a11974088a46d42b903418eJohann    movd        [rdx],          mm0             ; store 4 bytes, dest row 0
1911b362b15af34006e6a11974088a46d42b903418eJohann
1921b362b15af34006e6a11974088a46d42b903418eJohann    movd        mm4,            [rsi+rax]       ; prediction row 1
1931b362b15af34006e6a11974088a46d42b903418eJohann    punpcklbw   mm4,            mm7
1941b362b15af34006e6a11974088a46d42b903418eJohann    paddsw      mm1,            mm4
1951b362b15af34006e6a11974088a46d42b903418eJohann    packuswb    mm1,            mm7
1961b362b15af34006e6a11974088a46d42b903418eJohann    movd        [rdx+rdi],      mm1             ; dest row 1
1971b362b15af34006e6a11974088a46d42b903418eJohann
1981b362b15af34006e6a11974088a46d42b903418eJohann    movd        mm4,            [rsi+2*rax]     ; prediction row 2
1991b362b15af34006e6a11974088a46d42b903418eJohann    punpcklbw   mm4,            mm7
2001b362b15af34006e6a11974088a46d42b903418eJohann    paddsw      mm2,            mm4
2011b362b15af34006e6a11974088a46d42b903418eJohann    packuswb    mm2,            mm7
2021b362b15af34006e6a11974088a46d42b903418eJohann    movd        [rdx+rdi*2],    mm2             ; dest row 2
2031b362b15af34006e6a11974088a46d42b903418eJohann
    ; Advance both pointers by one row so that "+ 2 * step" addresses row 3.
2041b362b15af34006e6a11974088a46d42b903418eJohann    add         rdx,            rdi
2051b362b15af34006e6a11974088a46d42b903418eJohann    add         rsi,            rax
2061b362b15af34006e6a11974088a46d42b903418eJohann
2071b362b15af34006e6a11974088a46d42b903418eJohann    movd        mm4,            [rsi+2*rax]     ; prediction row 3
2081b362b15af34006e6a11974088a46d42b903418eJohann    punpcklbw   mm4,            mm7
2091b362b15af34006e6a11974088a46d42b903418eJohann    paddsw      mm5,            mm4
2101b362b15af34006e6a11974088a46d42b903418eJohann    packuswb    mm5,            mm7
2111b362b15af34006e6a11974088a46d42b903418eJohann    movd        [rdx+rdi*2],    mm5             ; dest row 3
2121b362b15af34006e6a11974088a46d42b903418eJohann
2131b362b15af34006e6a11974088a46d42b903418eJohann    ; begin epilog
2141b362b15af34006e6a11974088a46d42b903418eJohann    pop rdi
2151b362b15af34006e6a11974088a46d42b903418eJohann    pop rsi
2161b362b15af34006e6a11974088a46d42b903418eJohann    RESTORE_GOT
2171b362b15af34006e6a11974088a46d42b903418eJohann    UNSHADOW_ARGS
2181b362b15af34006e6a11974088a46d42b903418eJohann    pop         rbp
2191b362b15af34006e6a11974088a46d42b903418eJohann    ret
2201b362b15af34006e6a11974088a46d42b903418eJohann
;-----------------------------------------------------------------------------
; vp8_dc_only_idct_add_mmx
;
; Computes (input_dc + 4) >> 3 once, broadcasts it to four 16-bit lanes, adds
; it to each of the 4x4 prediction bytes read from `pred_ptr` (rows
; `pred_stride` bytes apart), clamps every sum to 0..255 and writes four bytes
; per row to `dst_ptr` (rows `stride` bytes apart).
;
; Register roles:
;   rax = pred_ptr, later dst_ptr        rdx = pred_stride, later stride
;   rcx = 3 * the current stride         mm0 = zero, mm1-mm4 = rows, mm5 = dc
;
; arg(n), GLOBAL(), sym(), SHADOW_ARGS_TO_STACK, GET_GOT, RESTORE_GOT and
; UNSHADOW_ARGS are not defined in this file; presumably they come from the
; included vpx_ports/x86_abi_support.asm -- their expansions are not visible
; here.
; NOTE(review): no emms is executed in this routine -- presumably the caller
; is responsible for leaving MMX state; confirm.
;-----------------------------------------------------------------------------
2211b362b15af34006e6a11974088a46d42b903418eJohann;void vp8_dc_only_idct_add_mmx(
2221b362b15af34006e6a11974088a46d42b903418eJohann;short input_dc,
2231b362b15af34006e6a11974088a46d42b903418eJohann;unsigned char *pred_ptr,
2241b362b15af34006e6a11974088a46d42b903418eJohann;int pred_stride,
2251b362b15af34006e6a11974088a46d42b903418eJohann;unsigned char *dst_ptr,
2261b362b15af34006e6a11974088a46d42b903418eJohann;int stride)
2271b362b15af34006e6a11974088a46d42b903418eJohannglobal sym(vp8_dc_only_idct_add_mmx) PRIVATE
2281b362b15af34006e6a11974088a46d42b903418eJohannsym(vp8_dc_only_idct_add_mmx):
2291b362b15af34006e6a11974088a46d42b903418eJohann    push        rbp
2301b362b15af34006e6a11974088a46d42b903418eJohann    mov         rbp, rsp
2311b362b15af34006e6a11974088a46d42b903418eJohann    SHADOW_ARGS_TO_STACK 5
2321b362b15af34006e6a11974088a46d42b903418eJohann    GET_GOT     rbx
2331b362b15af34006e6a11974088a46d42b903418eJohann    ; end prolog
2341b362b15af34006e6a11974088a46d42b903418eJohann
2351b362b15af34006e6a11974088a46d42b903418eJohann        movd        mm5,            arg(0) ;input_dc
2361b362b15af34006e6a11974088a46d42b903418eJohann        mov         rax,            arg(1) ;pred_ptr
2371b362b15af34006e6a11974088a46d42b903418eJohann        movsxd      rdx,            dword ptr arg(2) ;pred_stride
2381b362b15af34006e6a11974088a46d42b903418eJohann
2391b362b15af34006e6a11974088a46d42b903418eJohann        pxor        mm0,            mm0              ; zero, used for unpack and pack
2401b362b15af34006e6a11974088a46d42b903418eJohann
2411b362b15af34006e6a11974088a46d42b903418eJohann        paddw       mm5,            [GLOBAL(fours)]  ; dc + 4 (rounding bias)
2421b362b15af34006e6a11974088a46d42b903418eJohann        lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * pred_stride
2431b362b15af34006e6a11974088a46d42b903418eJohann
2441b362b15af34006e6a11974088a46d42b903418eJohann        psraw       mm5,            3                ; (dc + 4) >> 3, arithmetic
2451b362b15af34006e6a11974088a46d42b903418eJohann
        ; Broadcast the low word of mm5 to all four word lanes.
2461b362b15af34006e6a11974088a46d42b903418eJohann        punpcklwd   mm5,            mm5
2471b362b15af34006e6a11974088a46d42b903418eJohann
2481b362b15af34006e6a11974088a46d42b903418eJohann        punpckldq   mm5,            mm5
2491b362b15af34006e6a11974088a46d42b903418eJohann
        ; Load the four prediction rows (4 bytes each).
2501b362b15af34006e6a11974088a46d42b903418eJohann        movd        mm1,            [rax]
2511b362b15af34006e6a11974088a46d42b903418eJohann        movd        mm2,            [rax+rdx]
2521b362b15af34006e6a11974088a46d42b903418eJohann        movd        mm3,            [rax+2*rdx]
2531b362b15af34006e6a11974088a46d42b903418eJohann        movd        mm4,            [rax+rcx]
2541b362b15af34006e6a11974088a46d42b903418eJohann
        ; Prediction is fully loaded; rax/rdx now switch to the destination.
2551b362b15af34006e6a11974088a46d42b903418eJohann        mov         rax,            arg(3) ;dst_ptr -- destination
2561b362b15af34006e6a11974088a46d42b903418eJohann        movsxd      rdx,            dword ptr arg(4) ;stride (destination)
2571b362b15af34006e6a11974088a46d42b903418eJohann
2581b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   mm1,            mm0              ; zero-extend bytes to words
2591b362b15af34006e6a11974088a46d42b903418eJohann        paddsw      mm1,            mm5              ; prediction + dc (signed saturating)
2601b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    mm1,            mm0              ; clamp to 0..255, pack to bytes
2611b362b15af34006e6a11974088a46d42b903418eJohann        lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * destination stride
2621b362b15af34006e6a11974088a46d42b903418eJohann
2631b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   mm2,            mm0
2641b362b15af34006e6a11974088a46d42b903418eJohann        paddsw      mm2,            mm5
2651b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    mm2,            mm0              ; clamp to 0..255, pack to bytes
2661b362b15af34006e6a11974088a46d42b903418eJohann
2671b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   mm3,            mm0
2681b362b15af34006e6a11974088a46d42b903418eJohann        paddsw      mm3,            mm5
2691b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    mm3,            mm0              ; clamp to 0..255, pack to bytes
2701b362b15af34006e6a11974088a46d42b903418eJohann
2711b362b15af34006e6a11974088a46d42b903418eJohann        punpcklbw   mm4,            mm0
2721b362b15af34006e6a11974088a46d42b903418eJohann        paddsw      mm4,            mm5
2731b362b15af34006e6a11974088a46d42b903418eJohann        packuswb    mm4,            mm0              ; clamp to 0..255, pack to bytes
2741b362b15af34006e6a11974088a46d42b903418eJohann
        ; Store the four reconstructed rows (4 bytes each).
2751b362b15af34006e6a11974088a46d42b903418eJohann        movd        [rax],          mm1
2761b362b15af34006e6a11974088a46d42b903418eJohann        movd        [rax+rdx],      mm2
2771b362b15af34006e6a11974088a46d42b903418eJohann        movd        [rax+2*rdx],    mm3
2781b362b15af34006e6a11974088a46d42b903418eJohann        movd        [rax+rcx],      mm4
2791b362b15af34006e6a11974088a46d42b903418eJohann
2801b362b15af34006e6a11974088a46d42b903418eJohann    ; begin epilog
2811b362b15af34006e6a11974088a46d42b903418eJohann    RESTORE_GOT
2821b362b15af34006e6a11974088a46d42b903418eJohann    UNSHADOW_ARGS
2831b362b15af34006e6a11974088a46d42b903418eJohann    pop         rbp
2841b362b15af34006e6a11974088a46d42b903418eJohann    ret
2851b362b15af34006e6a11974088a46d42b903418eJohann
; Read-only constants used by the routines in this file. Each label holds the
; same 16-bit value repeated four times, i.e. one 64-bit MMX operand.
2861b362b15af34006e6a11974088a46d42b903418eJohannSECTION_RODATA
2871b362b15af34006e6a11974088a46d42b903418eJohannalign 16
; 0x8A8C = 35468, the 16-bit fixed-point form of sqrt(2) * sin(pi/8).
; As a signed word this is negative, so users add x back after pmulhw.
2881b362b15af34006e6a11974088a46d42b903418eJohannx_s1sqr2:
2891b362b15af34006e6a11974088a46d42b903418eJohann    times 4 dw 0x8A8C
2901b362b15af34006e6a11974088a46d42b903418eJohannalign 16
; 0x4E7B = 20091, the 16-bit fixed-point form of sqrt(2) * cos(pi/8) - 1.
; Users add x back after pmulhw to get x * sqrt(2) * cos(pi/8).
2911b362b15af34006e6a11974088a46d42b903418eJohannx_c1sqr2less1:
2921b362b15af34006e6a11974088a46d42b903418eJohann    times 4 dw 0x4E7B
2931b362b15af34006e6a11974088a46d42b903418eJohannalign 16
; Rounding bias added before the arithmetic shift right by 3.
2941b362b15af34006e6a11974088a46d42b903418eJohannfours:
2951b362b15af34006e6a11974088a46d42b903418eJohann    times 4 dw 0x0004
296