16fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;
26fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
36fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;
46fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;  Use of this source code is governed by a BSD-style license
56fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;  that can be found in the LICENSE file in the root of the source
66fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;  tree. An additional intellectual property rights grant can be found
76fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;  in the file PATENTS.  All contributing project authors may
86fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;  be found in the AUTHORS file in the root of the source tree.
96fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;
106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%include "vpx_ports/x86_abi_support.asm"
136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;void vp9_post_proc_down_and_across_xmm
156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;(
166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    unsigned char *src_ptr,
176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    unsigned char *dst_ptr,
186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    int src_pixels_per_line,
196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    int dst_pixels_per_line,
206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    int rows,
216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    int cols,
226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;    int flimit
236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;)
;-----------------------------------------------------------------------
; vp9_post_proc_down_and_across_xmm
;
; For every row (rows are counted in rcx), 8 pixels per iteration until the
; column offset reaches cols:
;   1. "down" pass:   out = (4*r0 + r-2 + r-1 + r1 + r2 + RD42) >> 3, read
;                     from src_ptr rows -2..+2 and written to dst_ptr.
;   2. "across" pass: the same 5-tap filter applied horizontally, in place,
;                     on the destination row that was just written.
; In both passes a pixel keeps its unfiltered value when the absolute
; difference to any of its four neighbours is greater than flimit.
;
; Register roles: rsi = src, rdi = dst, rax = source pitch, rcx = rows left,
;                 rdx = column offset, xmm0 = 0, xmm2 = flimit in all 8 words.
; Both loops test their counter at the bottom, so at least one row and one
; group of 8 columns is always processed. The code reads two rows above and
; below the current source row, 8 bytes before the start of each destination
; row, and up to 2 bytes past each group of 8 destination pixels.
; NOTE(review): the across pass uses the MMX register mm0 and no emms is
; executed here - presumably the caller resets the x87/MMX state; confirm.
;-----------------------------------------------------------------------
24afc4a270e3f2ecbbb47aad63c6a6a77ca902d30efgalligan@chromium.orgglobal sym(vp9_post_proc_down_and_across_xmm) PRIVATE
256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.orgsym(vp9_post_proc_down_and_across_xmm):
266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rbp
276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         rbp, rsp
286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    SHADOW_ARGS_TO_STACK 7
296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    SAVE_XMM 7
306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    GET_GOT     rbx
316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rsi
326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rdi
336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; end prolog
346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%if ABI_IS_32BIT=1 && CONFIG_PIC=1
366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ALIGN_STACK 16, rax
376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; copy the global rd42 constant onto the stack, since we don't have enough
386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; registers to do PIC addressing inside the loops
396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    movdqa      xmm0, [GLOBAL(rd42)]
406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    sub         rsp, 16
416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    movdqa      [rsp], xmm0
426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%define RD42 [rsp]
436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%else
446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%define RD42 [GLOBAL(rd42)]
456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%endif
466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movd        xmm2,       dword ptr arg(6) ; xmm2 = flimit (low dword)
496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklwd   xmm2,       xmm2             ; replicate the low word of flimit ...
506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpckldq   xmm2,       xmm2             ; ... step by step ...
516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklqdq  xmm2,       xmm2             ; ... into all 8 words of xmm2
526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        mov         rsi,        arg(0) ; rsi = src_ptr
546fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        mov         rdi,        arg(1) ; rdi = dst_ptr
556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movsxd      rcx,        DWORD PTR arg(4) ; rcx = rows (row counter)
576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movsxd      rax,        DWORD PTR arg(2) ; rax = src_pixels_per_line (source pitch)
586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pxor        xmm0,       xmm0              ; xmm0 = 0, used to unpack bytes to words
596fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.nextrow:
616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        xor         rdx,        rdx       ; rdx = column offset within the row
636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.nextcol:
646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
656fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm1,       xmm3                    ; xmm1 = r0, kept as the unfiltered source
676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psllw       xmm3,       2                       ; xmm3 = 4 * r0
686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
716fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm5                    ; xmm3 += r1
726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against r1
746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm7,       xmm1                    ; xmm7 = r0
756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm7,       xmm5                    ; xmm7 = r0 - r1 (saturates at 0)
766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm5,       xmm1                    ; xmm5 = r1 - r0 (saturates at 0)
776fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask of words with abs diff > flimit
796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm5                    ; xmm3 += r2
836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against r2
856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm6,       xmm1                    ; xmm6 = r0
866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm6,       xmm5                    ; xmm6 = r0 - r2 (saturates at 0)
876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm5,       xmm1                    ; xmm5 = r2 - r0 (saturates at 0)
886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm6,       xmm2                    ; xmm6 = mask of words with abs diff > flimit
906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm7,       xmm6                    ; accumulate thresholds
916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        neg         rax                                 ; rax = -pitch, to address the rows above
946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm5                    ; xmm3 += r-2
976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against r-2
996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm6,       xmm1                    ; xmm6 = r0
1006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm6,       xmm5                    ; xmm6 = r0 - r-2 (saturates at 0)
1016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm5,       xmm1                    ; xmm5 = r-2 - r0 (saturates at 0)
1026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
1036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm6,       xmm2                    ; xmm6 = mask of words with abs diff > flimit
1046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm7,       xmm6                    ; accumulate thresholds
1056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
1076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
1086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm4                    ; xmm3 += r-1
1096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against r-1
1116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm6,       xmm1                    ; xmm6 = r0
1126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm6,       xmm4                    ; xmm6 = r0 - r-1 (saturates at 0)
1136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm4,       xmm1                    ; xmm4 = r-1 - r0 (saturates at 0)
1146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
1156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm6,       xmm2                    ; xmm6 = mask of words with abs diff > flimit
1166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm7,       xmm6                    ; accumulate thresholds
1176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       RD42                    ; xmm3 += round value
1206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psraw       xmm3,       3                       ; xmm3 /= 8
1216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pand        xmm1,       xmm7                    ; xmm1 = source where a difference exceeded flimit
1236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pandn       xmm7,       xmm3                    ; xmm7 = blurred result everywhere else
1246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm1,       xmm7                    ; combination
1256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        packuswb    xmm1,       xmm0                    ; pack to bytes
1276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        QWORD PTR [rdi], xmm1             ; store 8 down-filtered pixels to dst
1286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        neg         rax                   ; undo the negation above: rax = +pitch again
1306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rsi,        8
1316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rdi,        8
1326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rdx,        8
1346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        cmp         edx,        dword arg(5) ;cols
1356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        jl          .nextcol
1376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; done with all the cols, start the across filtering in place
1396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        sub         rsi,        rdx       ; rewind src to the start of the row
1406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        sub         rdi,        rdx       ; rewind dst to the start of the row
1416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        xor         rdx,        rdx       ; rdx = column offset for the across pass
1436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        mm0,        QWORD PTR [rdi-8]; mm0 = 8 bytes before the row; the first delayed store writes them back unchanged
1446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.acrossnextcol:
1466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        xmm7,       QWORD PTR [rdi +rdx -2] ; xmm7 = p-2..p5 (bytes)
1476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movd        xmm4,       DWORD PTR [rdi +rdx +6] ; xmm4 = p6..p9 (bytes)
1486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pslldq      xmm4,       8                 ; move p6..p9 up to bytes 8..11
1506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm4,       xmm7              ; xmm4 = p-2..p9 (bytes)
1516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm3,       xmm4
1536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psrldq      xmm3,       2                 ; drop p-2, p-1
1546fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
1556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept as the unfiltered source
1566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
1576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1596fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm5,       xmm4
1606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psrldq      xmm5,       3
1616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8 (words)
1626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
1636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against the +1 neighbour
1656fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
1666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm7,       xmm5              ; xmm7 = p0..p7 - p1..p8 (saturates at 0)
1676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm5,       xmm1              ; xmm5 = p1..p8 - p0..p7 (saturates at 0)
1686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
1696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm7,       xmm2              ; xmm7 = mask of words with abs diff > flimit
1706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1716fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm5,       xmm4
1726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psrldq      xmm5,       4
1736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9 (words)
1746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
1756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against the +2 neighbour
1776fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
1786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm6,       xmm5              ; xmm6 = p0..p7 - p2..p9 (saturates at 0)
1796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm5,       xmm1              ; xmm5 = p2..p9 - p0..p7 (saturates at 0)
1806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
1816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm6,       xmm2              ; xmm6 = mask of words with abs diff > flimit
1826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm7,       xmm6              ; accumulate thresholds
1836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
1866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5 (words)
1876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
1886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against the -2 neighbour
1906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
1916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm6,       xmm5              ; xmm6 = p0..p7 - p-2..p5 (saturates at 0)
1926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm5,       xmm1              ; xmm5 = p-2..p5 - p0..p7 (saturates at 0)
1936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
1946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm6,       xmm2              ; xmm6 = mask of words with abs diff > flimit
1956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm7,       xmm6              ; accumulate thresholds
1966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psrldq      xmm4,       1                   ; xmm4 = p-1..p9 (bytes)
1986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6 (words)
1996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
2006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; thresholding against the -1 neighbour
2026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
2036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm6,       xmm4              ; xmm6 = p0..p7 - p-1..p6 (saturates at 0)
2046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psubusw     xmm4,       xmm1              ; xmm4 = p-1..p6 - p0..p7 (saturates at 0)
2056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
2066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pcmpgtw     xmm6,       xmm2              ; xmm6 = mask of words with abs diff > flimit
2076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        por         xmm7,       xmm6              ; accumulate thresholds
2086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm3,       RD42              ; xmm3 += round value
2106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        psraw       xmm3,       3                 ; xmm3 /= 8
2116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pand        xmm1,       xmm7              ; xmm1 = source where a difference exceeded flimit
2136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        pandn       xmm7,       xmm3              ; xmm7 = blurred result everywhere else
2146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        paddusw     xmm1,       xmm7              ; combination
2156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        packuswb    xmm1,       xmm0              ; pack to bytes
2176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous group of 8 bytes only now, after this group's loads read its unfiltered tail
2186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movdq2q     mm0,        xmm1              ; hold this group's result until the next iteration
2196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rdx,        8
2216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        cmp         edx,        dword arg(5) ;cols
2226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        jl          .acrossnextcol;
2236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; last 8 pixels
2256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movq        QWORD PTR [rdi+rdx-8],  mm0
2266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ; done with this row
2286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rsi,rax               ; next source line
2296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movsxd      rax, dword arg(3) ; rax = dst_pixels_per_line, sign-extended like the loads before .nextrow
2306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rdi,rax               ; next destination line
2316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movsxd      rax, dword arg(2) ; rax = src_pixels_per_line again, sign-extended for the next row
2326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        dec         rcx                   ; decrement count
2346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        jnz         .nextrow              ; next row
2356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%if ABI_IS_32BIT=1 && CONFIG_PIC=1
2376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    add rsp,16                            ; release the stack copy of rd42
2386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rsp                               ; presumably restores the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
2396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%endif
2406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; begin epilog
2416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rdi
2426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rsi
2436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    RESTORE_GOT
2446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    RESTORE_XMM
2456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    UNSHADOW_ARGS
2466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop         rbp
2476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ret
2486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%undef RD42
2496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
2526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;                            int pitch, int rows, int cols,int flimit)
2536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.orgextern sym(vp9_rv)
254afc4a270e3f2ecbbb47aad63c6a6a77ca902d30efgalligan@chromium.orgglobal sym(vp9_mbpost_proc_down_xmm) PRIVATE
2556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.orgsym(vp9_mbpost_proc_down_xmm):
2566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rbp
2576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         rbp, rsp
2586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    SHADOW_ARGS_TO_STACK 5
2596fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    SAVE_XMM 7
2606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    GET_GOT     rbx
2616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rsi
2626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rdi
2636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; end prolog
2646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2656fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ALIGN_STACK 16, rax
2666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    sub         rsp, 128+16
2676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; unsigned char d[16][8] at [rsp]
2696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; create flimit2 at [rsp+128]
2706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         eax, dword ptr arg(4) ;flimit
2716fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+128], eax
2726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+128+4], eax
2736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+128+8], eax
2746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+128+12], eax
2756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%define flimit4 [rsp+128]
2766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2776fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%if ABI_IS_32BIT=0
2786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    lea         r8,       [GLOBAL(sym(vp9_rv))]
2796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%endif
2806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ;rows +=8;
2826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    add         dword arg(2), 8
2836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ;for(c=0; c<cols; c+=8)
2856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.loop_col:
2866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            mov         rsi,        arg(0) ; s
2876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        xmm0,       xmm0        ;
2886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movsxd      rax,        dword ptr arg(1) ;pitch       ;
2906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            neg         rax                                     ; rax = -pitch
2916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
2936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            neg         rax
2946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        xmm5,       xmm5
2976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        xmm6,       xmm6        ;
2986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
2996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        xmm7,       xmm7        ;
3006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            mov         rdi,        rsi
3016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            mov         rcx,        15          ;
3036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.loop_initvar:
3056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        xmm1,       QWORD PTR [rdi];
3066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm1,       xmm0        ;
3076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddw       xmm5,       xmm1        ;
3096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmullw      xmm1,       xmm1        ;
3106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm2,       xmm1        ;
3126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm1,       xmm0        ;
3136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpckhwd   xmm2,       xmm0        ;
3156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm6,       xmm1        ;
3166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm7,       xmm2        ;
3186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            lea         rdi,        [rdi+rax]   ;
3196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            dec         rcx
3216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            jne         .loop_initvar
3226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            ;save the var and sum
3236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            xor         rdx,        rdx
3246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.loop_row:
3256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
3266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
3276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm1,       xmm0
3296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm2,       xmm0
3306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddw       xmm5,       xmm2
3326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubw       xmm5,       xmm1
3336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmullw      xmm2,       xmm2
3356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm4,       xmm2
3366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm2,       xmm0
3386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpckhwd   xmm4,       xmm0
3396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm6,       xmm2
3416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm7,       xmm4
3426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmullw      xmm1,       xmm1
3446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm2,       xmm1
3456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm1,       xmm0
3476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm6,       xmm1
3486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpckhwd   xmm2,       xmm0
3506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm7,       xmm2
3516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm3,       xmm6
3546fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pslld       xmm3,       4
3556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm3,       xmm6
3576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm1,       xmm5
3586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3596fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm4,       xmm5
3606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmullw      xmm1,       xmm1
3616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmulhw      xmm4,       xmm4
3636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm2,       xmm1
3646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3656fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm1,       xmm4
3666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpckhwd   xmm2,       xmm4
3676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm4,       xmm7
3696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pslld       xmm4,       4
3706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3716fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm4,       xmm7
3726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm3,       xmm1
3746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm4,       xmm2
3756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm3,       flimit4
3776fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm4,       flimit4
3786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrad       xmm3,       31
3806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrad       xmm4,       31
3816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packssdw    xmm3,       xmm4
3836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packsswb    xmm3,       xmm0
3846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        xmm1,       QWORD PTR [rsi+rax*8]
3866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        xmm2,       xmm1
3886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm1,       xmm0
3896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddw       xmm1,       xmm5
3916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            mov         rcx,        rdx
3926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
3936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            and         rcx,        127
3946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%if ABI_IS_32BIT=1 && CONFIG_PIC=1
3956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            push        rax
3966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            lea         rax,        [GLOBAL(sym(vp9_rv))]
3976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
3986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pop         rax
3996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%elif ABI_IS_32BIT=0
4006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
4016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%else
4026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
4036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%endif
4046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddw       xmm1,       xmm4
4066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            ;paddw     xmm1,       eight8s
4076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psraw       xmm1,       4
4086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packuswb    xmm1,       xmm0
4106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pand        xmm1,       xmm3
4116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pandn       xmm3,       xmm2
4136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            por         xmm1,       xmm3
4146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            and         rcx,        15
4166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
4176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            mov         rcx,        rdx
4196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            sub         rcx,        8
4206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            and         rcx,        15
4226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
4236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        [rsi],      mm0
4256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            lea         rsi,        [rsi+rax]
4266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            lea         rdi,        [rdi+rax]
4286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            add         rdx,        1
4296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            cmp         edx,        dword arg(2) ;rows
4316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            jl          .loop_row
4326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         dword arg(0), 8 ; s += 8
4346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        sub         dword arg(3), 8 ; cols -= 8
4356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        cmp         dword arg(3), 0
4366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        jg          .loop_col
4376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    add         rsp, 128+16
4396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop         rsp
4406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; begin epilog
4426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rdi
4436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rsi
4446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    RESTORE_GOT
4456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    RESTORE_XMM
4466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    UNSHADOW_ARGS
4476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop         rbp
4486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ret
4496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%undef flimit4
4506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
4536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;                                int pitch, int rows, int cols,int flimit)
;
; In-place horizontal smoothing filter, processed one row at a time.
;
; For each row a running sum and running sum of squares over a 15-pixel
; window are kept. The window starts as s[-8..6] and, for column c, covers
; s[c-7..c+7]. Four columns are handled per pass of .nextcol4. A pixel is
; replaced by (s[c] + sum + four8s) >> 4 when
;       15*sumsq - sum*sum < flimit
; and is left unchanged otherwise.
;
;   arg(0) src    - row pointer; the argument slot is advanced by pitch
;                   after every row
;   arg(1) pitch  - distance between rows in bytes (sign-extended)
;   arg(2) rows   - row counter; the argument slot is decremented in place.
;                   The row body runs once before the counter is tested.
;   arg(3) cols   - number of columns; the column loop runs while
;                   rcx < cols + 8 in steps of 4
;   arg(4) flimit - threshold, replicated into four dwords at [rsp]
;
; Memory touched outside [src, src+cols): the window reads start at src[-8]
; and run past src[cols]; the first two stores of each row write the zeroed
; mm0 to src[-8..-1].
;
; Register roles inside the row loop:
;   rsi        current row pointer s
;   rdi        index i of the initial window loop
;   rcx, rdx   sum / sumsq while priming the window, afterwards column
;              index c / loop bound cols + 8
;   xmm0       all zero (unpack and pack helper)
;   xmm6, xmm7 running sum / running sum of squares, one dword lane per column
;   xmm5       select mask, then the four output bytes
;   mm0, mm1   two-stage delay line for the output dwords
;
; four8s is defined elsewhere in the file; presumably four dwords of 8 used
; as the rounding term - TODO confirm.
454afc4a270e3f2ecbbb47aad63c6a6a77ca902d30efgalligan@chromium.orgglobal sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
4556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.orgsym(vp9_mbpost_proc_across_ip_xmm):
4566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rbp
4576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         rbp, rsp
    ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, arg(), sym() and
    ; GLOBAL() come from the included x86_abi_support.asm; their expansions
    ; are not visible in this file.
4586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    SHADOW_ARGS_TO_STACK 5
4596fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    SAVE_XMM 7
4606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    GET_GOT     rbx
4616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rsi
4626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    push        rdi
4636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; end prolog
4646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
    ; Reserve one 16-byte slot on the stack. It is later used as a 128-bit
    ; memory operand of psubd, which is why the stack is aligned first.
4656fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ALIGN_STACK 16, rax
4666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    sub         rsp, 16
4676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; create flimit4 at [rsp]: flimit copied into all four dword lanes
4696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         eax, dword ptr arg(4) ;flimit
4706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp], eax
4716fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+4], eax
4726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+8], eax
4736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mov         [rsp+12], eax
4746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%define flimit4 [rsp]
4756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4776fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ;for(r=0;r<rows;r++)
    ; (the test is at the bottom, so the body runs at least once)
4786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.ip_row_loop:
4796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        xor         rdx,    rdx ;sumsq=0;
4816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        xor         rcx,    rcx ;sum=0;
4826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        mov         rsi,    arg(0); s
4836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        mov         rdi,    -8 ; i = -8
4846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.ip_var_loop:
        ; Prime the window with the 15 pixels s[-8..6] using scalar code.
4856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ;for(i=-8;i<=6;i++)
4866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ;{
4876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ;    sumsq += s[i]*s[i];
4886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ;    sum   += s[i];
4896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ;}
4906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movzx       eax, byte [rsi+rdi] ; eax = s[i], zero-extended
4916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         ecx, eax ; sum += s[i]
4926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        mul         al ; ax = al*al; bits 16..31 of eax are still zero from movzx
4936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         edx, eax ; sumsq += s[i]*s[i]
4946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add         rdi, 1
4956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        cmp         rdi, 6
4966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        jle         .ip_var_loop
4976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
4986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
            ; Move the scalar totals into lane 0 of the vector accumulators;
            ; movd clears lanes 1..3.
4996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            ;mov         rax,    sumsq
5006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            ;movd        xmm7,   rax
5016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movd        xmm7,   edx ; xmm7 lane 0 = sumsq
5026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            ;mov         rax,    sum
5046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            ;movd        xmm6,   rax
5056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movd        xmm6,   ecx ; xmm6 lane 0 = sum
5066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            mov         rsi,    arg(0) ;s
5086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            xor         rcx,    rcx ; column index c = 0
5096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movsxd      rdx,    dword arg(3) ;cols
5116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            add         rdx,    8 ; loop bound cols + 8: the passes with c >= cols flush the delayed stores
5126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        mm0,    mm0 ; delay line starts out as zero
5136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        mm1,    mm1
5146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pxor        xmm0,   xmm0 ; xmm0 = 0 for the unpack/pack steps
5166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org.nextcol4:
5176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
            ; One pass handles columns c..c+3 (c = rcx). Notation, k = 0..3:
            ;   old_k = s[c-8+k]   pixel leaving the window of column c+k
            ;   new_k = s[c+7+k]   pixel entering it
            ;   d_k   = new_k - old_k
            ;   q_k   = new_k^2 - old_k^2
5186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
5196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
5206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm1,   xmm0                    ; expanding bytes to words
5226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm2,   xmm0                    ; expanding bytes to words
5236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
5256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
5266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm2,   xmm1                    ; xmm2 = d_k  (7--8   8--7   9--6 10--5)
5286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm1,   xmm1                    ; xmm1 = 2*old_k
5296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm1,   xmm2                    ; xmm1 = new_k + old_k
5316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmaddwd     xmm1,   xmm2                    ; xmm1 = (new_k + old_k)*(new_k - old_k) = q_k
5326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm6,   xmm2 ; lane 0: sum   += d_0
5346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm7,   xmm1 ; lane 0: sumsq += q_0
5356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 (totals of column c) to all lanes
5376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 (totals of column c) to all lanes
5386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
            ; Build per-column totals as prefix sums: lane k must receive
            ; d_1..d_k (and q_1..q_k) on top of the column-c totals.
5396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrldq      xmm1,       4                   ; q_1  q_2  q_3  0
5406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrldq      xmm2,       4                   ; d_1  d_2  d_3  0
5416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm3,   xmm1,   3               ; 0  q_1  q_1  q_1
5436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm4,   xmm2,   3               ; 0  d_1  d_1  d_1
5446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm6,   xmm4
5466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm7,   xmm3
5476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm3,   xmm1,   01011111b       ; 0  0  q_2  q_2
5496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm4,   xmm2,   01011111b       ; 0  0  d_2  d_2
5506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm7,   xmm3
5526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm6,   xmm4
5536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5546fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm3,   xmm1,   10111111b       ; 0  0  0  q_3
5556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pshufd      xmm4,   xmm2,   10111111b       ; 0  0  0  d_3
5566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm7,   xmm3
5586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm6,   xmm4
5596fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
            ; xmm6 / xmm7 lane k now hold sum / sumsq for column c+k.
            ; Mask = all ones where 15*sumsq - sum*sum < flimit.
5606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm3,   xmm6
5616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pmaddwd     xmm3,   xmm3 ; sum*sum per lane (assumes each sum fits in the low word)
5626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdqa      xmm5,   xmm7
5646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pslld       xmm5,   4 ; 16*sumsq
5656fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm5,   xmm7 ; 15*sumsq
5676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm5,   xmm3 ; 15*sumsq - sum*sum
5686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psubd       xmm5,   flimit4
5706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrad       xmm5,   31 ; replicate the sign bit: -1 where the difference is negative, else 0
5716fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packssdw    xmm5,   xmm0 ; narrow the mask dwords -> words
5736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packsswb    xmm5,   xmm0 ; words -> bytes: four mask bytes in the low dword
5746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movd        xmm1,   DWORD PTR [rsi+rcx] ; the four pixels s[c..c+3]
5766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        xmm2,   xmm1 ; keep an unfiltered copy for the blend below
5776fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklbw   xmm1,   xmm0
5796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            punpcklwd   xmm1,   xmm0 ; pixels widened to dwords
5806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm1,   xmm6 ; pixel + sum
5826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            paddd       xmm1,   [GLOBAL(four8s)] ; + four8s (defined elsewhere in the file)
5836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrad       xmm1,   4 ; >> 4
5856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packssdw    xmm1,   xmm0
5866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            packuswb    xmm1,   xmm0 ; back to four bytes, unsigned saturation
5886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pand        xmm1,   xmm5 ; filtered value where the mask is set
5896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            pandn       xmm5,   xmm2 ; original pixel where the mask is clear
5916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            por         xmm5,   xmm1 ; xmm5 low dword = output for columns c..c+3
5926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
            ; Output is written two passes late: the dword stored here was
            ; computed for columns c-8..c-5. mm0/mm1 form the delay line, so
            ; pixels are overwritten only after the window has read them.
5936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movd        [rsi+rcx-8],  mm0
5946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movq        mm0,    mm1
5956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            movdq2q     mm1,    xmm5
5976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrldq      xmm7,   12 ; lane 3 (column c+3 totals) -> lane 0, other lanes zeroed
5986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
5996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            psrldq      xmm6,   12 ; same for the running sum
6006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            add         rcx,    4 ; c += 4
6016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
6026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            cmp         rcx,    rdx
6036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org            jl          .nextcol4 ; while (c < cols + 8)
6046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
6056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        ;s+=pitch;
6066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        movsxd rax, dword arg(1)
6076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        add    arg(0), rax
6086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
6096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        sub dword arg(2), 1 ;rows-=1
6106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        cmp dword arg(2), 0
6116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org        jg .ip_row_loop
6126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
    ; Release the flimit4 slot. The pop presumably restores the stack pointer
    ; saved by ALIGN_STACK (macro not visible here) - confirm.
    ; NOTE(review): mm0/mm1 are written in the loop and no emms is issued
    ; before ret; presumably callers reset the MMX/x87 state - confirm.
6136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    add         rsp, 16
6146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop         rsp
6156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
6166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ; begin epilog
6176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rdi
6186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop rsi
6196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    RESTORE_GOT
6206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    RESTORE_XMM
6216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    UNSHADOW_ARGS
6226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    pop         rbp
6236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    ret
6246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org%undef flimit4
6256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
6266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
62706d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
6286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;                            unsigned char blackclamp[16],
6296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;                            unsigned char whiteclamp[16],
6306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org;                            unsigned char bothclamp[16],
63106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org;                            unsigned int width, unsigned int height, int pitch)
;-----------------------------------------------------------------------
; vp9_plane_add_noise_wmt
;
; For every row of the plane: draw a random start offset (0..255) into
; the noise buffer, then walk the row 16 bytes at a time, clamping each
; source vector and adding the noise bytes to it in place.
;
; Arguments (read through the arg(n) macro):
;   arg(0)  start       first byte of the plane
;   arg(1)  noise       base of the noise buffer
;   arg(2)  blackclamp  16-byte clamp vector; whiteclamp is read at +16
;                       and bothclamp at +32 from this same pointer, so
;                       arg(3)/arg(4) are never loaded
;   arg(5)  width       unsigned; rows are processed in whole 16-byte
;                       steps and at least one step is always done, so
;                       up to 15 bytes past `width` (16 when width is 0)
;                       are read and written
;   arg(6)  height      unsigned row count; 0 returns immediately
;   arg(7)  pitch       signed byte distance between rows
;
; The arg(0) and arg(6) slots are overwritten: they serve as the running
; row pointer and the remaining-row counter.
;
; NOTE(review): stack alignment (and any Win64 shadow space) at the call
; below depends on SHADOW_ARGS_TO_STACK / GET_GOT, which are defined
; outside this file section - confirm against x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp9_plane_add_noise_wmt) PRIVATE
sym(vp9_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The row loop below tests its counter only after a full pass, so an
    ; empty plane has to be rejected up front: no RNG call, no stores.
    cmp         dword arg(6), 0
    je          .addnoise_done

.addnoise_loop:
    call        sym(LIBVPX_RAND) WRT_PLT
    mov         rcx, arg(1)             ; noise
    and         rax, 0xff               ; keep the low byte: offset 0..255
    add         rcx, rax

    ; The clamp vectors are stored contiguously in black/white/both
    ; order, so one base pointer reaches all three. It is loaded after
    ; the call because rdx could be trashed by the callee.
    mov         rdx, arg(2)             ; blackclamp

    mov         rdi, rcx                ; rdi = noise + random offset
    mov         ecx, dword arg(5)       ; width, zero-extended (unsigned)
    mov         rsi, arg(0)             ; rsi = current row
    xor         rax, rax                ; rax = byte offset within the row

.addnoise_nextset:
    movdqu      xmm1, [rsi+rax]         ; source pixels

    ; clamp both ends so that adding noise cannot leave the valid range
    psubusb     xmm1, [rdx]             ; blackclamp
    paddusb     xmm1, [rdx+32]          ; bothclamp
    psubusb     xmm1, [rdx+16]          ; whiteclamp

    movdqu      xmm2, [rdi+rax]         ; noise for these 16 pixels
    paddb       xmm1, xmm2              ; wrapping add of the noise bytes
    movdqu      [rsi+rax], xmm1         ; write back in place

    add         rax, 16                 ; next 16 pixels of this row
    cmp         rax, rcx
    jb          .addnoise_nextset       ; unsigned compare: width is unsigned

    movsxd      rax, dword arg(7)       ; pitch (signed)
    add         arg(0), rax             ; start += pitch
    sub         dword arg(6), 1         ; height -= 1
    jnz         .addnoise_loop          ; height was non-zero on entry, so
                                        ; the count stops exactly at zero

.addnoise_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
6876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
6886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
SECTION_RODATA
align 16
; rd42: eight 16-bit words, each holding 4 (16 bytes in total).
rd42:
    dw 4, 4, 4, 4, 4, 4, 4, 4
; four8s: four 32-bit dwords, each holding 8 (16 bytes in total).
; It follows rd42 directly, so it starts on a 16-byte boundary as well.
four8s:
    dd 8, 8, 8, 8
695