;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Writes diff[r][c] = src[r][c] - pred[r][c] for a rows x cols block.  The
; 8-bit inputs are widened to 16 bits by interleaving with m7 and subtracted
; with psubw, so every output element is a 16-bit word.
;
; diff_stride is counted in int16_t elements (the row advance below is
; diff_strideq*2 bytes); src_stride and pred_stride are counted in bytes.
;
; cols selects the code path: 4, 8, 16 and 32 have dedicated cases, and any
; other value falls through to the 64-wide loop.  The 4-, 8- and 16-wide
; paths consume two rows per iteration and loop while the remaining count is
; positive, so an odd rows value makes them process one row more than asked.
;
; NOTE(review): mova is x86inc's aligned move; the 16/32/64-wide paths
; therefore presumably require src, pred and diff rows to be mmsize-aligned.
; Confirm against the callers.

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; Only seven arguments are requested in registers, so pred_stride is fetched
; explicitly through pred_stridemp in each case.  It is kept in the cols
; register, which is no longer needed once the width dispatch below is done.
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register

  ; Dispatch on block width; anything not matched here is handled as 64 wide.
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
;
; Subtracts two independent groups of mmsize pixels.
;   %1, %2 - byte offsets from srcq of the two source groups
;   %3, %4 - byte offsets from predq of the two prediction groups
;   %5, %6 - byte offsets from diffq where the two result groups are stored
; Each group of mmsize bytes expands into two registers of 16-bit results
; (low half, then high half), stored at +mmsize*0 and +mmsize*1 from the
; given diff offset.  Clobbers m0-m5; reads m7 as the interleave operand.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  ; First group: widen high and low halves of src (m2/m0) and pred (m3/m1).
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  ; Second group: m1 and m3 are free again and are reused for the high halves.
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; Default path (64 wide): one row per iteration, four groups of mmsize
  ; pixels.  Output offsets step by 2*mmsize because each group doubles in
  ; size when widened to 16 bits.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

  ; 32 wide: one row per iteration, two groups of mmsize pixels.
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

  ; 16 wide: two rows per iteration; the second group of loop16 is the next
  ; row, addressed through the strides rather than a fixed offset.
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h
;
; Subtracts two consecutive rows whose pixels fit in half a register: each
; row is loaded with movh, widened with punpcklbw and stored as one full
; register of 16-bit results.  The width handled is therefore mmsize/2
; pixels, which depends on the INIT_* mode active where the macro is
; expanded.  Clobbers m0-m3; reads m7 as the interleave operand.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

  ; 8 wide: two rows per iteration using XMM registers.
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

; 4 wide: same macro, re-expanded with MMX registers so loads and stores are
; half the size of the 8-wide case.
;
; NOTE(review): m7 was cleared while INIT_XMM was active; under INIT_MMX it
; presumably names an MMX register that this function never clears.  The
; output is still correct because src and pred are interleaved with the same
; m7 and psubw wraps modulo 2^16, so whatever m7 contributes to the high
; bytes cancels in the subtraction.
INIT_MMX
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  ; MMX registers alias the x87 register file; clear the MMX state so that
  ; x87 floating-point code executed by the caller afterwards is not affected.
  emms
  RET