;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; Byte/word constants.  Only pb_1 is referenced by the routines in this part
; of the file; pw_2, pb_7m1 and pb_15 are kept for any other users.
pb_1:   times 16 db 1
pw_2:   times 8 dw 2
pb_7m1: times 8 db 7, -1
pb_15:  times 16 db 15

; pshufb control masks.  The digits in each name list, per destination byte,
; the index of the source byte that is selected (hex digits for 16-byte masks).
; Repeating the last index replicates the final source byte while the vector
; is shifted down.
;
; NOTE(review): the two 8-byte masks are followed by a mask padded to 16 bytes,
; which keeps every later 16-byte table at a multiple of 16 bytes from pb_1 --
; presumably deliberate so that XMM memory operands stay aligned; do not
; reorder or resize these tables without re-checking alignment.
sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7

; "b2w" masks: a -1 index has its top bit set, which makes pshufb write zero,
; so every selected byte ends up zero-extended to a 16-bit word.
; (Not referenced by the routines in this part of the file.)
sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15

SECTION .text

;------------------------------------------------------------------------------
; Horizontal predictors: h_predictor_NxN(dst, stride, <arg 3>, left)
;
; Every pixel of output row r is set to left[r].
;
; Shared structure of the four routines:
;   * cglobal loads only the first two arguments (dst, stride).  The register
;     of the third argument is reused as the loop counter `line` and its
;     incoming value is never read; `left` (fourth argument) is fetched with
;     movifnidn.
;   * leftq is advanced to left + N and lineq counts from -N/2 up to 0, so
;     [leftq+lineq*2] / [leftq+lineq*2+1] address two consecutive left pixels
;     and `inc` + `jnz` terminate the loop without a separate compare.
;   * m0 is all zeroes; used as a pshufb mask it copies source byte 0 into
;     every destination byte, i.e. it broadcasts the left pixel across the row.
;
; NOTE(review): movd loads 4 bytes although only the lowest one is used, so the
; final iteration reads up to 3 bytes beyond left[N-1].  Presumably the
; caller's buffer is large enough -- confirm before reusing these routines.
;------------------------------------------------------------------------------

; 4x4: two rows per iteration, 4-byte (movd) stores.
INIT_MMX ssse3
cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  add                leftq, 4                  ; leftq = one past left[3]
  mov                lineq, -2                 ; 2 iterations x 2 rows
  pxor                  m0, m0                 ; broadcast-byte-0 mask
.loop:
  movd                  m1, [leftq+lineq*2  ]
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m1, m0
  pshufb                m2, m0
  movd      [dstq        ], m1
  movd      [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  inc                lineq
  jnz .loop
  REP_RET

; 8x8: two rows per iteration, 8-byte (movq) stores.
INIT_MMX ssse3
cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  add                leftq, 8                  ; leftq = one past left[7]
  mov                lineq, -4                 ; 4 iterations x 2 rows
  pxor                  m0, m0                 ; broadcast-byte-0 mask
.loop:
  movd                  m1, [leftq+lineq*2  ]
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m1, m0
  pshufb                m2, m0
  movq      [dstq        ], m1
  movq      [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  inc                lineq
  jnz .loop
  REP_RET

; 16x16: two rows per iteration, one 16-byte store per row.
; mova is an aligned move under INIT_XMM, so every row start (dst and
; dst+stride) has to be 16-byte aligned.
INIT_XMM ssse3
cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  add                leftq, 16                 ; leftq = one past left[15]
  mov                lineq, -8                 ; 8 iterations x 2 rows
  pxor                  m0, m0                 ; broadcast-byte-0 mask
.loop:
  movd                  m1, [leftq+lineq*2  ]
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m1, m0
  pshufb                m2, m0
  mova      [dstq        ], m1
  mova      [dstq+strideq], m2
  lea                 dstq, [dstq+strideq*2]
  inc                lineq
  jnz .loop
  REP_RET

; 32x32: two rows per iteration, two 16-byte stores per row.
; mova is an aligned move under INIT_XMM, so every row start has to be
; 16-byte aligned.
INIT_XMM ssse3
cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  add                leftq, 32                 ; leftq = one past left[31]
  mov                lineq, -16                ; 16 iterations x 2 rows
  pxor                  m0, m0                 ; broadcast-byte-0 mask
.loop:
  movd                  m1, [leftq+lineq*2  ]
  movd                  m2, [leftq+lineq*2+1]
  pshufb                m1, m0
  pshufb                m2, m0
  mova   [dstq           ], m1
  mova   [dstq        +16], m1
  mova   [dstq+strideq   ], m2
  mova   [dstq+strideq+16], m2
  lea                 dstq, [dstq+strideq*2]
  inc                lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; 45-degree diagonal predictors: d45_predictor_NxN(dst, stride, above)
;
; All four routines first build a filtered copy r[] of the `above` row:
;
;     r[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
;
; where out-of-range indices are clamped to the last loaded pixel by the
; shuffle masks.  The filter is evaluated without widening to 16 bits:
;
;     t = pavgb(a[i], a[i+2])            ; (a + c + 1) >> 1, rounds up
;     t = t - ((a[i] ^ a[i+2]) & 1)      ; undo the round-up -> (a + c) >> 1
;     r = pavgb(a[i+1], t)               ; (b + t + 1) >> 1
;
; Output row k is r[] shifted down by k bytes, so each routine stores the
; vector, shifts it by one byte, and repeats.
;------------------------------------------------------------------------------

; 4x4: loads 8 bytes of `above`; rows are the low 4 bytes of r[] >> (8*k).
; The first mask selects index 7 (not 6) for byte 6, so r[6] is computed from
; three copies of a[7] and therefore equals a[7] exactly.
INIT_MMX ssse3
cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
  movq                m0, [aboveq]
  pshufb              m2, m0, [sh_b23456777]   ; a[i+2]
  pshufb              m1, m0, [sh_b01234577]   ; a[i]
  pshufb              m0, [sh_b12345677]       ; a[i+1]
  pavgb               m3, m2, m1
  pxor                m2, m1
  pand                m2, [pb_1]
  psubb               m3, m2                   ; (a[i] + a[i+2]) >> 1
  pavgb               m0, m3                   ; r[i]

  ; store 4 lines
  movd    [dstq        ], m0
  psrlq               m0, 8
  movd    [dstq+strideq], m0
  lea               dstq, [dstq+strideq*2]
  psrlq               m0, 8
  movd    [dstq        ], m0
  psrlq               m0, 8
  movd    [dstq+strideq], m0
  RET

; 8x8: loads 8 bytes of `above`.  After the load the `above` register is
; renamed to stride3 (3*stride).  Rows are advanced with pshufb by m1
; (sh_b12345677), which shifts down one byte and replicates the last byte.
; NOTE(review): the `line` name introduced by DEFINE_ARGS is never used here.
INIT_MMX ssse3
cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
  movq                m0, [aboveq]
  mova                m1, [sh_b12345677]
  DEFINE_ARGS dst, stride, stride3, line
  lea           stride3q, [strideq*3]
  pshufb              m2, m0, [sh_b23456777]   ; a[i+2]
  pavgb               m3, m2, m0
  pxor                m2, m0
  pshufb              m0, m1                   ; a[i+1]
  pand                m2, [pb_1]
  psubb               m3, m2                   ; (a[i] + a[i+2]) >> 1
  pavgb               m0, m3                   ; r[i]

  ; store 4 lines
  movq  [dstq          ], m0
  pshufb              m0, m1
  movq  [dstq+strideq  ], m0
  pshufb              m0, m1
  movq  [dstq+strideq*2], m0
  pshufb              m0, m1
  movq  [dstq+stride3q ], m0
  pshufb              m0, m1
  lea               dstq, [dstq+strideq*4]

  ; store next 4 lines
  movq  [dstq          ], m0
  pshufb              m0, m1
  movq  [dstq+strideq  ], m0
  pshufb              m0, m1
  movq  [dstq+strideq*2], m0
  pshufb              m0, m1
  movq  [dstq+stride3q ], m0
  RET

; 16x16: loads 16 bytes of `above` with an aligned move (above and every dst
; row must be 16-byte aligned).  dst8 points 8 rows below dst.
;
; In the loop each shifted vector is stored twice: all 16 bytes go to row k
; (k = 0..7) and its upper 8 bytes go to the left half of row k+8, since
; r[k+8 .. k+15] is both the right half of row k and the left half of row k+8.
; After 8 shifts the upper half of m0 holds only the replicated last pixel,
; which fills the remaining bottom-right 8x8 quadrant.
INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
  mova                   m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, dst8, line
  lea              stride3q, [strideq*3]
  lea                 dst8q, [dstq+strideq*8]
  mova                   m1, [sh_b123456789abcdeff]
  pshufb                 m2, m0, [sh_b23456789abcdefff] ; a[i+2]
  pavgb                  m3, m2, m0
  pxor                   m2, m0
  pshufb                 m0, m1                         ; a[i+1]
  pand                   m2, [pb_1]
  psubb                  m3, m2                         ; (a[i] + a[i+2]) >> 1
  pavgb                  m0, m3                         ; r[i]

  ; rows 0-7 in full, plus the left half of rows 8-15 (4 rows per iteration)
  mov                 lined, 2
.loop:
  mova   [dstq             ], m0
  movhps [dst8q            ], m0
  pshufb                 m0, m1
  mova   [dstq +strideq    ], m0
  movhps [dst8q+strideq    ], m0
  pshufb                 m0, m1
  mova   [dstq +strideq*2  ], m0
  movhps [dst8q+strideq*2  ], m0
  pshufb                 m0, m1
  mova   [dstq +stride3q   ], m0
  movhps [dst8q+stride3q   ], m0
  pshufb                 m0, m1
  lea                  dstq, [dstq +strideq*4]
  lea                 dst8q, [dst8q+strideq*4]
  dec                 lined
  jnz .loop

  ; bottom-right 8x8 block (dstq now points at row 8)
  movhps [dstq          +8], m0
  movhps [dstq+strideq  +8], m0
  movhps [dstq+strideq*2+8], m0
  movhps [dstq+stride3q +8], m0
  lea                  dstq, [dstq+strideq*4]
  movhps [dstq          +8], m0
  movhps [dstq+strideq  +8], m0
  movhps [dstq+strideq*2+8], m0
  movhps [dstq+stride3q +8], m0
  RET

; 32x32: loads 32 bytes of `above` with aligned moves (above and every dst
; row must be 16-byte aligned).  dst16 points 16 rows below dst.
;
; The filtered row is kept in two registers: the low 16 pixels r[0..15]
; (alternating between m5 and m3) and the high 16 pixels r[16..31] (m4).
; For the low half, a[i+1] and a[i+2] are taken across the register boundary
; with palignr; for the high half they come from the replicate-last shuffles.
;
; Shifting a row down by one byte is palignr(high, low, 1) for the low half
; (executed first, while m4 is still the unshifted high half) and pshufb by
; m1 for the high half.
INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
  mova                   m0, [aboveq]                   ; a[0..15]
  mova                   m4, [aboveq+16]                ; a[16..31]
  DEFINE_ARGS dst, stride, stride3, dst16, line
  lea              stride3q, [strideq*3]
  lea                dst16q, [dstq  +strideq*8]
  lea                dst16q, [dst16q+strideq*8]         ; dst + 16*stride
  mova                   m1, [sh_b123456789abcdeff]

  ; high half: r[16..31]
  pshufb                 m2, m4, [sh_b23456789abcdefff] ; a[i+2]
  pavgb                  m3, m2, m4
  pxor                   m2, m4
  palignr                m5, m4, m0, 1                  ; a[1..16]  (low a[i+1])
  palignr                m6, m4, m0, 2                  ; a[2..17]  (low a[i+2])
  pshufb                 m4, m1                         ; a[i+1]
  pand                   m2, [pb_1]
  psubb                  m3, m2
  pavgb                  m4, m3

  ; low half: r[0..15]
  pavgb                  m3, m0, m6
  pxor                   m0, m6
  pand                   m0, [pb_1]
  psubb                  m3, m0
  pavgb                  m5, m3

  ; rows 0-15 in full, plus the left half of rows 16-31
  ; (4 iterations of 4 rows each)
  mov                 lined, 4
.loop:
  mova [dstq               ], m5
  mova [dstq            +16], m4
  mova [dst16q             ], m4
  palignr                m3, m4, m5, 1
  pshufb                 m4, m1
  mova [dstq  +strideq     ], m3
  mova [dstq  +strideq  +16], m4
  mova [dst16q+strideq     ], m4
  palignr                m5, m4, m3, 1
  pshufb                 m4, m1
  mova [dstq  +strideq*2   ], m5
  mova [dstq  +strideq*2+16], m4
  mova [dst16q+strideq*2   ], m4
  palignr                m3, m4, m5, 1
  pshufb                 m4, m1
  mova [dstq  +stride3q    ], m3
  mova [dstq  +stride3q +16], m4
  mova [dst16q+stride3q    ], m4
  palignr                m5, m4, m3, 1
  pshufb                 m4, m1
  lea                  dstq, [dstq  +strideq*4]
  lea                dst16q, [dst16q+strideq*4]
  dec                 lined
  jnz .loop

  ; right half of rows 16-31 (dstq now points at row 16).  After 16 shifts
  ; m4 contains only the replicated last pixel.
  mova [dstq            +16], m4
  mova [dstq  +strideq  +16], m4
  mova [dstq  +strideq*2+16], m4
  mova [dstq  +stride3q +16], m4
  lea                  dstq, [dstq  +strideq*4]
  mova [dstq            +16], m4
  mova [dstq  +strideq  +16], m4
  mova [dstq  +strideq*2+16], m4
  mova [dstq  +stride3q +16], m4
  lea                  dstq, [dstq  +strideq*4]
  mova [dstq            +16], m4
  mova [dstq  +strideq  +16], m4
  mova [dstq  +strideq*2+16], m4
  mova [dstq  +stride3q +16], m4
  lea                  dstq, [dstq  +strideq*4]
  mova [dstq            +16], m4
  mova [dstq  +strideq  +16], m4
  mova [dstq  +strideq*2+16], m4
  mova [dstq  +stride3q +16], m4
  RET