/*
 * Copyright © 2013 The Android Open Source Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/*
 * These three settings are not referenced again in this file; presumably
 * they are consumed by the macros in pixman-arm-neon-asm.h (TODO confirm).
 */
.set RESPECT_STRICT_ALIGNMENT, 1
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Bit flags accepted by the "flags" argument of
 * generate_bilinear_scanline_func. Note that BILINEAR_FLAG_UNROLL_4 is 0,
 * i.e. it simply means "BILINEAR_FLAG_UNROLL_8 is not set".
 */
.set BILINEAR_FLAG_UNROLL_4,          0
.set BILINEAR_FLAG_UNROLL_8,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Supplementary macro for setting function attributes.
 *
 * Opens a .func scope for "fname", exports the symbol (hidden visibility
 * and %function type on ELF targets) and emits the entry label. The matching
 * .endfunc has to be emitted by the user of this macro.
 */
.macro pixman_asm_function fname
    .func fname
    .global fname
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
.endm

/*
 * NEON register conventions shared by the bilinear_* macros below, as set
 * up by generate_bilinear_scanline_func:
 *
 *   d28      - WT replicated into every byte lane
 *   d29      - WB replicated into every byte lane
 *   q12      - 16-bit running copies of the low half of X; the upper four
 *              lanes (d25) are one UX step ahead of the lower four (d24)
 *   q13      - 16-bit per-iteration increment added to q12
 *   q15      - q12 >> (16 - BILINEAR_INTERPOLATION_BITS), i.e. the
 *              horizontal weights; d30 is its lower half, d31 its upper half
 *
 * The ARM register aliases (OUT, TOP, X, UX, STRIDE, TMP1, ...) are the
 * .req names defined inside generate_bilinear_scanline_func.
 */

/*
 * Load one 64-bit chunk from each of the two source rows.
 *
 *   TMP1 = TOP + (X >> 16) * 4, then X += UX
 *   reg1 = 64 bits at TMP1                 (top row)
 *   reg2 = 64 bits at TMP1 + STRIDE        (bottom row)
 *
 * TMP1 is left pointing at the bottom-row address. The "tmp" argument is
 * accepted but not used.
 */
.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]
.endm

/*
 * Two consecutive bilinear_load_8888 steps, each followed by the vertical
 * blend  acc = top * d28 + bottom * d29  (8-bit inputs widened to 16 bits).
 * acc1 receives the first position, acc2 the second.
 */
.macro bilinear_load_and_vertical_interpolate_two_8888 \
                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

/*
 * Store "numpix" 32-bit results from d0/d1 to OUT with post-increment.
 * The store carries an alignment qualifier sized to the amount written
 * (:128, :64 or :32), so OUT must be aligned accordingly. Only 4, 2 and 1
 * are supported; tmp1/tmp2 are accepted but not used.
 */
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT, :128]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT, :64]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    /* The message has to be a quoted string for gas to print it. */
    .error "bilinear_store_8888 numpix is unsupported"
.endif
.endm

/*
 * Produce a single output element.
 *
 * After the vertical blend q1 holds two 4x16-bit halves (d2 and d3). They
 * are mixed horizontally with the weights w in d30:
 *
 *   q0 = (d2 << BILINEAR_INTERPOLATION_BITS) - d2 * w + d3 * w
 *
 * and the result is narrowed back to 8 bits per channel and stored.
 * The caller is responsible for having q15 (d30) up to date.
 */
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
    bilinear_load_&src_fmt d0, d1, d2
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_store_&dst_fmt 1, q2, q3
.endm

/*
 * Produce two output elements. The first uses the weights in d30, the
 * second the weights in d31. q15 is recomputed from q12 and q12 is advanced
 * by q13 along the way, ready for the next invocation.
 */
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

/*
 * Produce four output elements. Same arithmetic as the two-pixel variant,
 * applied to two pairs; q15/q12 are refreshed between the pairs and again
 * at the end. Prefetch hints are issued at PF_OFFS past TMP1 (for both
 * source rows, since TMP1 is stepped back by STRIDE) and past TMP2.
 */
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9,  d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    pld       [TMP2, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_store_&dst_fmt 4, q2, q3
.endm

/*
 * Pipelining hooks for the four-pixel main loop.
 *
 * If the symbol have_bilinear_interpolate_four_pixels_<src>_<dst> is
 * defined, dedicated _head / _tail / _tail_head macros for that format pair
 * are used. Otherwise "head" and "tail_head" each fall back to one plain
 * bilinear_interpolate_four_pixels and "tail" expands to nothing.
 */
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

/*
 * Four-position load + vertical blend: simply two back-to-back
 * bilinear_load_and_vertical_interpolate_two_8888 expansions, the first
 * with the x* arguments and the second with the y* arguments.
 */
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

/*
 * Emit a complete bilinear scanline function named "fname".
 *
 * Macro arguments:
 *   src_fmt, dst_fmt   - suffixes selecting the bilinear_load_* and
 *                        bilinear_store_* macros
 *   src_bpp_shift      - log2 of the source element size in bytes
 *   dst_bpp_shift      - log2 of the destination element size in bytes
 *   prefetch_distance  - multiplied by UX to form the prefetch offset
 *   flags              - combination of the BILINEAR_FLAG_* values
 *
 * Arguments of the generated function:
 *   r0        OUT     destination pointer
 *   r1        TOP     top source row
 *   r2        BOTTOM  bottom source row (turned into STRIDE = BOTTOM - TOP)
 *   r3        WT      weight applied to the top row
 *   [sp, #0]  WB      weight applied to the bottom row
 *   [sp, #4]  X       source position; X >> 16 is the element index
 *   [sp, #8]  UX      per-output-element increment of X
 *   [sp, #12] WIDTH   number of output elements; nothing is done if <= 0
 *
 * r4-r9 are saved and restored; d8-d15 are saved and restored only when
 * BILINEAR_FLAG_USE_ALL_NEON_REGS is set. TMP1 and TMP2 share registers
 * with WT and WB, which is safe because both weights are copied into
 * d28/d29 before the first load macro runs.
 */
.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                       src_bpp_shift, dst_bpp_shift, \
                                       prefetch_distance, flags

pixman_asm_function fname
    OUT       .req      r0
    TOP       .req      r1
    BOTTOM    .req      r2
    WT        .req      r3
    WB        .req      r4
    X         .req      r5
    UX        .req      r6
    WIDTH     .req      ip
    TMP1      .req      r3
    TMP2      .req      r4
    PF_OFFS   .req      r7
    TMP3      .req      r8
    TMP4      .req      r9
    STRIDE    .req      r2

    /* remember where the stack arguments are before pushing anything */
    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    /* upper half of q12 runs one UX step ahead of the lower half */
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment  */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #1
0:
    /* from here on q12 advances by two UX steps at a time */
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #2
0:
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #4
0:
    subs      WIDTH, WIDTH, #8
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    blt       5f
0:
    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    bge       0b
5:
    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1:
    tst       WIDTH, #4
    beq       2f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
    subs      WIDTH, WIDTH, #4
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    blt       5f
0:
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    bge       0b
5:
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1:
/****************************************************/
.endif
    /*
     * handle the remaining trailing pixels
     *
     * WIDTH has been over-decremented by the loop above, but subtracting
     * multiples of 4 leaves its two low bits intact, so they still tell
     * how many pixels are left.
     */
    tst       WIDTH, #2
    beq       2f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif
    pop       {r4, r5, r6, r7, r8, r9}
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.endfunc

.endm

/*
 * 8888 -> 8888 scanline function, four pixels per main-loop iteration,
 * prefetch distance 28.
 */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4