/* pixman-arm-simd-asm-scaled.S revision 1176bdada62cabc6ec4b0308a930e83b679d5d36 */
/*
 * Copyright © 2008 Mozilla Corporation
 * Copyright © 2010 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Jeff Muizelaar (jeff@infidigm.net)
 *
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        /*
         * Alternate macro mode: the macros below refer to their arguments
         * by bare name and glue them onto mnemonics with '&' (ldr&t).
         */
        .altmacro
        .p2align 2

/*
 * pixman_asm_function fname
 *
 * Opens a function: emits .func, exports the symbol and places the entry
 * label. On ELF targets the symbol is additionally marked hidden and typed
 * as a function. The matching .endfunc is emitted by the user of this macro.
 */
.macro pixman_asm_function fname
        .func fname
        .global fname
#ifdef __ELF__
        .hidden fname
        .type fname, %function
#endif
fname:
.endm

/*
 * Note: This code is only using armv5te instructions (not even armv6),
 *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
 *       be split into a few variants, tuned for each microarchitecture.
 *
 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
 * have efficient write combining), it needs to be changed to use 16-byte
 * aligned writes using STM instruction.
 *
 * Nearest scanline scaler macro template uses the following arguments:
 *  fname                     - name of the function to generate
 *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
 *  t                         - type suffix for LDR/STR instructions
 *  prefetch_distance         - prefetch in the source image by that many
 *                              pixels ahead
 *  prefetch_braking_distance - stop prefetching when that many pixels are
 *                              remaining before the end of scanline
 *
 * Inputs of the generated function, as the code consumes them:
 *  r0          - W, number of pixels to write
 *  r1          - DST, written with post-increment, one pixel at a time
 *  r2          - SRC, base address for the pixel loads
 *  r3          - VX, source position with 16 fractional bits (it is shifted
 *                right by (16 - bpp_shift) to form a byte offset)
 *  [sp]        - UNIT_X, added to VX after every pixel
 *  [sp, #4]    - SRC_WIDTH_FIXED, subtracted from VX for as long as VX is
 *                non-negative
 *
 * NOTE(review): the wrap test only fires for VX >= 0, so callers presumably
 * pass VX (and SRC) biased so that a non-negative VX means "past the end of
 * the source row" - confirm against the C caller.
 */
.macro generate_nearest_scanline_func fname, bpp_shift, t, \
                                      prefetch_distance, \
                                      prefetch_braking_distance

pixman_asm_function fname
        /* register arguments */
        W               .req    r0
        DST             .req    r1
        SRC             .req    r2
        VX              .req    r3
        /* stack arguments, loaded below */
        UNIT_X          .req    ip
        SRC_WIDTH_FIXED .req    r8
        /* scratch: TMP1/TMP2 alternate as "byte offset of the next pixel" */
        TMP1            .req    r4
        TMP2            .req    r5
        VXMASK          .req    r6
        PF_OFFS         .req    r7

        /* first stack word must be fetched before sp moves */
        ldr     UNIT_X, [sp]
        /*
         * NOTE(review): r10 is saved and restored but never referenced in
         * this body - presumably it only keeps the pushed size at a
         * multiple of 8 bytes; confirm before removing.
         */
        push    {r4, r5, r6, r7, r8, r10}
        /* mask that clears the low bpp_shift bits of a byte offset */
        mvn     VXMASK, #((1 << bpp_shift) - 1)
        /* 6 registers pushed = 24 bytes, so #28 is the second stack word */
        ldr     SRC_WIDTH_FIXED, [sp, #28]

        /*
         * Helper: copy two pixels.
         *
         * On entry TMP1 holds the byte offset of the pixel to copy; on exit
         * it holds the offset of the next one. For each pixel: load it,
         * compute the following offset into the other temporary, step VX,
         * store with post-increment (STR leaves the flags from ADDS
         * intact), then subtract SRC_WIDTH_FIXED while VX is non-negative.
         */
        .macro  scale_2_pixels
                ldr&t   TMP1, [SRC, TMP1]
                and     TMP2, VXMASK, VX, asr #(16 - bpp_shift)
                adds    VX, VX, UNIT_X
                str&t   TMP1, [DST], #(1 << bpp_shift)
9:              subpls  VX, VX, SRC_WIDTH_FIXED
                bpl     9b

                ldr&t   TMP2, [SRC, TMP2]
                and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
                adds    VX, VX, UNIT_X
                str&t   TMP2, [DST], #(1 << bpp_shift)
9:              subpls  VX, VX, SRC_WIDTH_FIXED
                bpl     9b
        .endm

        /* prime TMP1 with the offset of the first pixel and step VX once */
        and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
        adds    VX, VX, UNIT_X
9:      subpls  VX, VX, SRC_WIDTH_FIXED
        bpl     9b

        /* W = remaining - (8 + braking distance); negative: skip prefetch */
        subs    W, W, #(8 + prefetch_braking_distance)
        blt     2f

        /* PF_OFFS = VX + UNIT_X * prefetch_distance */
        mov     PF_OFFS, #prefetch_distance
        mla     PF_OFFS, UNIT_X, PF_OFFS, VX

1:      /* main loop: 8 pixels per iteration, one PLD per iteration */
        /* PF_OFFS is neither masked nor wrapped; it only feeds PLD */
        pld     [SRC, PF_OFFS, asr #(16 - bpp_shift)]
        add     PF_OFFS, PF_OFFS, UNIT_X, lsl #3
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        subs    W, W, #8
        bge     1b

2:      /* undo the bias above so that W = remaining - 4 */
        subs    W, W, #(4 - 8 - prefetch_braking_distance)
        blt     2f

1:      /* no prefetch: 4 pixels per iteration */
        scale_2_pixels
        scale_2_pixels
        subs    W, W, #4
        bge     1b

2:      /* W is now in [-4, -1]; its two low bits count the pixels left */
        tst     W, #2
        beq     2f
        scale_2_pixels

2:      /* last odd pixel: its offset is already in TMP1 */
        tst     W, #1
        ldrne&t TMP1, [SRC, TMP1]
        strne&t TMP1, [DST]

        /* drop the helper and the aliases so the template can be reused */
        .purgem scale_2_pixels
        .unreq  W
        .unreq  DST
        .unreq  SRC
        .unreq  VX
        .unreq  UNIT_X
        .unreq  SRC_WIDTH_FIXED
        .unreq  TMP1
        .unreq  TMP2
        .unreq  VXMASK
        .unreq  PF_OFFS

        /* restore the saved registers and return */
        pop     {r4, r5, r6, r7, r8, r10}
        bx      lr
.endfunc
.endm

/* 2-byte pixels (bpp_shift = 1): halfword loads/stores, prefetch 80 ahead */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32

/* 4-byte pixels (bpp_shift = 2): word loads/stores, prefetch 48 ahead */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32