/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"
#include "SkUtilsArm.h"

#include "SkCachePreload_arm.h"

// Define USE_NEON_CODE to indicate that we need to build NEON routines
#define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE)

// Define USE_ARM_CODE to indicate that we need to build ARM routines
#define USE_ARM_CODE (!SK_ARM_NEON_IS_ALWAYS)

#if USE_NEON_CODE
    #include "SkBlitRow_opts_arm_neon.h"
#endif

#if USE_ARM_CODE

/*
 * SrcOver blit of a row of 32-bit premultiplied pixels onto a 16-bit
 * 565 destination.  The 'alpha' parameter must be 255 (asserted); the
 * per-pixel source alpha is still honored.
 *
 * Per pixel, three cases:
 *   - source alpha == 0xff: pack the three 8-bit channels straight to 565;
 *   - source alpha == 0x00: skip the destination pixel untouched;
 *   - otherwise: scale each 565 destination channel by (255 - alpha),
 *     widen it back toward 8 bits (approximate divide by 31/63), add the
 *     matching premultiplied source byte, and repack to 565.
 *
 * Scratch registers r3-r7 and ip are declared in the clobber list.
 *
 * NOTE(review): the "pld [r0]" / "pld [r1]" prefetches assume dst/src
 * were assigned the AAPCS argument registers r0/r1; the "+r" constraints
 * do not actually guarantee that allocation — confirm with the toolchain.
 * NOTE(review): unlike the 32-bit routines below, there is no count == 0
 * guard before the first iteration; callers must pass count >= 1.
 */
static void S32A_D565_Opaque(uint16_t* SK_RESTRICT dst,
                             const SkPMColor* SK_RESTRICT src, int count,
                             U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    asm volatile (
        "1: \n\t"                               /* <opaque fast-path loop> */
        "ldr r3, [%[src]], #4 \n\t"             /* r3 = *src++ (8888 pixel) */
        "cmp r3, #0xff000000 \n\t"              /* fully opaque? */
        "blo 2f \n\t"                           /* no -> blend/skip path */
        "and r4, r3, #0x0000f8 \n\t"            /* top 5 bits of byte 0 */
        "and r5, r3, #0x00fc00 \n\t"            /* top 6 bits of byte 1 */
        "and r6, r3, #0xf80000 \n\t"            /* top 5 bits of byte 2 */
        "pld [r1, #32] \n\t"                    /* prefetch ahead in src */
        "lsl r3, r4, #8 \n\t"                   /* place 5-bit field at 15:11 */
        "orr r3, r3, r5, lsr #5 \n\t"           /* merge 6-bit field at 10:5 */
        "orr r3, r3, r6, lsr #19 \n\t"          /* merge 5-bit field at 4:0 */
        "subs %[count], %[count], #1 \n\t"
        "strh r3, [%[dst]], #2 \n\t"            /* *dst++ = packed 565 */
        "bne 1b \n\t"
        "b 4f \n\t"

        "2: \n\t"                               /* <translucent pixel> */
        "lsrs r7, r3, #24 \n\t"                 /* r7 = source alpha */
        "beq 3f \n\t"                           /* alpha == 0 -> skip pixel */
        "ldrh r4, [%[dst]] \n\t"                /* r4 = current 565 dst */
        "rsb r7, r7, #255 \n\t"                 /* r7 = scale = 255 - alpha */
        "and r6, r4, #0x001f \n\t"              /* r6 = dst low 5-bit field */
#if SK_ARM_ARCH == 6
        "lsl r5, r4, #21 \n\t"                  /* ARMv6 has no ubfx: */
        "lsr r5, r5, #26 \n\t"                  /* r5 = (dst >> 5) & 0x3f */
#else
        "ubfx r5, r4, #5, #6 \n\t"              /* r5 = dst mid 6-bit field */
#endif
        "pld [r0, #16] \n\t"                    /* prefetch ahead in dst */
        "lsr r4, r4, #11 \n\t"                  /* r4 = dst high 5-bit field */
#ifdef SK_ARM_HAS_EDSP
        "smulbb r6, r6, r7 \n\t"                /* scale each dst channel */
        "smulbb r5, r5, r7 \n\t"                /* (16x16 multiply when the */
        "smulbb r4, r4, r7 \n\t"                /*  DSP extension exists) */
#else
        "mul r6, r6, r7 \n\t"                   /* plain multiplies otherwise */
        "mul r5, r5, r7 \n\t"
        "mul r4, r4, r7 \n\t"
#endif
        "uxtb r7, r3, ROR #16 \n\t"             /* r7 = src byte 2 */
        "uxtb ip, r3, ROR #8 \n\t"              /* ip = src byte 1 */
        "and r3, r3, #0xff \n\t"                /* r3 = src byte 0 */
        "add r6, r6, #16 \n\t"                  /* rounding bias (5-bit) */
        "add r5, r5, #32 \n\t"                  /* rounding bias (6-bit) */
        "add r4, r4, #16 \n\t"                  /* rounding bias (5-bit) */
        "add r6, r6, r6, lsr #5 \n\t"           /* x += x >> 5: approximate */
        "add r5, r5, r5, lsr #6 \n\t"           /* divide by 31 (resp. 63) */
        "add r4, r4, r4, lsr #5 \n\t"           /* over the next shift */
        "add r6, r7, r6, lsr #5 \n\t"           /* add premultiplied src byte */
        "add r5, ip, r5, lsr #6 \n\t"
        "add r4, r3, r4, lsr #5 \n\t"
        "lsr r6, r6, #3 \n\t"                   /* 8 -> 5 bits */
        "and r5, r5, #0xfc \n\t"                /* keep top 6 bits */
        "and r4, r4, #0xf8 \n\t"                /* keep top 5 bits */
        "orr r6, r6, r5, lsl #3 \n\t"           /* repack fields ... */
        "orr r4, r6, r4, lsl #8 \n\t"           /* ... into one 565 pixel */
        "strh r4, [%[dst]], #2 \n\t"            /* *dst++ = blended 565 */
        "pld [r1, #32] \n\t"
        "subs %[count], %[count], #1 \n\t"
        "bne 1b \n\t"
        "b 4f \n\t"

        "3: \n\t"                               /* <transparent pixel> */
        "subs %[count], %[count], #1 \n\t"
        "add %[dst], %[dst], #2 \n\t"           /* skip dst, keep its value */
        "bne 1b \n\t"

        "4: \n\t"                               /* <exit> */
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
        :
        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
    );
}

/*
 * SrcOver blit of a row of 32-bit premultiplied pixels onto a 32-bit
 * destination.  The 'alpha' parameter must be 255 (asserted).
 *
 * Per pixel: dst = src + dst * (256 - srcAlpha) >> 8, computed two
 * channels at a time through the 0x00ff00ff mask held in ip (rb pair
 * and ag pair).  Pixels are processed two at a time in the main loop,
 * with a single-pixel tail loop for odd counts.
 */
static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    asm volatile (
        "cmp %[count], #0 \n\t"                /* comparing count with 0 */
        "beq 3f \n\t"                          /* if zero exit */

        "mov ip, #0xff \n\t"                   /* load the 0xff mask in ip */
        "orr ip, ip, ip, lsl #16 \n\t"         /* convert it to 0xff00ff in ip */

        "cmp %[count], #2 \n\t"                /* compare count with 2 */
        "blt 2f \n\t"                          /* if less than 2 -> single loop */

        /* Double Loop: two pixels per iteration */
        "1: \n\t"                              /* <double loop> */
        "ldm %[src]!, {r5,r6} \n\t"            /* load two src pixels into r5-r6 */
        "ldm %[dst], {r7,r8} \n\t"             /* load two dst pixels into r7-r8 */
        "lsr r4, r5, #24 \n\t"                 /* r4 = alpha of first src pixel */

        /* ----------- first pixel ----------- */
        "and r9, ip, r7 \n\t"                  /* r9 = dst rb pair, masked by ip */
        "rsb r4, r4, #256 \n\t"                /* r4 = scale = 256 - alpha */
        "and r10, ip, r7, lsr #8 \n\t"         /* r10 = dst ag pair, masked by ip */

        "mul r9, r9, r4 \n\t"                  /* rb = rb * scale */
        "mul r10, r10, r4 \n\t"                /* ag = ag * scale */
        "and r9, ip, r9, lsr #8 \n\t"          /* rb >> 8, masked back to 0x00ff00ff */

        "and r10, r10, ip, lsl #8 \n\t"        /* ag masked with reverse mask 0xff00ff00 */
        "lsr r4, r6, #24 \n\t"                 /* r4 = alpha of second src pixel */
        "orr r7, r9, r10 \n\t"                 /* recombine rb | ag */

        "add r7, r5, r7 \n\t"                  /* result1 = src + scaled dst (r7) */
        "rsb r4, r4, #256 \n\t"                /* r4 = scale = 256 - alpha */

        /* ----------- second pixel ----------- */
        "and r9, ip, r8 \n\t"                  /* r9 = dst rb pair, masked by ip */

        "and r10, ip, r8, lsr #8 \n\t"         /* r10 = dst ag pair, masked by ip */
        "mul r9, r9, r4 \n\t"                  /* rb = rb * scale */
        "sub %[count], %[count], #2 \n\t"
        "mul r10, r10, r4 \n\t"                /* ag = ag * scale */

        "and r9, ip, r9, lsr #8 \n\t"          /* rb >> 8, masked back to 0x00ff00ff */
        "and r10, r10, ip, lsl #8 \n\t"        /* ag masked with reverse mask 0xff00ff00 */
        "cmp %[count], #1 \n\t"                /* comparing count with 1 */
        "orr r8, r9, r10 \n\t"                 /* recombine rb | ag */

        "add r8, r6, r8 \n\t"                  /* result2 = src + scaled dst (r8) */

        /* ----------------- */
        "stm %[dst]!, {r7,r8} \n\t"            /* store both results, dst += 2 pixels */
        /* ----------------- */

        "bgt 1b \n\t"                          /* if greater than 1 -> reloop */
        "blt 3f \n\t"                          /* if less than 1 -> exit */

        /* Single Loop: exactly one pixel left */
        "2: \n\t"                              /* <single loop> */
        "ldr r5, [%[src]], #4 \n\t"            /* r5 = *src++ */
        "ldr r7, [%[dst]] \n\t"                /* r7 = *dst */
        "lsr r4, r5, #24 \n\t"                 /* r4 = source alpha */

        /* ----------- */
        "and r9, ip, r7 \n\t"                  /* r9 = dst rb pair, masked by ip */
        "rsb r4, r4, #256 \n\t"                /* r4 = scale = 256 - alpha */

        "and r10, ip, r7, lsr #8 \n\t"         /* r10 = dst ag pair, masked by ip */
        "mul r9, r9, r4 \n\t"                  /* rb = rb * scale */
        "mul r10, r10, r4 \n\t"                /* ag = ag * scale */
        "and r9, ip, r9, lsr #8 \n\t"          /* rb >> 8, masked back to 0x00ff00ff */

        "and r10, r10, ip, lsl #8 \n\t"        /* ag masked with reverse mask 0xff00ff00 */
        "orr r7, r9, r10 \n\t"                 /* recombine rb | ag */

        "add r7, r5, r7 \n\t"                  /* result = src + scaled dst (r7) */

        /* ----------------- */
        "str r7, [%[dst]], #4 \n\t"            /* *dst++ = r7 */
        /* ----------------- */

        "3: \n\t"                              /* <exit> */
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
        :
        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
    );
}

/*
 * ARM asm version of S32A_Blend_BlitRow32.
 *
 * SrcOver with an extra global alpha: per pixel,
 *   src_scale = alpha + 1
 *   dst_scale = 256 - ((srcAlpha * src_scale) >> 8)
 *   dst       = (src * src_scale >> 8) + (dst * dst_scale >> 8)
 * with the channel math done pairwise (rb / ag) through the 0x00ff00ff
 * mask held in r12.  Two pixels per main-loop iteration, single-pixel
 * tail for odd counts.
 */
void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    asm volatile (
        "cmp %[count], #0 \n\t"                /* comparing count with 0 */
        "beq 3f \n\t"                          /* if zero exit */

        "mov r12, #0xff \n\t"                  /* load the 0xff mask in r12 */
        "orr r12, r12, r12, lsl #16 \n\t"      /* convert it to 0xff00ff in r12 */

        /* src1,2_scale */
        "add %[alpha], %[alpha], #1 \n\t"      /* %[alpha] = src_scale = alpha + 1 */

        "cmp %[count], #2 \n\t"                /* comparing count with 2 */
        "blt 2f \n\t"                          /* if less than 2 -> single loop */

        /* Double Loop: two pixels per iteration */
        "1: \n\t"                              /* <double loop> */
        "ldm %[src]!, {r5, r6} \n\t"           /* load two src pixels into r5 and r6 */
        "ldm %[dst], {r7, r8} \n\t"            /* load two dst pixels into r7 and r8 */

        /* dst1_scale and dst2_scale */
        "lsr r9, r5, #24 \n\t"                 /* r9 = alpha of first src pixel */
        "lsr r10, r6, #24 \n\t"                /* r10 = alpha of second src pixel */
#ifdef SK_ARM_HAS_EDSP
        "smulbb r9, r9, %[alpha] \n\t"         /* r9 = SkMulS16(r9, src_scale) */
        "smulbb r10, r10, %[alpha] \n\t"       /* r10 = SkMulS16(r10, src_scale) */
#else
        "mul r9, r9, %[alpha] \n\t"            /* r9 = SkMulS16(r9, src_scale) */
        "mul r10, r10, %[alpha] \n\t"          /* r10 = SkMulS16(r10, src_scale) */
#endif
        "lsr r9, r9, #8 \n\t"                  /* r9 >>= 8 */
        "lsr r10, r10, #8 \n\t"                /* r10 >>= 8 */
        "rsb r9, r9, #256 \n\t"                /* dst1_scale = r9 = 256 - r9 */
        "rsb r10, r10, #256 \n\t"              /* dst2_scale = r10 = 256 - r10 */

        /* ---------------------- */

        /* src1, scaled by src_scale */
        "and r11, r12, r5, lsr #8 \n\t"        /* r11 = src1 ag pair, masked by r12 */
        "and r4, r12, r5 \n\t"                 /* r4 = src1 rb pair, masked by r12 */
        "mul r11, r11, %[alpha] \n\t"          /* ag = ag * src_scale */
        "mul r4, r4, %[alpha] \n\t"            /* rb = rb * src_scale */
        "and r11, r11, r12, lsl #8 \n\t"       /* ag masked with reverse mask 0xff00ff00 */
        "and r4, r12, r4, lsr #8 \n\t"         /* rb >> 8, masked by r12 */
        "orr r5, r11, r4 \n\t"                 /* r5 = scaled src1 */

        /* dst1, scaled by dst1_scale */
        "and r11, r12, r7, lsr #8 \n\t"        /* r11 = dst1 ag pair, masked by r12 */
        "and r4, r12, r7 \n\t"                 /* r4 = dst1 rb pair, masked by r12 */
        "mul r11, r11, r9 \n\t"                /* ag = ag * dst1_scale (r9) */
        "mul r4, r4, r9 \n\t"                  /* rb = rb * dst1_scale (r9) */
        "and r11, r11, r12, lsl #8 \n\t"       /* ag masked with reverse mask 0xff00ff00 */
        "and r4, r12, r4, lsr #8 \n\t"         /* rb >> 8, masked by r12 */
        "orr r9, r11, r4 \n\t"                 /* r9 = scaled dst1 */

        /* ---------------------- */
        "add r9, r5, r9 \n\t"                  /* result1 = scaled src1 + scaled dst1 */
        /* ---------------------- */

        /* ====================== */

        /* src2, scaled by src_scale */
        "and r11, r12, r6, lsr #8 \n\t"        /* r11 = src2 ag pair, masked by r12 */
        "and r4, r12, r6 \n\t"                 /* r4 = src2 rb pair, masked by r12 */
        "mul r11, r11, %[alpha] \n\t"          /* ag = ag * src_scale */
        "mul r4, r4, %[alpha] \n\t"            /* rb = rb * src_scale */
        "and r11, r11, r12, lsl #8 \n\t"       /* ag masked with reverse mask 0xff00ff00 */
        "and r4, r12, r4, lsr #8 \n\t"         /* rb >> 8, masked by r12 */
        "orr r6, r11, r4 \n\t"                 /* r6 = scaled src2 */

        /* dst2, scaled by dst2_scale */
        "and r11, r12, r8, lsr #8 \n\t"        /* r11 = dst2 ag pair, masked by r12 */
        "and r4, r12, r8 \n\t"                 /* r4 = dst2 rb pair, masked by r12 */
        "mul r11, r11, r10 \n\t"               /* ag = ag * dst2_scale (r10) */
        "mul r4, r4, r10 \n\t"                 /* rb = rb * dst2_scale (r10) */
        "and r11, r11, r12, lsl #8 \n\t"       /* ag masked with reverse mask 0xff00ff00 */
        "and r4, r12, r4, lsr #8 \n\t"         /* rb >> 8, masked by r12 */
        "orr r10, r11, r4 \n\t"                /* r10 = scaled dst2 */

        "sub %[count], %[count], #2 \n\t"      /* decrease count by 2 */
        /* ---------------------- */
        "add r10, r6, r10 \n\t"                /* result2 = scaled src2 + scaled dst2 */
        /* ---------------------- */
        "cmp %[count], #1 \n\t"                /* compare count with 1 */
        /* ----------------- */
        "stm %[dst]!, {r9, r10} \n\t"          /* store r9, r10 to *dst, dst += 2 pixels */
        /* ----------------- */

        "bgt 1b \n\t"                          /* if %[count] greater than 1 reloop */
        "blt 3f \n\t"                          /* if %[count] less than 1 exit */
        /* else fall through into the single loop */
        /* Single Loop: exactly one pixel left */
        "2: \n\t"                              /* <single loop> */
        "ldr r5, [%[src]], #4 \n\t"            /* r5 = *src++ */
        "ldr r7, [%[dst]] \n\t"                /* r7 = *dst */

        "lsr r6, r5, #24 \n\t"                 /* r6 = source alpha */
        "and r8, r12, r5, lsr #8 \n\t"         /* r8 = src ag pair, masked by r12 */
#ifdef SK_ARM_HAS_EDSP
        "smulbb r6, r6, %[alpha] \n\t"         /* r6 = SkMulS16(r6, src_scale) */
#else
        "mul r6, r6, %[alpha] \n\t"            /* r6 = SkMulS16(r6, src_scale) */
#endif
        "and r9, r12, r5 \n\t"                 /* r9 = src rb pair, masked by r12 */
        "lsr r6, r6, #8 \n\t"                  /* r6 >>= 8 */
        "mul r8, r8, %[alpha] \n\t"            /* ag = ag * src_scale */
        "rsb r6, r6, #256 \n\t"                /* dst_scale = r6 = 256 - r6 */

        /* src, scaled by src_scale */
        "mul r9, r9, %[alpha] \n\t"            /* rb = rb * src_scale */
        "and r8, r8, r12, lsl #8 \n\t"         /* ag masked with reverse mask 0xff00ff00 */
        "and r9, r12, r9, lsr #8 \n\t"         /* rb >> 8, masked by r12 */
        "orr r10, r8, r9 \n\t"                 /* r10 = scaled src */

        /* dst, scaled by dst_scale */
        "and r8, r12, r7, lsr #8 \n\t"         /* r8 = dst ag pair, masked by r12 */
        "and r9, r12, r7 \n\t"                 /* r9 = dst rb pair, masked by r12 */
        "mul r8, r8, r6 \n\t"                  /* ag = ag * dst_scale (r6) */
        "mul r9, r9, r6 \n\t"                  /* rb = rb * dst_scale (r6) */
        "and r8, r8, r12, lsl #8 \n\t"         /* ag masked with reverse mask 0xff00ff00 */
        "and r9, r12, r9, lsr #8 \n\t"         /* rb >> 8, masked by r12 */
        "orr r7, r8, r9 \n\t"                  /* r7 = scaled dst */

        "add r10, r7, r10 \n\t"                /* result = scaled src + scaled dst */

        /* ----------------- */
        "str r10, [%[dst]], #4 \n\t"           /* *dst++ = r10 */
        /* ----------------- */

        "3: \n\t"                              /* <exit> */
        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
        :
        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
    );

}

///////////////////////////////////////////////////////////////////////////////

// Platform 565 blit procs, indexed by the SkBlitRow flag combinations.
// NULL entries fall back to the portable default implementation.
static const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm[] = {
    // no dither
    // NOTE: For the functions below, we don't have a special version
    // that assumes that each source pixel is opaque. But our S32A is
    // still faster than the default, so use it.
    S32A_D565_Opaque,   // S32_D565_Opaque
    NULL,               // S32_D565_Blend
    S32A_D565_Opaque,   // S32A_D565_Opaque
    NULL,               // S32A_D565_Blend

    // dither
    NULL,   // S32_D565_Opaque_Dither
    NULL,   // S32_D565_Blend_Dither
    NULL,   // S32A_D565_Opaque_Dither
    NULL,   // S32A_D565_Blend_Dither
};

// Platform 32-bit blit procs; NULL entries use the portable default.
static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
    NULL,                       // S32_Opaque,
    NULL,                       // S32_Blend,
    S32A_Opaque_BlitRow32_arm,  // S32A_Opaque,
    S32A_Blend_BlitRow32_arm    // S32A_Blend
};

#endif // USE_ARM_CODE

// SK_ARM_NEON_WRAP selects between the *_arm and *_neon variants
// according to the NEON build configuration (see SkUtilsArm.h).
SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags];
}

///////////////////////////////////////////////////////////////////////////////
// No plain-ARM Color32 specialization; only the NEON variant (when built)
// is wrapped in, otherwise the caller falls back to the default.
#define Color32_arm NULL
SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
    return SK_ARM_NEON_WRAP(Color32_arm);
}

// No platform-specific mask-blit procs on ARM: returning NULL makes
// SkBlitMask use its portable implementations.
SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
                                                     SkMask::Format maskFormat,
                                                     SkColor color) {
    return NULL;
}

SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
    return NULL;
}

SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
                                                 SkMask::Format maskFormat,
                                                 RowFlags flags) {
    return NULL;
}