/*
 * Copyright 2014 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifdef CRBUG_399842_FIXED

#if defined(__clang__) || (defined(__GNUC__) && !defined(SK_BUILD_FOR_MAC))

/*
 * Syntax: GNU assembler, AT&T operand order (source first, destination last),
 * run through the C preprocessor (the macros and #if blocks below need it).
 */

/*
 * EXTRACT_ALPHA(var1, var2)
 *
 * In:    var1  = four source pixels
 *        %xmm5 = four destination pixels
 *        %xmm6 = 256 in every 16-bit lane
 * Out:   %xmm4 = (256 - alpha) in every 16-bit lane, alpha taken from the top
 *                byte of the corresponding 32-bit pixel in var1
 *        %xmm3 = copy of %xmm5
 * Clobb: var2 (left holding the replicated alpha words)
 *
 * The two copies (%xmm6 -> %xmm4, %xmm5 -> %xmm3) are interleaved with the
 * shuffles; neither depends on var2.
 */
#define EXTRACT_ALPHA(var1, var2) \
    movdqa      %var1, %var2;           /* Clone source pixels to extract alpha */\
    psrlw       $8, %var2;              /* Discard red and blue, leaving alpha and green */\
    pshufhw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (high) */\
    movdqa      %xmm6, %xmm4;           \
    pshuflw     $0xF5, %var2, %var2;    /* Repeat alpha for scaling (low) */\
    movdqa      %xmm5, %xmm3;           \
    psubw       %var2, %xmm4            /* Finalize alpha calculations */

/*
 * SCALE_PIXELS
 *
 * In:    %xmm5 = four destination pixels, %xmm3 = copy of them,
 *        %xmm4 = per-lane scale (as produced by EXTRACT_ALPHA)
 * Out:   %xmm5 = scaled bytes 0 and 2 of each pixel, in the low byte of each word
 *        %xmm3 = scaled bytes 1 and 3 of each pixel, in the high byte of each word
 *
 * The callers merge the two halves with pblendvb using the 0x00FF00FF mask
 * held in %xmm0.
 */
#define SCALE_PIXELS \
    psllw       $8, %xmm5;              /* Filter out red and blue components */\
    pmulhuw     %xmm4, %xmm5;           /* Scale red and blue */\
    psrlw       $8, %xmm3;              /* Filter out alpha and green components */\
    pmullw      %xmm4, %xmm3            /* Scale alpha and green */


/*
 * void S32A_Opaque_BlitRow32_SSE4_asm(SkPMColor* SK_RESTRICT dst,
 *                                     const SkPMColor* SK_RESTRICT src,
 *                                     int count, U8CPU alpha)
 *
 * Blends 'count' 32-bit source pixels over the destination, four at a time:
 *     dst = src + dst * (256 - src_alpha) / 256     (per 8-bit channel)
 *
 * In:    %rdi = dst, %rsi = src, %edx = count (the first three integer
 *        argument registers of the System V AMD64 convention).
 *        The fourth argument (alpha) is not read: %ecx is overwritten with
 *        the pixel count on entry.
 * Out:   nothing; count <= 0 returns without touching memory.
 * Clobb: %rax, %rcx, %rdx, %rdi, %xmm0-%xmm7, flags. The stack is not used.
 *
 * Register roles after initialization:
 *     %rax  = source pointer         %rdx = destination pointer
 *     %rdi  = byte offset into both buffers (alignment byte count while aligning)
 *     %ecx  = remaining pixel count, biased by the number of pixels already
 *             reserved for the current step (see the individual blocks)
 *     %xmm7 = 0xFF000000 alpha mask, %xmm6 = 16-bit 256s, %xmm0 = 0x00FF00FF mask
 *
 * NOTE(review): the 16+ pixel path derives the number of alignment pixels
 * from (-dst & 0xF) and then uses aligned stores, so it assumes dst is at
 * least 4-byte aligned - confirm against callers.
 *
 * This function is divided into six blocks: initialization, blit 4-15 pixels,
 * blit 0-3 pixels, align destination for 16+ pixel blits,
 * blit 16+ pixels with source unaligned, blit 16+ pixels with source aligned.
 * There is some code reuse between the blocks.
 *
 * The primary optimization comes from checking the source pixels' alpha value.
 * If the alpha is zero, the pixel can be skipped entirely.
 * If the alpha is fully opaque, the pixel can be copied directly to the destination.
 * According to collected statistics, these two cases are the most common.
 * The main loop(s) use pre-loading and unrolling in an attempt to reduce the
 * worst-case memory latency.
 */

#ifdef __clang__
    .text
#else
    .section .text.sse4.2,"ax",@progbits
    .type S32A_Opaque_BlitRow32_SSE4_asm, @function
#endif
    .p2align 4
#if defined(SK_BUILD_FOR_MAC)
    .global _S32A_Opaque_BlitRow32_SSE4_asm
    .private_extern _S32A_Opaque_BlitRow32_SSE4_asm
_S32A_Opaque_BlitRow32_SSE4_asm:
#else
    .global S32A_Opaque_BlitRow32_SSE4_asm
    .hidden S32A_Opaque_BlitRow32_SSE4_asm
S32A_Opaque_BlitRow32_SSE4_asm:
#endif
    .cfi_startproc
    prefetcht0  (%rsi)
    movl        %edx, %ecx              // Pixel count
    movq        %rdi, %rdx              // Destination pointer
    movq        %rsi, %rax              // Source pointer

    // Setup SSE constants
    movdqa      .LAlphaCheckMask(%rip), %xmm7  // 0xFF000000 mask to check alpha
    movdqa      .LInverseAlphaCalc(%rip), %xmm6// 16-bit 256 to calculate inv. alpha
    movdqa      .LResultMergeMask(%rip), %xmm0 // 0x00FF00FF mask (Must be in xmm0 because of pblendvb)

    // From here on %ecx = (pixels left) - 4, so the sign flag says whether a
    // full group of four pixels is still available.
    subl        $4, %ecx                // Check if we have only 0-3 pixels
    js          .LReallySmall
    cmpl        $11, %ecx               // Do we have enough pixels to run the main loop?
    ja          .LBigBlit

    // Handle small blits (4-15 pixels)
    ////////////////////////////////////////////////////////////////////////////////
    xorl        %edi, %edi              // Reset offset to zero (32-bit write clears all of %rdi)

.LSmallLoop:
    lddqu       (%rax, %rdi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LSmallAlphaNotOpaqueOrZero // CF=0 and ZF=0: mixed alphas
    jz          .LSmallAlphaZero        // ZF=1: all alphas zero, nothing to write
    movdqu      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LSmallAlphaZero:
    addq        $16, %rdi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    jns         .LSmallLoop
    jmp         .LSmallRemaining

    // Handle mixed alphas (calculate and scale)
    .p2align 4
.LSmallAlphaNotOpaqueOrZero:
    lddqu       (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx                // Check if there are four additional pixels, at least
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqu      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    jns         .LSmallLoop             // Flags still come from the subl; SSE ops leave them alone

    // Handle the last 0-3 pixels (also used by the main loops)
    // On entry %ecx = (pixels left) - 4, i.e. -4..-1. The final group is
    // processed as a full four-pixel vector that ends exactly at the end of
    // the buffers, overlapping pixels that were already blended; only the
    // not-yet-written pixels are merged into the destination.
.LSmallRemaining:
    cmpl        $-4, %ecx               // Check if we are done
    je          .LSmallExit
    sall        $2, %ecx                // Calculate offset for last pixels
    movslq      %ecx, %rcx
    addq        %rcx, %rdi              // Step back by 4, 8 or 12 bytes

    lddqu       (%rax, %rdi), %xmm1     // Load last four source pixels (overlapping)
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jc          .LSmallRemainingStoreAll// If all alphas are opaque, just store (overlapping)
    jz          .LSmallExit             // If all alphas are zero, skip the pixels completely

    // Handle mixed alphas (calculate and scale)
    // SCALE_PIXELS is not used here because %xmm5 must keep the unmodified
    // destination pixels for the pblendw merge below.
    lddqu       (%rdx, %rdi), %xmm5     // Load last four destination pixels (overlapping)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    cmpl        $-8, %ecx               // Check how many pixels should be written
    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together
    jb          .LSmallPixelsLeft1      // %ecx == -12: one pixel left
    ja          .LSmallPixelsLeft3      // To avoid double-blending the overlapping pixels...
    pblendw     $0xF0, %xmm1, %xmm5     // Merge only the final two pixels to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last two destination pixels
.LSmallExit:
    ret

.LSmallPixelsLeft1:
    pblendw     $0xC0, %xmm1, %xmm5     // Merge only the final pixel to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last destination pixel
    ret

.LSmallPixelsLeft3:
    pblendw     $0xFC, %xmm1, %xmm5     // Merge only the final three pixels to the destination
    movdqu      %xmm5, (%rdx, %rdi)     // Store last three destination pixels
    ret

.LSmallRemainingStoreAll:
    movdqu      %xmm1, (%rdx, %rdi)     // Store last destination pixels (overwrite)
    ret

    // Handle really small blits (0-3 pixels)
    ////////////////////////////////////////////////////////////////////////////////
    // Pixels are inserted/extracted one 32-bit lane at a time so that no
    // memory outside the 1-3 requested pixels is accessed.
.LReallySmall:
    addl        $4, %ecx                // Undo the bias: %ecx = pixel count again
    jle         .LReallySmallExit       // Nothing to do for count <= 0
    pcmpeqd     %xmm1, %xmm1            // All ones: lanes that are not loaded test as opaque
    cmpl        $2, %ecx                // Check how many pixels should be read
    pinsrd      $0x0, (%rax), %xmm1     // Load one source pixel
    pinsrd      $0x0, (%rdx), %xmm5     // Load one destination pixel
    jb          .LReallySmallCalc       // pinsrd leaves the cmpl flags intact
    pinsrd      $0x1, 4(%rax), %xmm1    // Load second source pixel
    pinsrd      $0x1, 4(%rdx), %xmm5    // Load second destination pixel
    je          .LReallySmallCalc
    pinsrd      $0x2, 8(%rax), %xmm1    // Load third source pixel
    pinsrd      $0x2, 8(%rdx), %xmm5    // Load third destination pixel

.LReallySmallCalc:
    ptest       %xmm7, %xmm1            // Check if all alphas are opaque
    jc          .LReallySmallStore      // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale)
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    pand        %xmm0, %xmm5            // Filter out red and blue components
    pmullw      %xmm4, %xmm5            // Scale red and blue
    psrlw       $8, %xmm3               // Filter out alpha and green components
    pmullw      %xmm4, %xmm3            // Scale alpha and green

    psrlw       $8, %xmm5               // Combine results
    pblendvb    %xmm5, %xmm3            // Mask in %xmm0, implicitly
    paddb       %xmm3, %xmm1            // Add source and destination pixels together

.LReallySmallStore:
    cmpl        $2, %ecx                // Check how many pixels should be written
    pextrd      $0x0, %xmm1, (%rdx)     // Store one destination pixel
    jb          .LReallySmallExit
    pextrd      $0x1, %xmm1, 4(%rdx)    // Store second destination pixel
    je          .LReallySmallExit
    pextrd      $0x2, %xmm1, 8(%rdx)    // Store third destination pixel
.LReallySmallExit:
    ret

    // Handle bigger blit operations (16+ pixels)
    ////////////////////////////////////////////////////////////////////////////////
    .p2align 4
.LBigBlit:
    // Align destination?
    testl       $0xF, %edx
    lddqu       (%rax), %xmm1           // Pre-load four source pixels
    jz          .LAligned               // lddqu does not touch the testl flags

    movq        %rdx, %rdi              // Calculate alignment of destination pointer
    negq        %rdi
    andl        $0xF, %edi              // %rdi = bytes up to the next 16-byte boundary

    // Handle 1-3 pixels to align destination
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    jz          .LAlignDone             // If all alphas are zero, just skip
    lddqu       (%rdx), %xmm5           // Load four destination pixels
    jc          .LAlignStore            // If all alphas are opaque, just store

    // Handle mixed alphas (calculate and scale)
    // As in the tail code, %xmm5 is kept intact for the pblendw merge.
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value

    psllw       $8, %xmm3               // Filter out red and blue components
    pmulhuw     %xmm4, %xmm3            // Scale red and blue
    movdqa      %xmm5, %xmm2
    psrlw       $8, %xmm2               // Filter out alpha and green components
    pmullw      %xmm4, %xmm2            // Scale alpha and green

    pblendvb    %xmm3, %xmm2            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm2, %xmm1            // Add source and destination pixels together

.LAlignStore:
    cmpl        $8, %edi                // Check how many pixels should be written
    jb          .LAlignPixelsLeft1
    ja          .LAlignPixelsLeft3
    pblendw     $0x0F, %xmm1, %xmm5     // Blend two pixels
    jmp         .LAlignStorePixels

.LAlignPixelsLeft1:
    pblendw     $0x03, %xmm1, %xmm5     // Blend one pixel
    jmp         .LAlignStorePixels

.LAlignPixelsLeft3:
    pblendw     $0x3F, %xmm1, %xmm5     // Blend three pixels

.LAlignStorePixels:
    movdqu      %xmm5, (%rdx)           // Store destination pixels

.LAlignDone:
    addq        %rdi, %rax              // Adjust pointers and pixel count
    addq        %rdi, %rdx
    shrq        $2, %rdi                // Bytes -> pixels
    lddqu       (%rax), %xmm1           // Pre-load new source pixels (after alignment)
    subl        %edi, %ecx

.LAligned:                              // Destination is guaranteed to be 16 byte aligned
    xorl        %edi, %edi              // Reset offset to zero (32-bit write clears all of %rdi)
    subl        $8, %ecx                // Decrease counter (Reserve four pixels for the cleanup)
    testl       $0xF, %eax              // Check alignment of source pointer
    jz          .LAlignedLoop

    // Source not aligned to destination
    ////////////////////////////////////////////////////////////////////////////////
    // Loop invariant: %xmm1 holds the four source pixels at offset %rdi
    // (already loaded), and %ecx = (pixels left from %rdi) - 12, so the
    // pre-loads at +16 and +32 stay inside the source buffer.
    .p2align 4
.LUnalignedLoop:                        // Main loop for unaligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero00
    lddqu       16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero00
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

.LAlphaZero00:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero01
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels

.LAlphaZero01:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop         // No borrow: at least eight more pixels beyond the reserve
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero00:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero01
    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero02
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels
.LAlphaZero02:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LUnalignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup0

    .p2align 4
.LAlphaNotOpaqueOrZero01:
    movdqa      16(%rdx, %rdi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    lddqu       32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    addq        $32, %rdi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%rdx, %rdi)  // Store four destination pixels
    jae         .LUnalignedLoop         // Flags from the subl above
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle pending pixels from loop
    // %xmm1 already holds the four source pixels at offset %rdi.
.LLoopCleanup0:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero03
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero03:
    addq        $16, %rdi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain0:
    lddqu       (%rax, %rdi), %xmm1     // Load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero02
    jz          .LAlphaZero04
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero04:
    addq        $16, %rdi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero02:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain0

    // Source aligned to destination
    ////////////////////////////////////////////////////////////////////////////////
    // Same structure as the unaligned loop, with aligned loads (movdqa) from
    // the source instead of lddqu.
    .p2align 4
.LAlignedLoop:                          // Main loop for aligned, handles eight pixels per iteration
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero10
    movdqa      16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    jz          .LAlphaZero10
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

.LAlphaZero10:
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero11
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels

.LAlphaZero11:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero10:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    movdqa      16(%rax, %rdi), %xmm2   // Pre-load four source pixels
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels

    // Handle next four pixels
    ptest       %xmm7, %xmm2            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero11
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels
    jz          .LAlphaZero12
    movdqa      %xmm2, 16(%rdx, %rdi)   // Store four destination pixels
.LAlphaZero12:
    addq        $32, %rdi               // Adjust offset and pixel count
    subl        $8, %ecx
    jae         .LAlignedLoop
    addl        $8, %ecx                // Adjust pixel count
    jmp         .LLoopCleanup1

    .p2align 4
.LAlphaNotOpaqueOrZero11:
    movdqa      16(%rdx, %rdi), %xmm5   // Load four destination pixels
    EXTRACT_ALPHA(xmm2, xmm1)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha
    movdqa      32(%rax, %rdi), %xmm1   // Pre-load four source pixels

    addq        $32, %rdi
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm2            // Add source and destination pixels together
    subl        $8, %ecx
    movdqa      %xmm2, -16(%rdx, %rdi)  // Store four destination pixels
    jae         .LAlignedLoop           // Flags from the subl above
    addl        $8, %ecx                // Adjust pixel count

    // Cleanup - handle four pending pixels from loop
.LLoopCleanup1:
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero13
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero13:
    addq        $16, %rdi
    subl        $4, %ecx
    js          .LSmallRemaining        // Reuse code from small loop

.LRemain1:
    movdqa      (%rax, %rdi), %xmm1     // Pre-load four source pixels
    ptest       %xmm7, %xmm1            // Check if all alphas are zero or opaque
    ja          .LAlphaNotOpaqueOrZero12
    jz          .LAlphaZero14
    movdqa      %xmm1, (%rdx, %rdi)     // Store four destination pixels
.LAlphaZero14:
    addq        $16, %rdi
    subl        $4, %ecx
    jmp         .LSmallRemaining        // Reuse code from small loop

.LAlphaNotOpaqueOrZero12:
    movdqa      (%rdx, %rdi), %xmm5     // Load four destination pixels
    EXTRACT_ALPHA(xmm1, xmm2)           // Extract and clone alpha value
    SCALE_PIXELS                        // Scale pixels using alpha

    addq        $16, %rdi
    subl        $4, %ecx
    pblendvb    %xmm5, %xmm3            // Combine results (mask in %xmm0, implicitly)
    paddb       %xmm3, %xmm1            // Add source and destination pixels together
    movdqa      %xmm1, -16(%rdx, %rdi)  // Store four destination pixels
    js          .LSmallRemaining        // Reuse code from small loop
    jmp         .LRemain1

    .cfi_endproc
#ifndef __clang__
    .size S32A_Opaque_BlitRow32_SSE4_asm, .-S32A_Opaque_BlitRow32_SSE4_asm
#endif

    // Constants for SSE code
    // NOTE(review): under __clang__ no section switch is made here, so the
    // tables are emitted into the same section as the code above.
    // They are 16-byte aligned because they are loaded with movdqa.
#ifndef __clang__
    .section .rodata
#endif
    .p2align 4
.LAlphaCheckMask:
    .long   0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000
.LInverseAlphaCalc:
    .word   256, 256, 256, 256, 256, 256, 256, 256
.LResultMergeMask:
    .long   0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF
#endif

#endif // CRBUG_399842_FIXED