SkBlitRow_opts_arm_neon.cpp revision fbfcd5602128ec010c82cb733c9cdc0a3254f9f3
1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "SkBlitRow_opts_arm.h" 9 10#include "SkBlitMask.h" 11#include "SkBlitRow.h" 12#include "SkColorPriv.h" 13#include "SkDither.h" 14#include "SkMathPriv.h" 15#include "SkUtils.h" 16 17#include "SkCachePreload_arm.h" 18 19#include <arm_neon.h> 20 21void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 22 const SkPMColor* SK_RESTRICT src, int count, 23 U8CPU alpha, int /*x*/, int /*y*/) { 24 SkASSERT(255 == alpha); 25 26 if (count >= 8) { 27 uint16_t* SK_RESTRICT keep_dst; 28 29 asm volatile ( 30 "ands ip, %[count], #7 \n\t" 31 "vmov.u8 d31, #1<<7 \n\t" 32 "vld1.16 {q12}, [%[dst]] \n\t" 33 "vld4.8 {d0-d3}, [%[src]] \n\t" 34 // Thumb does not support the standard ARM conditional 35 // instructions but instead requires the 'it' instruction 36 // to signal conditional execution 37 "it eq \n\t" 38 "moveq ip, #8 \n\t" 39 "mov %[keep_dst], %[dst] \n\t" 40 41 "add %[src], %[src], ip, LSL#2 \n\t" 42 "add %[dst], %[dst], ip, LSL#1 \n\t" 43 "subs %[count], %[count], ip \n\t" 44 "b 9f \n\t" 45 // LOOP 46 "2: \n\t" 47 48 "vld1.16 {q12}, [%[dst]]! \n\t" 49 "vld4.8 {d0-d3}, [%[src]]! \n\t" 50 "vst1.16 {q10}, [%[keep_dst]] \n\t" 51 "sub %[keep_dst], %[dst], #8*2 \n\t" 52 "subs %[count], %[count], #8 \n\t" 53 "9: \n\t" 54 "pld [%[dst],#32] \n\t" 55 // expand 0565 q12 to 8888 {d4-d7} 56 "vmovn.u16 d4, q12 \n\t" 57 "vshr.u16 q11, q12, #5 \n\t" 58 "vshr.u16 q10, q12, #6+5 \n\t" 59 "vmovn.u16 d5, q11 \n\t" 60 "vmovn.u16 d6, q10 \n\t" 61 "vshl.u8 d4, d4, #3 \n\t" 62 "vshl.u8 d5, d5, #2 \n\t" 63 "vshl.u8 d6, d6, #3 \n\t" 64 65 "vmovl.u8 q14, d31 \n\t" 66 "vmovl.u8 q13, d31 \n\t" 67 "vmovl.u8 q12, d31 \n\t" 68 69 // duplicate in 4/2/1 & 8pix vsns 70 "vmvn.8 d30, d3 \n\t" 71 "vmlal.u8 q14, d30, d6 \n\t" 72 "vmlal.u8 q13, d30, d5 \n\t" 73 "vmlal.u8 q12, d30, d4 \n\t" 74 "vshr.u16 q8, q14, #5 \n\t" 75 "vshr.u16 q9, q13, #6 \n\t" 76 "vaddhn.u16 d6, q14, q8 \n\t" 77 "vshr.u16 q8, q12, #5 \n\t" 78 "vaddhn.u16 d5, q13, q9 \n\t" 79 "vqadd.u8 d6, d6, d0 \n\t" // moved up 80 "vaddhn.u16 d4, q12, q8 \n\t" 81 // intentionally don't calculate alpha 82 // result in d4-d6 83 84 "vqadd.u8 d5, d5, d1 \n\t" 85 "vqadd.u8 d4, d4, d2 \n\t" 86 87 // pack 8888 {d4-d6} to 0565 q10 88 "vshll.u8 q10, d6, #8 \n\t" 89 "vshll.u8 q3, d5, #8 \n\t" 90 "vshll.u8 q2, d4, #8 \n\t" 91 "vsri.u16 q10, q3, #5 \n\t" 92 "vsri.u16 q10, q2, #11 \n\t" 93 94 "bne 2b \n\t" 95 96 "1: \n\t" 97 "vst1.16 {q10}, [%[keep_dst]] \n\t" 98 : [count] "+r" (count) 99 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 100 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 101 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 102 "d30","d31" 103 ); 104 } 105 else 106 { // handle count < 8 107 uint16_t* SK_RESTRICT keep_dst; 108 109 asm volatile ( 110 "vmov.u8 d31, #1<<7 \n\t" 111 "mov %[keep_dst], %[dst] \n\t" 112 113 "tst %[count], #4 \n\t" 114 "beq 14f \n\t" 115 "vld1.16 {d25}, [%[dst]]! \n\t" 116 "vld1.32 {q1}, [%[src]]! \n\t" 117 118 "14: \n\t" 119 "tst %[count], #2 \n\t" 120 "beq 12f \n\t" 121 "vld1.32 {d24[1]}, [%[dst]]! \n\t" 122 "vld1.32 {d1}, [%[src]]! \n\t" 123 124 "12: \n\t" 125 "tst %[count], #1 \n\t" 126 "beq 11f \n\t" 127 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 128 "vld1.32 {d0[1]}, [%[src]]! \n\t" 129 130 "11: \n\t" 131 // unzips achieve the same as a vld4 operation 132 "vuzpq.u16 q0, q1 \n\t" 133 "vuzp.u8 d0, d1 \n\t" 134 "vuzp.u8 d2, d3 \n\t" 135 // expand 0565 q12 to 8888 {d4-d7} 136 "vmovn.u16 d4, q12 \n\t" 137 "vshr.u16 q11, q12, #5 \n\t" 138 "vshr.u16 q10, q12, #6+5 \n\t" 139 "vmovn.u16 d5, q11 \n\t" 140 "vmovn.u16 d6, q10 \n\t" 141 "vshl.u8 d4, d4, #3 \n\t" 142 "vshl.u8 d5, d5, #2 \n\t" 143 "vshl.u8 d6, d6, #3 \n\t" 144 145 "vmovl.u8 q14, d31 \n\t" 146 "vmovl.u8 q13, d31 \n\t" 147 "vmovl.u8 q12, d31 \n\t" 148 149 // duplicate in 4/2/1 & 8pix vsns 150 "vmvn.8 d30, d3 \n\t" 151 "vmlal.u8 q14, d30, d6 \n\t" 152 "vmlal.u8 q13, d30, d5 \n\t" 153 "vmlal.u8 q12, d30, d4 \n\t" 154 "vshr.u16 q8, q14, #5 \n\t" 155 "vshr.u16 q9, q13, #6 \n\t" 156 "vaddhn.u16 d6, q14, q8 \n\t" 157 "vshr.u16 q8, q12, #5 \n\t" 158 "vaddhn.u16 d5, q13, q9 \n\t" 159 "vqadd.u8 d6, d6, d0 \n\t" // moved up 160 "vaddhn.u16 d4, q12, q8 \n\t" 161 // intentionally don't calculate alpha 162 // result in d4-d6 163 164 "vqadd.u8 d5, d5, d1 \n\t" 165 "vqadd.u8 d4, d4, d2 \n\t" 166 167 // pack 8888 {d4-d6} to 0565 q10 168 "vshll.u8 q10, d6, #8 \n\t" 169 "vshll.u8 q3, d5, #8 \n\t" 170 "vshll.u8 q2, d4, #8 \n\t" 171 "vsri.u16 q10, q3, #5 \n\t" 172 "vsri.u16 q10, q2, #11 \n\t" 173 174 // store 175 "tst %[count], #4 \n\t" 176 "beq 24f \n\t" 177 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 178 179 "24: \n\t" 180 "tst %[count], #2 \n\t" 181 "beq 22f \n\t" 182 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 183 184 "22: \n\t" 185 "tst %[count], #1 \n\t" 186 "beq 21f \n\t" 187 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 188 189 "21: \n\t" 190 : [count] "+r" (count) 191 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 192 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 193 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 194 "d30","d31" 195 ); 196 } 197} 198 199void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 200 const SkPMColor* SK_RESTRICT src, int count, 201 U8CPU alpha, int /*x*/, int /*y*/) { 202 203 U8CPU alpha_for_asm = alpha; 204 205 asm volatile ( 206 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 207 * the original in two respects: 208 * 1. The results have a few mismatches compared to the original code. These mismatches 209 * never exceed 1. It's possible to improve accuracy vs. a floating point 210 * implementation by introducing rounding right shifts (vrshr) for the final stage. 211 * Rounding is not present in the code below, because although results would be closer 212 * to a floating point implementation, the number of mismatches compared to the 213 * original code would be far greater. 214 * 2. On certain inputs, the original code can overflow, causing colour channels to 215 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 216 * to affect another. 217 */ 218 219#if 1 220 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 221 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 222#else 223 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 224#endif 225 "vmov.u16 q3, #255 \n\t" // set up constant 226 "movs r4, %[count], lsr #3 \n\t" // calc. count>>3 227 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 228 "beq 2f \n\t" // if count8 == 0, exit 229 "vmov.u16 q15, #0x1f \n\t" // set up blue mask 230 231 "1: \n\t" 232 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 233 "subs r4, r4, #1 \n\t" // decrement loop counter 234 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 235 // and deinterleave 236 237 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 238 "vand q10, q0, q15 \n\t" // extract blue 239 "vshr.u16 q8, q0, #11 \n\t" // extract red 240 "vshr.u16 q9, q9, #10 \n\t" // extract green 241 // dstrgb = {q8, q9, q10} 242 243 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 244 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 245 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 246 247 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 248 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 249 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 250 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 251 // srcrgba = {q11, q12, q13, q14} 252 253 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 254 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 255 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 256 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 257 258 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 259 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 260 // dst_scale = q2 261 262 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 263 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 264 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 265 266#if 1 267 // trying for a better match with SkDiv255Round(a) 268 // C alg is: a+=128; (a+a>>8)>>8 269 // we'll use just a rounding shift [q2 is available for scratch] 270 "vrshr.u16 q11, q11, #8 \n\t" // shift down red 271 "vrshr.u16 q12, q12, #8 \n\t" // shift down green 272 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue 273#else 274 // arm's original "truncating divide by 256" 275 "vshr.u16 q11, q11, #8 \n\t" // shift down red 276 "vshr.u16 q12, q12, #8 \n\t" // shift down green 277 "vshr.u16 q13, q13, #8 \n\t" // shift down blue 278#endif 279 280 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 281 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 282 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 283 284 "bne 1b \n\t" // if counter != 0, loop 285 "2: \n\t" // exit 286 287 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 288 : 289 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 290 ); 291 292 count &= 7; 293 if (count > 0) { 294 do { 295 SkPMColor sc = *src++; 296 if (sc) { 297 uint16_t dc = *dst; 298 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 299 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 300 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 301 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 302 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 303 } 304 dst += 1; 305 } while (--count != 0); 306 } 307} 308 309/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 310 * each dither value is spaced out into byte lanes, and repeated 311 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 312 * start of each row. 313 */ 314static const uint8_t gDitherMatrix_Neon[48] = { 315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 319 320}; 321 322void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 323 int count, U8CPU alpha, int x, int y) 324{ 325 /* select row and offset for dither array */ 326 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 327 328 /* rescale alpha to range 0 - 256 */ 329 int scale = SkAlpha255To256(alpha); 330 331 asm volatile ( 332 "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values 333 "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values 334 "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg 335 "vmov.i8 d29, #0x3f \n\t" // set up green mask 336 "vmov.i8 d28, #0x1f \n\t" // set up blue mask 337 "1: \n\t" 338 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb 339 "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5 340 "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6 341 "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5 342 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen 343 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen 344 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen 345 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result 346 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result 347 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result 348 "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits 349 "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits 350 "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits 351 // load 8 pixels from dst, extract rgb 352 "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels 353 "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits 354 "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes 355 "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red 356 "vand d17, d17, d29 \n\t" // and green with green mask 357 "vand d18, d18, d28 \n\t" // and blue with blue mask 358 "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes 359 // src = {d22 (r), d23 (g), d24 (b)} 360 // dst = {d16 (r), d17 (g), d18 (b)} 361 // subtract dst from src and widen 362 "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst 363 "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst 364 "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst 365 // multiply diffs by scale and shift 366 "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale 367 "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale 368 "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale 369 "subs %[count], %[count], #8 \n\t" // decrement loop counter 370 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow 371 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow 372 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow 373 // add dst to result 374 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red 375 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green 376 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue 377 // put result into 565 format 378 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue 379 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue 380 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result 381 "bgt 1b \n\t" // loop if count > 0 382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 383 : [dstart] "r" (dstart), [scale] "r" (scale) 384 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31" 385 ); 386 387 DITHER_565_SCAN(y); 388 389 while((count & 7) > 0) 390 { 391 SkPMColor c = *src++; 392 393 int dither = DITHER_VALUE(x); 394 int sr = SkGetPackedR32(c); 395 int sg = SkGetPackedG32(c); 396 int sb = SkGetPackedB32(c); 397 sr = SkDITHER_R32To565(sr, dither); 398 sg = SkDITHER_G32To565(sg, dither); 399 sb = SkDITHER_B32To565(sb, dither); 400 401 uint16_t d = *dst; 402 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 403 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 404 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 405 DITHER_INC_X(x); 406 count--; 407 } 408} 409 410void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 411 const SkPMColor* SK_RESTRICT src, 412 int count, U8CPU alpha) { 413 414 SkASSERT(255 == alpha); 415 if (count > 0) { 416 417 418 uint8x8_t alpha_mask; 419 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 421 alpha_mask = vld1_u8(alpha_mask_setup); 422 423 /* do the NEON unrolled code */ 424#define UNROLL 4 425 while (count >= UNROLL) { 426 uint8x8_t src_raw, dst_raw, dst_final; 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 428 429 /* get the source */ 430 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 431#if UNROLL > 2 432 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 433#endif 434 435 /* get and hold the dst too */ 436 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 437#if UNROLL > 2 438 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 439#endif 440 441 /* 1st and 2nd bits of the unrolling */ 442 { 443 uint8x8_t dst_cooked; 444 uint16x8_t dst_wide; 445 uint8x8_t alpha_narrow; 446 uint16x8_t alpha_wide; 447 448 /* get the alphas spread out properly */ 449 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 450#if 1 451 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 452 /* we collapsed (255-a)+1 ... */ 453 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 454#else 455 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); 456 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); 457#endif 458 459 /* spread the dest */ 460 dst_wide = vmovl_u8(dst_raw); 461 462 /* alpha mul the dest */ 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 464 dst_cooked = vshrn_n_u16(dst_wide, 8); 465 466 /* sum -- ignoring any byte lane overflows */ 467 dst_final = vadd_u8(src_raw, dst_cooked); 468 } 469 470#if UNROLL > 2 471 /* the 3rd and 4th bits of our unrolling */ 472 { 473 uint8x8_t dst_cooked; 474 uint16x8_t dst_wide; 475 uint8x8_t alpha_narrow; 476 uint16x8_t alpha_wide; 477 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 479#if 1 480 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 481 /* we collapsed (255-a)+1 ... */ 482 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 483#else 484 alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); 485 alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); 486#endif 487 488 /* spread the dest */ 489 dst_wide = vmovl_u8(dst_raw_2); 490 491 /* alpha mul the dest */ 492 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 493 dst_cooked = vshrn_n_u16(dst_wide, 8); 494 495 /* sum -- ignoring any byte lane overflows */ 496 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 497 } 498#endif 499 500 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 501#if UNROLL > 2 502 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 503#endif 504 505 src += UNROLL; 506 dst += UNROLL; 507 count -= UNROLL; 508 } 509#undef UNROLL 510 511 /* do any residual iterations */ 512 while (--count >= 0) { 513 *dst = SkPMSrcOver(*src, *dst); 514 src += 1; 515 dst += 1; 516 } 517 } 518} 519 520 521/* Neon version of S32_Blend_BlitRow32() 522 * portable version is in src/core/SkBlitRow_D32.cpp 523 */ 524void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 525 const SkPMColor* SK_RESTRICT src, 526 int count, U8CPU alpha) { 527 SkASSERT(alpha <= 255); 528 if (count > 0) { 529 uint16_t src_scale = SkAlpha255To256(alpha); 530 uint16_t dst_scale = 256 - src_scale; 531 532 /* run them N at a time through the NEON unit */ 533 /* note that each 1 is 4 bytes, each treated exactly the same, 534 * so we can work under that guise. We *do* know that the src&dst 535 * will be 32-bit aligned quantities, so we can specify that on 536 * the load/store ops and do a neon 'reinterpret' to get us to 537 * byte-sized (pun intended) pieces that we widen/multiply/shift 538 * we're limited at 128 bits in the wide ops, which is 8x16bits 539 * or a pair of 32 bit src/dsts. 540 */ 541 /* we *could* manually unroll this loop so that we load 128 bits 542 * (as a pair of 64s) from each of src and dst, processing them 543 * in pieces. This might give us a little better management of 544 * the memory latency, but my initial attempts here did not 545 * produce an instruction stream that looked all that nice. 546 */ 547#define UNROLL 2 548 while (count >= UNROLL) { 549 uint8x8_t src_raw, dst_raw, dst_final; 550 uint16x8_t src_wide, dst_wide; 551 552 /* get 64 bits of src, widen it, multiply by src_scale */ 553 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 554 src_wide = vmovl_u8(src_raw); 555 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ 556 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); 557 558 /* ditto with dst */ 559 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 560 dst_wide = vmovl_u8(dst_raw); 561 562 /* combine add with dst multiply into mul-accumulate */ 563 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 564 565 dst_final = vshrn_n_u16(dst_wide, 8); 566 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 567 568 src += UNROLL; 569 dst += UNROLL; 570 count -= UNROLL; 571 } 572 /* RBE: well, i don't like how gcc manages src/dst across the above 573 * loop it's constantly calculating src+bias, dst+bias and it only 574 * adjusts the real ones when we leave the loop. Not sure why 575 * it's "hoisting down" (hoisting implies above in my lexicon ;)) 576 * the adjustments to src/dst/count, but it does... 577 * (might be SSA-style internal logic... 578 */ 579 580#if UNROLL == 2 581 if (count == 1) { 582 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 583 } 584#else 585 if (count > 0) { 586 do { 587 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 588 src += 1; 589 dst += 1; 590 } while (--count > 0); 591 } 592#endif 593 594#undef UNROLL 595 } 596} 597 598/////////////////////////////////////////////////////////////////////////////// 599 600#undef DEBUG_OPAQUE_DITHER 601 602#if defined(DEBUG_OPAQUE_DITHER) 603static void showme8(char *str, void *p, int len) 604{ 605 static char buf[256]; 606 char tbuf[32]; 607 int i; 608 char *pc = (char*) p; 609 sprintf(buf,"%8s:", str); 610 for(i=0;i<len;i++) { 611 sprintf(tbuf, " %02x", pc[i]); 612 strcat(buf, tbuf); 613 } 614 SkDebugf("%s\n", buf); 615} 616static void showme16(char *str, void *p, int len) 617{ 618 static char buf[256]; 619 char tbuf[32]; 620 int i; 621 uint16_t *pc = (uint16_t*) p; 622 sprintf(buf,"%8s:", str); 623 len = (len / sizeof(uint16_t)); /* passed as bytes */ 624 for(i=0;i<len;i++) { 625 sprintf(tbuf, " %04x", pc[i]); 626 strcat(buf, tbuf); 627 } 628 SkDebugf("%s\n", buf); 629} 630#endif 631 632void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 633 const SkPMColor* SK_RESTRICT src, 634 int count, U8CPU alpha, int x, int y) { 635 SkASSERT(255 == alpha); 636 637#define UNROLL 8 638 639 if (count >= UNROLL) { 640 uint8x8_t dbase; 641 642#if defined(DEBUG_OPAQUE_DITHER) 643 uint16_t tmpbuf[UNROLL]; 644 int td[UNROLL]; 645 int tdv[UNROLL]; 646 int ta[UNROLL]; 647 int tap[UNROLL]; 648 uint16_t in_dst[UNROLL]; 649 int offset = 0; 650 int noisy = 0; 651#endif 652 653 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 654 dbase = vld1_u8(dstart); 655 656 do { 657 uint8x8_t sr, sg, sb, sa, d; 658 uint16x8_t dst8, scale8, alpha8; 659 uint16x8_t dst_r, dst_g, dst_b; 660 661#if defined(DEBUG_OPAQUE_DITHER) 662 /* calculate 8 elements worth into a temp buffer */ 663 { 664 int my_y = y; 665 int my_x = x; 666 SkPMColor* my_src = (SkPMColor*)src; 667 uint16_t* my_dst = dst; 668 int i; 669 670 DITHER_565_SCAN(my_y); 671 for(i=0;i<UNROLL;i++) { 672 SkPMColor c = *my_src++; 673 SkPMColorAssert(c); 674 if (c) { 675 unsigned a = SkGetPackedA32(c); 676 677 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 678 tdv[i] = DITHER_VALUE(my_x); 679 ta[i] = a; 680 tap[i] = SkAlpha255To256(a); 681 td[i] = d; 682 683 unsigned sr = SkGetPackedR32(c); 684 unsigned sg = SkGetPackedG32(c); 685 unsigned sb = SkGetPackedB32(c); 686 sr = SkDITHER_R32_FOR_565(sr, d); 687 sg = SkDITHER_G32_FOR_565(sg, d); 688 sb = SkDITHER_B32_FOR_565(sb, d); 689 690 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 691 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 692 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 693 // now src and dst expanded are in g:11 r:10 x:1 b:10 694 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 695 td[i] = d; 696 697 } else { 698 tmpbuf[i] = *my_dst; 699 ta[i] = tdv[i] = td[i] = 0xbeef; 700 } 701 in_dst[i] = *my_dst; 702 my_dst += 1; 703 DITHER_INC_X(my_x); 704 } 705 } 706#endif 707 708 /* source is in ABGR */ 709 { 710 register uint8x8_t d0 asm("d0"); 711 register uint8x8_t d1 asm("d1"); 712 register uint8x8_t d2 asm("d2"); 713 register uint8x8_t d3 asm("d3"); 714 715 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 716 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 717 : "r" (src) 718 ); 719 sr = d0; sg = d1; sb = d2; sa = d3; 720 } 721 722 /* calculate 'd', which will be 0..7 */ 723 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 724#if defined(SK_BUILD_FOR_ANDROID) 725 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 726 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 727#else 728 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 729#endif 730 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 731 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 732 733 /* sr = sr - (sr>>5) + d */ 734 /* watching for 8-bit overflow. d is 0..7; risky range of 735 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 736 * safe as long as we do ((sr-sr>>5) + d) */ 737 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 738 sr = vadd_u8(sr, d); 739 740 /* sb = sb - (sb>>5) + d */ 741 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 742 sb = vadd_u8(sb, d); 743 744 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 745 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 746 sg = vadd_u8(sg, vshr_n_u8(d,1)); 747 748 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 749 dst8 = vld1q_u16(dst); 750 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 751 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 752 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 753 754 /* blend */ 755#if 1 756 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 757 /* originally 255-sa + 1 */ 758 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 759#else 760 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 761 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 762#endif 763 764#if 1 765 /* combine the addq and mul, save 3 insns */ 766 scale8 = vshrq_n_u16(scale8, 3); 767 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 768 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 769 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 770#else 771 /* known correct, but +3 insns over above */ 772 scale8 = vshrq_n_u16(scale8, 3); 773 dst_b = vmulq_u16(dst_b, scale8); 774 dst_g = vmulq_u16(dst_g, scale8); 775 dst_r = vmulq_u16(dst_r, scale8); 776 777 /* combine */ 778 /* NB: vshll widens, need to preserve those bits */ 779 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 780 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 781 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 782#endif 783 784 /* repack to store */ 785 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 786 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 787 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 788 789 vst1q_u16(dst, dst8); 790 791#if defined(DEBUG_OPAQUE_DITHER) 792 /* verify my 8 elements match the temp buffer */ 793 { 794 int i, bad=0; 795 static int invocation; 796 797 for (i=0;i<UNROLL;i++) 798 if (tmpbuf[i] != dst[i]) bad=1; 799 if (bad) { 800 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 801 invocation, offset); 802 SkDebugf(" alpha 0x%x\n", alpha); 803 for (i=0;i<UNROLL;i++) 804 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 805 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 806 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 807 808 showme16("alpha8", &alpha8, sizeof(alpha8)); 809 showme16("scale8", &scale8, sizeof(scale8)); 810 showme8("d", &d, sizeof(d)); 811 showme16("dst8", &dst8, sizeof(dst8)); 812 showme16("dst_b", &dst_b, sizeof(dst_b)); 813 showme16("dst_g", &dst_g, sizeof(dst_g)); 814 showme16("dst_r", &dst_r, sizeof(dst_r)); 815 showme8("sb", &sb, sizeof(sb)); 816 showme8("sg", &sg, sizeof(sg)); 817 showme8("sr", &sr, sizeof(sr)); 818 819 /* cop out */ 820 return; 821 } 822 offset += UNROLL; 823 invocation++; 824 } 825#endif 826 827 dst += UNROLL; 828 src += UNROLL; 829 count -= UNROLL; 830 /* skip x += UNROLL, since it's unchanged mod-4 */ 831 } while (count >= UNROLL); 832 } 833#undef UNROLL 834 835 /* residuals */ 836 if (count > 0) { 837 DITHER_565_SCAN(y); 838 do { 839 SkPMColor c = *src++; 840 SkPMColorAssert(c); 841 if (c) { 842 unsigned a = SkGetPackedA32(c); 843 844 // dither and alpha are just temporary variables to work-around 845 // an ICE in debug. 846 unsigned dither = DITHER_VALUE(x); 847 unsigned alpha = SkAlpha255To256(a); 848 int d = SkAlphaMul(dither, alpha); 849 850 unsigned sr = SkGetPackedR32(c); 851 unsigned sg = SkGetPackedG32(c); 852 unsigned sb = SkGetPackedB32(c); 853 sr = SkDITHER_R32_FOR_565(sr, d); 854 sg = SkDITHER_G32_FOR_565(sg, d); 855 sb = SkDITHER_B32_FOR_565(sb, d); 856 857 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 858 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 859 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 860 // now src and dst expanded are in g:11 r:10 x:1 b:10 861 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 862 } 863 dst += 1; 864 DITHER_INC_X(x); 865 } while (--count != 0); 866 } 867} 868 869/////////////////////////////////////////////////////////////////////////////// 870 871/* 2009/10/27: RBE says "a work in progress"; debugging says ok; 872 * speedup untested, but ARM version is 26 insns/iteration and 873 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) 874 * which is 10x the native version; that's pure instruction counts, 875 * not accounting for any instruction or memory latencies. 876 */ 877 878#undef DEBUG_S32_OPAQUE_DITHER 879 880void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 881 const SkPMColor* SK_RESTRICT src, 882 int count, U8CPU alpha, int x, int y) { 883 SkASSERT(255 == alpha); 884 885#define UNROLL 8 886 if (count >= UNROLL) { 887 uint8x8_t d; 888 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 889 d = vld1_u8(dstart); 890 891 while (count >= UNROLL) { 892 uint8x8_t sr, sg, sb, sa; 893 uint16x8_t dr, dg, db, da; 894 uint16x8_t dst8; 895 896 /* source is in ABGR ordering (R == lsb) */ 897 { 898 register uint8x8_t d0 asm("d0"); 899 register uint8x8_t d1 asm("d1"); 900 register uint8x8_t d2 asm("d2"); 901 register uint8x8_t d3 asm("d3"); 902 903 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 904 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 905 : "r" (src) 906 ); 907 sr = d0; sg = d1; sb = d2; sa = d3; 908 } 909 /* XXX: if we want to prefetch, hide it in the above asm() 910 * using the gcc __builtin_prefetch(), the prefetch will 911 * fall to the bottom of the loop -- it won't stick up 912 * at the top of the loop, just after the vld4. 913 */ 914 915 /* sr = sr - (sr>>5) + d */ 916 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 917 dr = vaddl_u8(sr, d); 918 919 /* sb = sb - (sb>>5) + d */ 920 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 921 db = vaddl_u8(sb, d); 922 923 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 924 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 925 dg = vaddl_u8(sg, vshr_n_u8(d,1)); 926 /* XXX: check that the "d>>1" here is hoisted */ 927 928 /* pack high bits of each into 565 format (rgb, b is lsb) */ 929 dst8 = vshrq_n_u16(db, 3); 930 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 931 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); 932 933 /* store it */ 934 vst1q_u16(dst, dst8); 935 936#if defined(DEBUG_S32_OPAQUE_DITHER) 937 /* always good to know if we generated good results */ 938 { 939 int i, myx = x, myy = y; 940 DITHER_565_SCAN(myy); 941 for (i=0;i<UNROLL;i++) { 942 SkPMColor c = src[i]; 943 unsigned dither = DITHER_VALUE(myx); 944 uint16_t val = SkDitherRGB32To565(c, dither); 945 if (val != dst[i]) { 946 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 947 c, dither, val, dst[i], dstart[i]); 948 } 949 DITHER_INC_X(myx); 950 } 951 } 952#endif 953 954 dst += UNROLL; 955 src += UNROLL; 956 count -= UNROLL; 957 x += UNROLL; /* probably superfluous */ 958 } 959 } 960#undef UNROLL 961 962 /* residuals */ 963 if (count > 0) { 964 DITHER_565_SCAN(y); 965 do { 966 SkPMColor c = *src++; 967 SkPMColorAssert(c); 968 SkASSERT(SkGetPackedA32(c) == 255); 969 970 unsigned dither = DITHER_VALUE(x); 971 *dst++ = SkDitherRGB32To565(c, dither); 972 DITHER_INC_X(x); 973 } while (--count != 0); 974 } 975} 976 977void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 978 SkPMColor color) { 979 if (count <= 0) { 980 return; 981 } 982 983 if (0 == color) { 984 if (src != dst) { 985 memcpy(dst, src, count * sizeof(SkPMColor)); 986 } 987 return; 988 } 989 990 unsigned colorA = SkGetPackedA32(color); 991 if (255 == colorA) { 992 sk_memset32(dst, color, count); 993 } else { 994 unsigned scale = 256 - SkAlpha255To256(colorA); 995 996 if (count >= 8) { 997 // at the end of this assembly, count will have been decremented 998 // to a negative value. That is, if count mod 8 = x, it will be 999 // -8 +x coming out. 1000 asm volatile ( 1001 PLD128(src, 0) 1002 1003 "vdup.32 q0, %[color] \n\t" 1004 1005 PLD128(src, 128) 1006 1007 // scale numerical interval [0-255], so load as 8 bits 1008 "vdup.8 d2, %[scale] \n\t" 1009 1010 PLD128(src, 256) 1011 1012 "subs %[count], %[count], #8 \n\t" 1013 1014 PLD128(src, 384) 1015 1016 "Loop_Color32: \n\t" 1017 1018 // load src color, 8 pixels, 4 64 bit registers 1019 // (and increment src). 1020 "vld1.32 {d4-d7}, [%[src]]! \n\t" 1021 1022 PLD128(src, 384) 1023 1024 // multiply long by scale, 64 bits at a time, 1025 // destination into a 128 bit register. 1026 "vmull.u8 q4, d4, d2 \n\t" 1027 "vmull.u8 q5, d5, d2 \n\t" 1028 "vmull.u8 q6, d6, d2 \n\t" 1029 "vmull.u8 q7, d7, d2 \n\t" 1030 1031 // shift the 128 bit registers, containing the 16 1032 // bit scaled values back to 8 bits, narrowing the 1033 // results to 64 bit registers. 1034 "vshrn.i16 d8, q4, #8 \n\t" 1035 "vshrn.i16 d9, q5, #8 \n\t" 1036 "vshrn.i16 d10, q6, #8 \n\t" 1037 "vshrn.i16 d11, q7, #8 \n\t" 1038 1039 // adding back the color, using 128 bit registers. 1040 "vadd.i8 q6, q4, q0 \n\t" 1041 "vadd.i8 q7, q5, q0 \n\t" 1042 1043 // store back the 8 calculated pixels (2 128 bit 1044 // registers), and increment dst. 1045 "vst1.32 {d12-d15}, [%[dst]]! \n\t" 1046 1047 "subs %[count], %[count], #8 \n\t" 1048 "bge Loop_Color32 \n\t" 1049 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 1050 : [color] "r" (color), [scale] "r" (scale) 1051 : "cc", "memory", 1052 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1053 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" 1054 ); 1055 // At this point, if we went through the inline assembly, count is 1056 // a negative value: 1057 // if the value is -8, there is no pixel left to process. 1058 // if the value is -7, there is one pixel left to process 1059 // ... 1060 // And'ing it with 7 will give us the number of pixels 1061 // left to process. 1062 count = count & 0x7; 1063 } 1064 1065 while (count > 0) { 1066 *dst = color + SkAlphaMulQ(*src, scale); 1067 src += 1; 1068 dst += 1; 1069 count--; 1070 } 1071 } 1072} 1073 1074/////////////////////////////////////////////////////////////////////////////// 1075 1076const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1077 // no dither 1078 // NOTE: For the two functions below, we don't have a special version 1079 // that assumes that each source pixel is opaque. But our S32A is 1080 // still faster than the default, so use it. 1081 S32A_D565_Opaque_neon, // really S32_D565_Opaque 1082 S32A_D565_Blend_neon, // really S32_D565_Blend 1083 S32A_D565_Opaque_neon, 1084 S32A_D565_Blend_neon, 1085 1086 // dither 1087 S32_D565_Opaque_Dither_neon, 1088 S32_D565_Blend_Dither_neon, 1089 S32A_D565_Opaque_Dither_neon, 1090 NULL, // S32A_D565_Blend_Dither 1091}; 1092 1093const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = { 1094 // no dither 1095 NULL, // S32_D4444_Opaque, 1096 NULL, // S32_D4444_Blend, 1097 NULL, // S32A_D4444_Opaque, 1098 NULL, // S32A_D4444_Blend, 1099 1100 // dither 1101 NULL, // S32_D4444_Opaque_Dither, 1102 NULL, // S32_D4444_Blend_Dither, 1103 NULL, // S32A_D4444_Opaque_Dither, 1104 NULL, // S32A_D4444_Blend_Dither 1105}; 1106 1107const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1108 NULL, // S32_Opaque, 1109 S32_Blend_BlitRow32_neon, // S32_Blend, 1110 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1111 S32A_Blend_BlitRow32_arm // S32A_Blend 1112}; 1113