mmx_blend.S revision 55d9ee83b4c29e8f7c373ee6326bbb4f77402bee
/*
 * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
 *
 * MMX building blocks for 8-bit-per-channel RGBA blending.
 *
 * Conventions used by every macro in this file:
 *
 *  - operands are written "source, destination", as the per-line comments
 *    show (e.g. PSRLW ( CONST(8), MA1 ) shifts MA1 in place)
 *  - each macro handles one pixel with the plain lines and a second pixel
 *    with the lines wrapped in TWO(); ONE() wraps lines meant for the single
 *    pixel case only.  ONE and TWO are not defined here -- presumably they
 *    are supplied by "mmx_blendtmp.h", which is included at the bottom
 *  - p is the pixel read through the rgba pointer, q the pixel read through
 *    the dest pointer, pa the alpha of p replicated over the four words and
 *    s the result
 */

#include "matypes.h"


/* integer multiplication - alpha plus one
 *
 * makes the following approximation to the division (Sree)
 *
 *   rgb*a/255 ~= (rgb*(a+1)) >> 8
 *
 * which is the fastest method that satisfies the following OpenGL criteria
 *
 *   0*0 = 0 and 255*255 = 255
 *
 * note that MX1 is a register with 0xffffffffffffffff constant which can be easily obtained making
 *
 *   PCMPEQW    ( MX1, MX1 )
 *
 * (subtracting that all-ones constant, i.e. -1 per word, is what adds the one)
 *
 * the result is left in MA1 (and MA2)
 */
#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
    PSUBW      ( MX1, MA1 )                    /*   a1 + 1  |   a1 + 1  |   a1 + 1  |   a1 + 1  */ ;\
TWO(PSUBW      ( MX1, MA2 ))                   /*   a2 + 1  |   a2 + 1  |   a2 + 1  |   a2 + 1  */ ;\
                                                                                                ;\
    PMULLW     ( MP1, MA1 )                    /*                  t1 = p1*a1                   */ ;\
TWO(PMULLW     ( MP2, MA2 ))                   /*                  t2 = p2*a2                   */ ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MA1 )               /*               t1 >> 8 ~= t1/255               */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))              /*               t2 >> 8 ~= t2/255               */


/* integer multiplication - geometric series
 *
 * takes the geometric series approximation to the division
 *
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 * in this case just the first two terms to fit in 16bit arithmetic
 *
 *   t/255 ~= (t + (t >> 8)) >> 8
 *
 * note that just by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
 * so the special case a = 255 must be accounted for or roundoff must be used
 *
 * MP1 (and MP2) are overwritten as scratch; the result is left in MA1 (and MA2)
 */
#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
    PMULLW     ( MP1, MA1 )                    /*                  t1 = p1*a1                   */ ;\
TWO(PMULLW     ( MP2, MA2 ))                   /*                  t2 = p2*a2                   */ ;\
                                                                                                ;\
    MOVQ       ( MA1, MP1 )                                                                     ;\
TWO(MOVQ       ( MA2, MP2 ))                                                                    ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MP1 )               /*                    t1 >> 8                    */ ;\
TWO(PSRLW      ( CONST(8), MP2 ))              /*                    t2 >> 8                    */ ;\
                                                                                                ;\
    PADDW      ( MP1, MA1 )                    /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */ ;\
TWO(PADDW      ( MP2, MA2 ))                   /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */ ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MA1 )               /*    sa1    |    sb1    |    sg1    |    sr1    */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))              /*    sa2    |    sb2    |    sg2    |    sr2    */


/* integer multiplication - geometric series plus rounding
 *
 * when using a geometric series division instead of truncating the result
 * use roundoff in the approximation (Jim Blinn)
 *
 *   t = rgb*a + 0x80
 *
 * achieving the exact results
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 *
 * MP1 (and MP2) are overwritten as scratch; the result is left in MA1 (and MA2)
 */
#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
    PMULLW     ( MP1, MA1 )                    /*                  t1 = p1*a1                   */ ;\
TWO(PMULLW     ( MP2, MA2 ))                   /*                  t2 = p2*a2                   */ ;\
                                                                                                ;\
    PADDW      ( M80, MA1 )                    /*                  t1 += 0x80                   */ ;\
TWO(PADDW      ( M80, MA2 ))                   /*                  t2 += 0x80                   */ ;\
                                                                                                ;\
    MOVQ       ( MA1, MP1 )                                                                     ;\
TWO(MOVQ       ( MA2, MP2 ))                                                                    ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MP1 )               /*                    t1 >> 8                    */ ;\
TWO(PSRLW      ( CONST(8), MP2 ))              /*                    t2 >> 8                    */ ;\
                                                                                                ;\
    PADDW      ( MP1, MA1 )                    /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */ ;\
TWO(PADDW      ( MP2, MA2 ))                   /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */ ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MA1 )               /*    sa1    |    sb1    |    sg1    |    sr1    */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))              /*    sa2    |    sb2    |    sg2    |    sr2    */


/* linear interpolation - geometric series
 *
 * computes s ~= q + (p - q)*pa/255 per channel, keeping q pre-shifted by 8 so
 * that a single final shift yields the result
 *
 * MP1, MQ1 (and MP2, MQ2) are overwritten; the result is left in MA1 (and MA2)
 */
#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW      ( MQ1, MP1 )                    /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
TWO(PSUBW      ( MQ2, MP2 ))                   /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
                                                                                                ;\
    PSLLW      ( CONST(8), MQ1 )               /*                    q1 << 8                    */ ;\
TWO(PSLLW      ( CONST(8), MQ2 ))              /*                    q2 << 8                    */ ;\
                                                                                                ;\
    PMULLW     ( MP1, MA1 )                    /*              t1 = (p1 - q1)*pa1               */ ;\
TWO(PMULLW     ( MP2, MA2 ))                   /*              t2 = (p2 - q2)*pa2               */ ;\
                                                                                                ;\
    MOVQ       ( MA1, MP1 )                                                                     ;\
TWO(MOVQ       ( MA2, MP2 ))                                                                    ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MP1 )               /*                    t1 >> 8                    */ ;\
TWO(PSRLW      ( CONST(8), MP2 ))              /*                    t2 >> 8                    */ ;\
                                                                                                ;\
    PADDW      ( MP1, MA1 )                    /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */ ;\
TWO(PADDW      ( MP2, MA2 ))                   /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */ ;\
                                                                                                ;\
    PADDW      ( MQ1, MA1 )                    /*              (t1/255 + q1) << 8               */ ;\
TWO(PADDW      ( MQ2, MA2 ))                   /*              (t2/255 + q2) << 8               */ ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MA1 )               /*    sa1    |    sb1    |    sg1    |    sr1    */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))              /*    sa2    |    sb2    |    sg2    |    sr2    */


/* linear interpolation - geometric series with roundoff
 *
 * this is a generalization of Blinn's formula to signed arithmetic
 *
 * the sign bit of (p - q) is isolated and turned into 0x100, which is
 * subtracted from t when q > p, before the usual 0x80 roundoff is added
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 *
 * MP1, MQ1 (and MP2, MQ2) are overwritten; the result is left in MA1 (and MA2)
 */
#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
    PSUBW      ( MQ1, MP1 )                    /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
TWO(PSUBW      ( MQ2, MP2 ))                   /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
                                                                                                ;\
    PSLLW      ( CONST(8), MQ1 )               /*                    q1 << 8                    */ ;\
TWO(PSLLW      ( CONST(8), MQ2 ))              /*                    q2 << 8                    */ ;\
                                                                                                ;\
    PMULLW     ( MP1, MA1 )                    /*              t1 = (p1 - q1)*pa1               */ ;\
TWO(PMULLW     ( MP2, MA2 ))                   /*              t2 = (p2 - q2)*pa2               */ ;\
                                                                                                ;\
    PSRLW      ( CONST(15), MP1 )              /*                 q1 > p1 ? 1 : 0               */ ;\
TWO(PSRLW      ( CONST(15), MP2 ))             /*                 q2 > p2 ? 1 : 0               */ ;\
                                                                                                ;\
    PSLLW      ( CONST(8), MP1 )               /*             q1 > p1 ? 0x100 : 0               */ ;\
TWO(PSLLW      ( CONST(8), MP2 ))              /*             q2 > p2 ? 0x100 : 0               */ ;\
                                                                                                ;\
    PSUBW      ( MP1, MA1 )                    /*                  t1 -=? 0x100                 */ ;\
TWO(PSUBW      ( MP2, MA2 ))                   /*                  t2 -=? 0x100                 */ ;\
                                                                                                ;\
    PADDW      ( M80, MA1 )                    /*                  t1 += 0x80                   */ ;\
TWO(PADDW      ( M80, MA2 ))                   /*                  t2 += 0x80                   */ ;\
                                                                                                ;\
    MOVQ       ( MA1, MP1 )                                                                     ;\
TWO(MOVQ       ( MA2, MP2 ))                                                                    ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MP1 )               /*                    t1 >> 8                    */ ;\
TWO(PSRLW      ( CONST(8), MP2 ))              /*                    t2 >> 8                    */ ;\
                                                                                                ;\
    PADDW      ( MP1, MA1 )                    /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */ ;\
TWO(PADDW      ( MP2, MA2 ))                   /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */ ;\
                                                                                                ;\
    PADDW      ( MQ1, MA1 )                    /*              (t1/255 + q1) << 8               */ ;\
TWO(PADDW      ( MQ2, MA2 ))                   /*              (t2/255 + q2) << 8               */ ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MA1 )               /*    sa1    |    sb1    |    sg1    |    sr1    */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))              /*    sa2    |    sb2    |    sg2    |    sr2    */


/* linear interpolation - geometric series with correction
 *
 * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
 *
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
 *
 * note that although it is faster than rounding off it doesn't always give the exact results
 *
 * the (t >> 15) term is obtained by shifting the already computed (t >> 8)
 * by a further 7 bits
 *
 * MP1, MQ1 (and MP2, MQ2) are overwritten; the result is left in MA1 (and MA2)
 */
#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW      ( MQ1, MP1 )                    /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
TWO(PSUBW      ( MQ2, MP2 ))                   /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
                                                                                                ;\
    PSLLW      ( CONST(8), MQ1 )               /*                    q1 << 8                    */ ;\
TWO(PSLLW      ( CONST(8), MQ2 ))              /*                    q2 << 8                    */ ;\
                                                                                                ;\
    PMULLW     ( MP1, MA1 )                    /*              t1 = (p1 - q1)*pa1               */ ;\
TWO(PMULLW     ( MP2, MA2 ))                   /*              t2 = (p2 - q2)*pa2               */ ;\
                                                                                                ;\
    MOVQ       ( MA1, MP1 )                                                                     ;\
TWO(MOVQ       ( MA2, MP2 ))                                                                    ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MP1 )               /*                    t1 >> 8                    */ ;\
TWO(PSRLW      ( CONST(8), MP2 ))              /*                    t2 >> 8                    */ ;\
                                                                                                ;\
    PADDW      ( MP1, MA1 )                    /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */ ;\
TWO(PADDW      ( MP2, MA2 ))                   /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */ ;\
                                                                                                ;\
    PSRLW      ( CONST(7), MP1 )               /*                    t1 >> 15                   */ ;\
TWO(PSRLW      ( CONST(7), MP2 ))              /*                    t2 >> 15                   */ ;\
                                                                                                ;\
    PADDW      ( MP1, MA1 )                    /*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */ ;\
TWO(PADDW      ( MP2, MA2 ))                   /*  t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8  */ ;\
                                                                                                ;\
    PADDW      ( MQ1, MA1 )                    /*              (t1/255 + q1) << 8               */ ;\
TWO(PADDW      ( MQ2, MA2 ))                   /*              (t2/255 + q2) << 8               */ ;\
                                                                                                ;\
    PSRLW      ( CONST(8), MA1 )               /*    sa1    |    sb1    |    sg1    |    sr1    */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))              /*    sa2    |    sb2    |    sg2    |    sr2    */


/* common blending initialization code
 *
 * the disabled branch additionally loads the 0x0080008000800080 roundoff
 * constant needed by the *_GSR macros; both instructions must stay inside
 * the macro body, hence the line continuation after the first one
 */
#if 0 /* rounding not used */
    SEG_DATA

ALIGNDATA8
const_80:
    D_LONG 0x00800080, 0x00800080

#define GMB_INIT( M00, M80 ) \
    PXOR       ( M00, M00 )                    /*   0x0000  |   0x0000  |   0x0000  |   0x0000  */ ;\
    MOVQ       ( CONTENT(const_80), M80 )      /*   0x0080  |   0x0080  |   0x0080  |   0x0080  */

#else

#define GMB_INIT( M00 ) \
    PXOR       ( M00, M00 )                    /*   0x0000  |   0x0000  |   0x0000  |   0x0000  */

#endif

/* common blending loading code
 *
 * loads p through rgba and q through dest, widens every channel from a byte
 * to a word and replicates the alpha word of p into MA1 (and MA2)
 *
 * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making
 *
 *   PXOR      ( M00, M00 )
 */
#define GMB_LOAD(rgba, dest, MP1, MQ1, MA1, MP2, MQ2, MA2, M00) \
ONE(MOVD       ( REGIND(rgba), MP1 ))          /*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
ONE(MOVD       ( REGIND(dest), MQ1 ))          /*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
                                                                                                ;\
TWO(MOVQ       ( REGIND(rgba), MP1 ))          /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\
TWO(MOVQ       ( REGIND(dest), MQ1 ))          /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
                                                                                                ;\
TWO(MOVQ       ( MP1, MP2 ))                                                                    ;\
TWO(MOVQ       ( MQ1, MQ2 ))                                                                    ;\
                                                                                                ;\
    PUNPCKLBW  ( M00, MQ1 )                    /*    qa1    |    qb1    |    qg1    |    qr1    */ ;\
TWO(PUNPCKHBW  ( M00, MQ2 ))                   /*    qa2    |    qb2    |    qg2    |    qr2    */ ;\
    PUNPCKLBW  ( M00, MP1 )                    /*    pa1    |    pb1    |    pg1    |    pr1    */ ;\
TWO(PUNPCKHBW  ( M00, MP2 ))                   /*    pa2    |    pb2    |    pg2    |    pr2    */ ;\
                                                                                                ;\
    MOVQ       ( MP1, MA1 )                                                                     ;\
TWO(MOVQ       ( MP2, MA2 ))                                                                    ;\
                                                                                                ;\
    PUNPCKHWD  ( MA1, MA1 )                    /*    pa1    |    pa1    |           |           */ ;\
TWO(PUNPCKHWD  ( MA2, MA2 ))                   /*    pa2    |    pa2    |           |           */ ;\
    PUNPCKHDQ  ( MA1, MA1 )                    /*    pa1    |    pa1    |    pa1    |    pa1    */ ;\
TWO(PUNPCKHDQ  ( MA2, MA2 ))                   /*    pa2    |    pa2    |    pa2    |    pa2    */


/* common blending storing code
 *
 * packs the word results back to bytes with unsigned saturation and writes
 * one pixel (ONE) or two pixels (TWO) through rgba
 */
#define GMB_STORE(rgba, MA1, MA2) \
    PACKUSWB   ( MA2, MA1 )                    /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
                                                                                                ;\
ONE(MOVD       ( MA1, REGIND(rgba) ))                                                           ;\
TWO(MOVQ       ( MA1, REGIND(rgba) ))


    SEG_TEXT


/* common transparency blending mode
 *
 * TAG, INIT and MAIN parameterize the template included below: INIT is the
 * one-off setup (zero constant in MM0) and MAIN is the load / interpolate /
 * store sequence, here using the geometric series with correction
 */

#define TAG(x) x##_transparency

#define INIT \
    GMB_INIT( MM0 )

#define MAIN \
    GMB_LOAD( EDI, ESI, MM1, MM2, MM3, MM4, MM5, MM6, MM0)                                      ;\
    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 )                                                ;\
    GMB_STORE( EDI, MM3, MM6 )

#include "mmx_blendtmp.h"
