;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
;
; Conventions shared by every routine in this file:
;   * 32-bit code (BITS 32); the four arguments are read from the stack at
;     offsets +8, +12, +16 and +20 from the frame base.
;   * esi, edi (and ebx where it is used) are saved and restored; eax, ecx
;     and edx are used as scratch and are not preserved.
;   * MMX registers are used freely; "emms" is executed after the row loop.
;     The early-exit paths (zero width / zero row count) jump straight to
;     .return, before any MMX instruction has been executed.
;   * EXTN, pushpic/poppic, movpic, get_GOT, GOTOFF, alignx, alignz and the
;     SIZEOF_* / type-name symbols come from jsimdext.inc (not shown here).

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_mmx)

; Word constants used by the fancy (triangle-filter) upsamplers.
; PW_THREE is the centre-tap weight; PW_ONE/PW_TWO are the rounding terms
; of the h2v1 filter (>>2); PW_SEVEN/PW_EIGHT are those of the h2v2
; filter (>>4).

EXTN(jconst_fancy_upsample_mmx):

PW_ONE          times 4 dw  1
PW_TWO          times 4 dw  2
PW_THREE        times 4 dw  3
PW_SEVEN        times 4 dw  7
PW_EIGHT        times 4 dw  8

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Per input sample s[i] two output samples are produced:
;       out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;       out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; At the left edge s[-1] is taken to be s[0]; at the right edge the sample
; following the last one is a copy of the last one.
;
; Register roles:
;       eax = colctr (remaining input columns, rounded up to SIZEOF_MMWORD)
;       ecx = rowctr
;       esi = input_data  / inptr  (inside the column loop)
;       edi = output_data / outptr (inside the column loop)
;       ebx = GOT address (for GOTOFF constant access)
;       mm0 = all zeros (unpack helper)
;       mm7 = last sample of the previous block, in byte position 0
;       mm6 = first sample of the next block, in byte position 7
;
; Buffer requirements visible from the code:
;   * When downsampled_width is not a multiple of SIZEOF_MMWORD, one dummy
;     sample is WRITTEN to the input row at index downsampled_width, so the
;     input rows must have room for it.
;   * Input is loaded, and output stored, in whole MMWORDs (one MMWORD in,
;     two MMWORDs out per step), so both row buffers must cover the width
;     rounded up to a multiple of SIZEOF_MMWORD input samples.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
        push    ebp
        mov     ebp,esp
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        mov     eax, JDIMENSION [downsamp_width(ebp)]   ; colctr
        test    eax,eax
        jz      near .return    ; nothing to do for a zero-width row

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      near .return    ; nothing to do for zero rows

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    eax                     ; colctr
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        ; If the width is not a whole number of MMWORDs, replicate the last
        ; real sample one position past the end so that the right-hand
        ; neighbour of the last sample is well defined.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    mm0,mm0                 ; mm0=(all 0's)
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=mask for byte 0
        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]       ; "sample -1" = sample 0

        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD        ; round colctr up
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; Last block of the row: "sample 8" = sample 7 of this block.
        pcmpeqb mm6,mm6
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm6=mask for byte 7
        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; Not the last block: "sample 8" = first sample of the next block.
        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2,mm1
        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)

        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)

        ; Carry this block's last sample into the next iteration.
        movq    mm7,mm1
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

        ; Widen bytes to words.
        movq    mm4,mm1
        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
        movq    mm5,mm2
        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
        movq    mm6,mm3
        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)

        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]      ; 3 * current
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm2,[GOTOFF(ebx,PW_ONE)]        ; left  neighbour + 1
        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
        paddw   mm3,[GOTOFF(ebx,PW_TWO)]        ; right neighbour + 2
        paddw   mm6,[GOTOFF(ebx,PW_TWO)]

        paddw   mm2,mm1
        paddw   mm5,mm4
        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
        paddw   mm3,mm1
        paddw   mm6,mm4
        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)

        ; Interleave even (low byte) and odd (high byte) results.
        psllw   mm3,BYTE_BIT
        psllw   mm6,BYTE_BIT
        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5

        sub     eax, byte SIZEOF_MMWORD
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop        ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last   ; exactly one block left

        pop     esi
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Each input row produces two output rows.  Processing is done in two
; passes per column block:
;   vertical:    Int0 = 3*row[0] + row[-1]      (upper output row)
;                Int1 = 3*row[0] + row[+1]      (lower output row)
;                The 16-bit intermediates are parked in the output rows
;                themselves until the horizontal pass overwrites them.
;   horizontal:  out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;                out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;
; Register roles inside the column loop:
;       eax = colctr            ecx = inptr1 (row above)
;       ebx = inptr0 (saved with pushpic while it holds the GOT address)
;       esi = inptr1 (row below)
;       edx = outptr0           edi = outptr1
;
; Stack frame: ebp is realigned to SIZEOF_MMWORD and the original frame
; base is stored at [ebp]; arguments are therefore read through edx
; (= original frame base), not through ebp.
;       wk(0), wk(1) = previous block's last Int0 / Int1 word (position 0)
;       wk(2), wk(3) = next block's first Int0 / Int1 word     (position 3)
;       gotptr       = saved GOT address
;
; Buffer requirements visible from the code:
;   * input_data[-1] and input_data[+1] row pointers are dereferenced, so
;     the rows above and below each processed row must be valid.
;   * As in the h2v1 case, a dummy sample is written at index
;     downsampled_width of all three input rows when the width is not a
;     multiple of SIZEOF_MMWORD, and all accesses are whole MMWORDs.
;   * rowctr is decremented by 2 per iteration while input_data advances by
;     one row and output_data by two.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          4
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        mov     edx,eax                         ; edx = original ebp
        mov     eax, JDIMENSION [downsamp_width(edx)]   ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
        mov     edi, POINTER [output_data_ptr(edx)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    eax                                     ; colctr
        push    ecx
        push    edi
        push    esi

        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; Replicate the last real sample of each of the three input rows one
        ; position past the end when the width is not a whole number of
        ; MMWORDs.  edx (outptr0) is borrowed as a byte scratch register.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        push    edx
        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     edx
.skip:
        ; -- process the first column block

        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm7=mask for word 0

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6

        ; Left edge: the "previous" intermediate is intermediate 0 itself.
        pand    mm1,mm7                 ; mm1=( 0 - - -)
        pand    mm2,mm7                 ; mm2=( 0 - - -)

        movq    MMWORD [wk(0)], mm1
        movq    MMWORD [wk(1)], mm2

        poppic  ebx

        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD        ; round colctr up
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; -- process the last column block

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; Right edge: the "next" intermediate is intermediate 7 of this block.
        pcmpeqb mm1,mm1
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=mask for word 3
        movq    mm2,mm1

        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; -- process the next column block

        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6

        ; First intermediate of the next block becomes "intermediate 8" of
        ; the block about to be filtered.
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

.upsample:
        ; -- process the upper row

        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

        movq    mm0,mm7
        movq    mm4,mm3
        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
        movq    mm5,mm7
        movq    mm6,mm3
        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)

        por     mm0,mm4                         ; mm0=( 1 2 3 4)
        por     mm5,mm6                         ; mm5=( 3 4 5 6)

        movq    mm1,mm7
        movq    mm2,mm3
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
        movq    mm4,mm3
        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)

        movq    MMWORD [wk(0)], mm4             ; carry word 7 to next block

        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm7
        paddw   mm5,mm3
        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
        paddw   mm0,mm7
        paddw   mm2,mm3
        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)

        psllw   mm0,BYTE_BIT
        psllw   mm2,BYTE_BIT
        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5

        ; -- process the lower row

        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

        movq    mm7,mm6
        movq    mm3,mm4
        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
        movq    mm0,mm6
        movq    mm2,mm4
        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)

        por     mm7,mm3                         ; mm7=( 1 2 3 4)
        por     mm0,mm2                         ; mm0=( 3 4 5 6)

        movq    mm1,mm6
        movq    mm5,mm4
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
        movq    mm3,mm4
        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)

        movq    MMWORD [wk(1)], mm3             ; carry word 7 to next block

        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm6
        paddw   mm0,mm4
        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
        paddw   mm7,mm6
        paddw   mm5,mm4
        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)

        psllw   mm7,BYTE_BIT
        psllw   mm5,BYTE_BIT
        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0

        poppic  ebx

        sub     eax, byte SIZEOF_MMWORD
        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop        ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last   ; exactly one block left

        pop     esi
        pop     edi
        pop     ecx
        pop     eax

        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp                         ; esp <- aligned ebp
        pop     esp                             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is simply duplicated horizontally (punpck?bw of a
; register with itself).
;
; Register roles:
;       edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;       eax = colctr (output samples still to be written in this row)
;       ecx = rowctr
;       esi = input_data / inptr,  edi = output_data / outptr
;
; The column loop is unrolled twice: each half reads one MMWORD and writes
; two.  Because the width is rounded up, reads and writes may extend past
; output_width up to the next multiple of 2*SIZEOF_MMWORD output samples;
; the row buffers must cover that.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)    ; round up; ZF set if zero
        jz      short .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr
        mov     eax,edx                         ; colctr
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0               ; duplicate samples 0..3
        punpckhbw mm1,mm1               ; duplicate samples 4..7

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Same horizontal duplication as jsimd_h2v1_upsample_mmx, but each result
; is stored to two consecutive output rows (outptr0 in ebx, outptr1 in
; edi).  Per iteration of the row loop, input_data advances by one row,
; output_data by two, and rowctr is decremented by 2.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)    ; round up; ZF set if zero
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]                     ; inptr
        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     eax,edx                                 ; colctr
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0               ; duplicate samples 0..3
        punpckhbw mm1,mm1               ; duplicate samples 4..7

        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16