/* Exercises the x86-64 byte-masked store instructions MASKMOVQ (MMX
   registers) and MASKMOVDQU (SSE registers).  For each of ten
   pseudo-random operand pairs per instruction, one line is printed:
   the two operands in hex, then the destination buffer after the
   masked store. */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

typedef unsigned char UChar;
typedef unsigned int  UInt;

/* Deterministic pseudo-random generator (linear congruential step).
   The state is a function-local static starting at 0, so every run
   yields the same sequence.  Only the bits above bit 16 of the state
   are returned. */
static UInt randomUInt ( void )
{
   static UInt n = 0;
   /* From "Numerical Recipes in C" 2nd Edition */
   n = 1664525UL * n + 1013904223UL;
   return n >> 17;
}

/* Loads 8 bytes from regL into %mm1 and 8 bytes from regR into %mm2,
   points %rdi at a freshly allocated 8-byte buffer pre-filled with
   0x11, 0x22, ..., 0x88, executes "maskmovq %mm1,%mm2" and prints the
   resulting buffer as 16 hex digits (no trailing newline).
   In AT&T operand order the first register (%mm1, i.e. regL) is the
   byte mask and the second (%mm2, i.e. regR) supplies the data. */
void maskmovq_mmx ( UChar* regL, UChar* regR )
{
   int i;
   UChar* dst = malloc(8);
   assert(dst);
   for (i = 0; i < 8; i++)
      dst[i] = 17 * (i+1);
   __asm__ __volatile__(
      "emms\n\t"
      "movq (%0), %%mm1\n\t"
      "movq (%1), %%mm2\n\t"
      "movq %2, %%rdi\n\t"
      "maskmovq %%mm1,%%mm2\n\t"
      /* Leave MMX state again so that any later x87 code sees a clean
         register stack. */
      "emms"
      : /*out*/
      : /*in*/ "r"(regL), "r"(regR), "r"(&dst[0])
        /* mm1/mm2 are overwritten above, so the compiler must be told. */
      : /*trash*/ "rdi", "mm1", "mm2", "memory", "cc"
   );
   for (i = 0; i < 8; i++)
      printf("%02x", dst[i]);
   free(dst);
}

/* Loads 16 bytes from regL into %xmm1 and 16 bytes from regR into
   %xmm12 (unaligned loads), points %rdi at a freshly allocated 16-byte
   buffer pre-filled with 0x00 .. 0x0f, executes
   "maskmovdqu %xmm12,%xmm1" followed by sfence, and prints the
   resulting buffer as 32 hex digits (no trailing newline).
   In AT&T operand order the first register (%xmm12, i.e. regR) is the
   byte mask and the second (%xmm1, i.e. regL) supplies the data. */
void maskmovdqu_sse ( UChar* regL, UChar* regR )
{
   int i;
   UChar* dst = malloc(16);
   assert(dst);
   for (i = 0; i < 16; i++)
      dst[i] = i;
   __asm__ __volatile__(
      "movups (%0), %%xmm1\n\t"
      "movups (%1), %%xmm12\n\t"
      "movq %2, %%rdi\n\t"
      "maskmovdqu %%xmm12,%%xmm1\n\t"
      "sfence"
      : /*out*/
      : /*in*/ "r"(regL), "r"(regR), "r"(dst)
        /* xmm1/xmm12 are overwritten above, so the compiler must be told. */
      : /*trash*/ "rdi", "xmm1", "xmm12", "memory", "cc"
   );
   for (i = 0; i < 16; i++)
      printf("%02x", dst[i]);
   free(dst);
}

/* Runs ten MMX cases followed by ten SSE cases.  Each output line is
   "<regL hex> <regR hex> <result hex>".  Note that the helpers are
   called with the arguments swapped: (regR, regL). */
int main ( int argc, char** argv )
{
   int i, j;

   /* mmx test */
   {
      UChar* regL = malloc(8);
      UChar* regR = malloc(8);
      assert(regL);
      assert(regR);
      for (i = 0; i < 10; i++) {
         for (j = 0; j < 8; j++) {
            regL[j] = (UChar)randomUInt();
            printf("%02x", regL[j]);
         }
         printf(" ");
         for (j = 0; j < 8; j++) {
            regR[j] = (UChar)randomUInt();
            printf("%02x", regR[j]);
         }
         printf(" ");
         maskmovq_mmx( regR, regL );
         printf("\n");
      }
      /* Operand buffers are no longer needed once the loop is done. */
      free(regR);
      free(regL);
   }

   /* sse test */
   {
      UChar* regL = malloc(16);
      UChar* regR = malloc(16);
      assert(regL);
      assert(regR);
      for (i = 0; i < 10; i++) {
         for (j = 0; j < 16; j++) {
            regL[j] = (UChar)randomUInt();
            printf("%02x", regL[j]);
         }
         printf(" ");
         for (j = 0; j < 16; j++) {
            regR[j] = (UChar)randomUInt();
            printf("%02x", regR[j]);
         }
         printf(" ");
         maskmovdqu_sse( regR, regL );
         printf("\n");
      }
      /* Operand buffers are no longer needed once the loop is done. */
      free(regR);
      free(regL);
   }

   return 0;
}