/* Exercises the x86-64 MASKMOVQ (MMX) and MASKMOVDQU (SSE2) byte-masked
   store instructions on pseudo-random operands and prints the operands and
   the resulting destination bytes as hex. */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

/* Short aliases used throughout this file: UChar for raw bytes, UInt for
   the pseudo-random generator's state and result. */
typedef  unsigned char  UChar;
typedef  unsigned int   UInt;
8268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
9268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardjstatic UInt randomUInt ( void )
10268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj{
11268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   static UInt n = 0;
12268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   /* From "Numerical Recipes in C" 2nd Edition */
13268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   n = 1664525UL * n + 1013904223UL;
14268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   return n >> 17;
15268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj}
16268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
17268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardjvoid maskmovq_mmx ( UChar* regL, UChar* regR )
18268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj{
19268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   int i;
20268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   UChar* dst = malloc(8);
21268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   assert(dst);
22268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   for (i = 0; i < 8; i++)
23268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      dst[i] = 17 * (i+1);
24268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   __asm__ __volatile__(
25268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "emms\n\t"
26268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "movq (%0), %%mm1\n\t"
27268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "movq (%1), %%mm2\n\t"
28268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "movq %2, %%rdi\n\t"
29268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "maskmovq %%mm1,%%mm2"
30268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      : /*out*/
31268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      : /*in*/ "r"(regL), "r"(regR), "r"(&dst[0])
32268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      : /*trash*/ "rdi", "memory", "cc"
33268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   );
34268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   for (i = 0; i < 8; i++)
35268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      printf("%02x", dst[i]);
36268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   free(dst);
37268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj}
38268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
39268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardjvoid maskmovdqu_sse ( UChar* regL, UChar* regR )
40268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj{
41268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   int i;
42268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   UChar* dst = malloc(16);
43268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   assert(dst);
44268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   for (i = 0; i < 16; i++)
45268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      dst[i] = i;
46268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   __asm__ __volatile__(
47268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "movups (%0), %%xmm1\n\t"
48268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "movups (%1), %%xmm12\n\t"
49268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "movq %2, %%rdi\n\t"
50268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "maskmovdqu %%xmm12,%%xmm1\n\t"
51268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      "sfence"
52268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      : /*out*/
53268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      : /*in*/ "r"(regL), "r"(regR), "r"(dst)
54268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      : /*trash*/ "rdi", "memory", "cc"
55268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   );
56268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   for (i = 0; i < 16; i++)
57268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      printf("%02x", dst[i]);
58268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   free(dst);
59268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj}
60268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
61268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardjint main ( int argc, char** argv )
62268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj{
63268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   int i, j;
64268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
65268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   /* mmx test */
66268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   {
67268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      UChar* regL = malloc(8);
68268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      UChar* regR = malloc(8);
69268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      assert(regL);
70268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      assert(regR);
71268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      for (i = 0; i < 10; i++) {
72268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         for (j = 0; j < 8; j++) {
73268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            regL[j] = (UChar)randomUInt();
74268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            printf("%02x", regL[j]);
75268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         }
76268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         printf(" ");
77268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         for (j = 0; j < 8; j++) {
78268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            regR[j] = (UChar)randomUInt();
79268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            printf("%02x", regR[j]);
80268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         }
81268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         printf(" ");
82268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         maskmovq_mmx( regR, regL );
83268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         printf("\n");
84268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      }
85268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   }
86268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
87268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   /* sse test */
88268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   {
89268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      UChar* regL = malloc(16);
90268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      UChar* regR = malloc(16);
91268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      assert(regL);
92268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      assert(regR);
93268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      for (i = 0; i < 10; i++) {
94268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         for (j = 0; j < 16; j++) {
95268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            regL[j] = (UChar)randomUInt();
96268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            printf("%02x", regL[j]);
97268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         }
98268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         printf(" ");
99268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         for (j = 0; j < 16; j++) {
100268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            regR[j] = (UChar)randomUInt();
101268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj            printf("%02x", regR[j]);
102268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         }
103268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         printf(" ");
104268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         maskmovdqu_sse( regR, regL );
105268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj         printf("\n");
106268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj      }
107268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   }
108268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj
109268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj   return 0;
110268b2f5c1f2b054bcb947248dc01441b0fa874a9sewardj}
111