/* x86_64-gcc.c, revision 53b609c9db503488fc065728d214c9db67ad5740 */
1#include <openssl/bn.h>
2
3#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)
4
5#include "../internal.h"
6
7/* x86_64 BIGNUM accelerator version 0.1, December 2002.
8 *
9 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * project.
11 *
12 * Rights for redistribution and usage in source and binary forms are
13 * granted according to the OpenSSL license. Warranty of any kind is
14 * disclaimed.
15 *
16 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
17 *    versions, like 1.0...
18 * A. Well, that's because this code is basically a quick-n-dirty
19 *    proof-of-concept hack. As you can see it's implemented with
20 *    inline assembler, which means that you're bound to GCC and that
21 *    there might be enough room for further improvement.
22 *
23 * Q. Why inline assembler?
24 * A. x86_64 features own ABI which I'm not familiar with. This is
25 *    why I decided to let the compiler take care of subroutine
26 *    prologue/epilogue as well as register allocation. For reference.
27 *    Win64 implements different ABI for AMD64, different from Linux.
28 *
29 * Q. How much faster does it get?
30 * A. 'apps/openssl speed rsa dsa' output with no-asm:
31 *
32 *	                  sign    verify    sign/s verify/s
33 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
34 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
35 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
36 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
37 *	                  sign    verify    sign/s verify/s
38 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
39 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
40 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
41 *
42 *    'apps/openssl speed rsa dsa' output with this module:
43 *
44 *	                  sign    verify    sign/s verify/s
45 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
46 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
47 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
48 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
49 *	                  sign    verify    sign/s verify/s
50 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
51 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
52 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
53 *
54 *    For the reference. IA-32 assembler implementation performs
55 *    very much like 64-bit code compiled with no-asm on the same
56 *    machine.
57 */
58
59 /* TODO(davidben): Get this file working on Windows x64. */
60
61#undef mul
62#undef mul_add
63
64#define asm __asm__
65
/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value of zero;
 */
/*
 * mul_add(r, a, word, carry): r += a * word + carry, leaving the carry
 * out in `carry`.
 *
 * The first asm computes the 128-bit product high:low = a * word (MULQ
 * leaves RDX:RAX).  The second adds `low` into the incoming carry,
 * propagating any overflow into `high`; the third adds that sum into
 * the memory word r, again propagating into `high`; finally `high`
 * becomes the carry out.  Neither chain can overflow past `high`: the
 * high word of a 64x64 product can never be all-ones, so it always has
 * room to absorb both carries.
 */
#define mul_add(r, a, word, carry)                                     \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+m"(r), "+d"(high)                                          \
        : "r"(carry), "g"(0)                                           \
        : "cc");                                                       \
    carry = high;                                                      \
  } while (0)
85
/*
 * mul(r, a, word, carry): r = low64(a * word) + carry, leaving the
 * carry out in `carry`.
 *
 * MULQ produces the 128-bit product high:low; the add chain folds the
 * incoming carry into `low` (held in `carry`), propagating into
 * `high`.  The sum is stored to r and `high` becomes the next carry.
 */
#define mul(r, a, word, carry)                                         \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    (r) = carry, carry = high;                                         \
  } while (0)
/*
 * sqr(r0, r1, a): (r1:r0) = a * a, the full 128-bit square of one word
 * (MULQ with both multiplicands in RAX).
 */
#undef sqr
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
98
99BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
100                          BN_ULONG w) {
101  BN_ULONG c1 = 0;
102
103  if (num <= 0)
104    return (c1);
105
106  while (num & ~3) {
107    mul_add(rp[0], ap[0], w, c1);
108    mul_add(rp[1], ap[1], w, c1);
109    mul_add(rp[2], ap[2], w, c1);
110    mul_add(rp[3], ap[3], w, c1);
111    ap += 4;
112    rp += 4;
113    num -= 4;
114  }
115  if (num) {
116    mul_add(rp[0], ap[0], w, c1);
117    if (--num == 0)
118      return c1;
119    mul_add(rp[1], ap[1], w, c1);
120    if (--num == 0)
121      return c1;
122    mul_add(rp[2], ap[2], w, c1);
123    return c1;
124  }
125
126  return (c1);
127}
128
129BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
130  BN_ULONG c1 = 0;
131
132  if (num <= 0)
133    return (c1);
134
135  while (num & ~3) {
136    mul(rp[0], ap[0], w, c1);
137    mul(rp[1], ap[1], w, c1);
138    mul(rp[2], ap[2], w, c1);
139    mul(rp[3], ap[3], w, c1);
140    ap += 4;
141    rp += 4;
142    num -= 4;
143  }
144  if (num) {
145    mul(rp[0], ap[0], w, c1);
146    if (--num == 0)
147      return c1;
148    mul(rp[1], ap[1], w, c1);
149    if (--num == 0)
150      return c1;
151    mul(rp[2], ap[2], w, c1);
152  }
153  return (c1);
154}
155
156void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
157  if (n <= 0)
158    return;
159
160  while (n & ~3) {
161    sqr(r[0], r[1], a[0]);
162    sqr(r[2], r[3], a[1]);
163    sqr(r[4], r[5], a[2]);
164    sqr(r[6], r[7], a[3]);
165    a += 4;
166    r += 8;
167    n -= 4;
168  }
169  if (n) {
170    sqr(r[0], r[1], a[0]);
171    if (--n == 0)
172      return;
173    sqr(r[2], r[3], a[1]);
174    if (--n == 0)
175      return;
176    sqr(r[4], r[5], a[2]);
177  }
178}
179
/* bn_div_words() divides the double-word value h:l by d and returns the
 * quotient word.  This maps directly onto DIVQ (RDX:RAX / operand), so
 * as with the hardware instruction the result is a #DE fault if d == 0
 * or if the quotient does not fit in 64 bits -- callers must guarantee
 * h < d.  The remainder (RDX) is discarded. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG ret, waste;

  asm("divq	%4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");

  return ret;
}
187
/* bn_add_words() computes rp[] = ap[] + bp[] over n words and returns
 * the carry out (0 or 1).  The loop keeps the carry flag live across
 * iterations: ADCQ both adds the word pair and chains the carry, while
 * LEA (index increment) and LOOP (counter decrement + branch) were
 * chosen precisely because neither modifies CF. */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0)
    return 0;

  asm volatile (
      "	subq	%0,%0		\n" /* clear carry */
      "	jmp	1f		\n"
      ".p2align 4			\n"
      "1:	movq	(%4,%2,8),%0	\n"
      "	adcq	(%5,%2,8),%0	\n"
      "	movq	%0,(%3,%2,8)	\n"
      "	lea	1(%2),%2	\n" /* i++ without clobbering CF */
      "	loop	1b		\n" /* --n, loop while n != 0; CF preserved */
      "	sbbq	%0,%0		\n" /* ret = CF ? ~0 : 0 */
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
212
213#ifndef SIMICS
/* bn_sub_words() computes rp[] = ap[] - bp[] over n words and returns
 * the borrow out (0 or 1).  Mirror image of bn_add_words: SBBQ chains
 * the borrow through the carry flag, while LEA and LOOP advance the
 * index and counter without disturbing CF. */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0)
    return 0;

  asm volatile (
      "	subq	%0,%0		\n" /* clear borrow */
      "	jmp	1f		\n"
      ".p2align 4			\n"
      "1:	movq	(%4,%2,8),%0	\n"
      "	sbbq	(%5,%2,8),%0	\n"
      "	movq	%0,(%3,%2,8)	\n"
      "	lea	1(%2),%2	\n" /* i++ without clobbering CF */
      "	loop	1b		\n" /* --n, loop while n != 0; CF preserved */
      "	sbbq	%0,%0		\n" /* ret = CF ? ~0 : 0 */
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
238#else
239/* Simics 1.4<7 has buggy sbbq:-( */
240#define BN_MASK2 0xffffffffffffffffL
241BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
242  BN_ULONG t1, t2;
243  int c = 0;
244
245  if (n <= 0)
246    return ((BN_ULONG)0);
247
248  for (;;) {
249    t1 = a[0];
250    t2 = b[0];
251    r[0] = (t1 - t2 - c) & BN_MASK2;
252    if (t1 != t2)
253      c = (t1 < t2);
254    if (--n <= 0)
255      break;
256
257    t1 = a[1];
258    t2 = b[1];
259    r[1] = (t1 - t2 - c) & BN_MASK2;
260    if (t1 != t2)
261      c = (t1 < t2);
262    if (--n <= 0)
263      break;
264
265    t1 = a[2];
266    t2 = b[2];
267    r[2] = (t1 - t2 - c) & BN_MASK2;
268    if (t1 != t2)
269      c = (t1 < t2);
270    if (--n <= 0)
271      break;
272
273    t1 = a[3];
274    t2 = b[3];
275    r[3] = (t1 - t2 - c) & BN_MASK2;
276    if (t1 != t2)
277      c = (t1 < t2);
278    if (--n <= 0)
279      break;
280
281    a += 4;
282    b += 4;
283    r += 4;
284  }
285  return (c);
286}
287#endif
288
289/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
290/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
291/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0) */
294
295/* Keep in mind that carrying into high part of multiplication result can not
296 * overflow, because it cannot be all-ones. */
/* (c2,c1,c0) += a * b.  MULQ forms the 128-bit product t2:t1, and a
 * single add/adc/adc chain folds it into the three-word accumulator.
 * Per the note above, the final adc cannot itself overflow c2. */
#define mul_add_c(a, b, c0, c1, c2)          \
  do {                                       \
    BN_ULONG t1, t2;                \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2)       \
        : "r"(t1), "r"(t2), "g"(0)           \
        : "cc");                             \
  } while (0)
306
/* (c2,c1,c0) += a[i]^2.  MULQ squares a[i] (both multiplicands in RAX)
 * into t2:t1, which the add/adc/adc chain folds into the accumulator. */
#define sqr_add_c(a, i, c0, c1, c2)          \
  do {                                       \
    BN_ULONG t1, t2;                         \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"(a[i]) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2)       \
        : "r"(t1), "r"(t2), "g"(0)           \
        : "cc");                             \
  } while (0)
316
/* (c2,c1,c0) += 2 * a * b.  The 128-bit product t2:t1 is accumulated
 * twice rather than being doubled first: shifting t2:t1 left by one
 * would drop the bit shifted out of t2, whereas two full add/adc/adc
 * passes keep every carry. */
#define mul_add_c2(a, b, c0, c1, c2)         \
  do {                                       \
    BN_ULONG t1, t2;                                                    \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");        \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2)       \
        : "r"(t1), "r"(t2), "g"(0)           \
        : "cc");                             \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
        : "+r"(c0), "+r"(c1), "+r"(c2)       \
        : "r"(t1), "r"(t2), "g"(0)           \
        : "cc");                             \
  } while (0)
330
/* (c2,c1,c0) += 2 * a[i] * a[j]: an off-diagonal term of a square,
 * which occurs twice in the expansion. */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
332
/* bn_mul_comba8() computes the 16-word product r = a * b of two 8-word
 * inputs with the comba (column-wise) method: result column k is the
 * sum of all partial products a[i]*b[k-i], accumulated into a
 * three-word accumulator held in c1/c2/c3.  The three variables rotate
 * roles from column to column; after each column the low word is
 * stored to r[k] and zeroed for reuse as the next column's top word. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
434
/* bn_mul_comba4() computes the 8-word product r = a * b of two 4-word
 * inputs column by column (comba method), accumulating each column's
 * partial products a[i]*b[k-i] into the rotating three-word
 * accumulator c1/c2/c3; the low word is stored to r[k] and zeroed for
 * reuse at the top of the next column. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
472
/* bn_sqr_comba8() computes the 16-word square r = a^2 of an 8-word
 * input column by column.  Diagonal terms a[i]^2 are accumulated once
 * via sqr_add_c; each off-diagonal pair a[i]*a[j] (i > j) occurs twice
 * in the square and is accumulated via sqr_add_c2, which adds the
 * product twice.  As in bn_mul_comba8, c1/c2/c3 form a rotating
 * three-word accumulator: after each column the low word goes to r[k]
 * and is zeroed for reuse as the next column's top word. */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
546
/* bn_sqr_comba4() computes the 8-word square r = a^2 of a 4-word input
 * column by column: diagonal terms a[i]^2 via sqr_add_c, doubled
 * off-diagonal terms a[i]*a[j] via sqr_add_c2, with c1/c2/c3 as the
 * rotating three-word accumulator (low word stored to r[k] and zeroed
 * after each column). */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
578
579#endif  /* !NO_ASM && X86_64 && !WINDOWS */
580