p256_ec.c revision db0850c3b637faaa7cbe1bab2e6c91ad2af35426
1/*
2 * Copyright 2013 The Android Open Source Project
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *     * Redistributions of source code must retain the above copyright
7 *       notice, this list of conditions and the following disclaimer.
8 *     * Redistributions in binary form must reproduce the above copyright
9 *       notice, this list of conditions and the following disclaimer in the
10 *       documentation and/or other materials provided with the distribution.
11 *     * Neither the name of Google Inc. nor the names of its contributors may
12 *       be used to endorse or promote products derived from this software
13 *       without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
17 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
18 * EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
24 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27// This is an implementation of the P256 elliptic curve group. It's written to
28// be portable 32-bit, although it's still constant-time.
29//
30// WARNING: Implementing these functions in a constant-time manner is far from
31//          obvious. Be careful when touching this code.
32//
33// See http://www.imperialviolet.org/2010/12/04/ecc.html ([1]) for background.
34
35#include <stdint.h>
36#include <stdio.h>
37
38#include <string.h>
39#include <stdlib.h>
40
41#include "mincrypt/p256.h"
42
43typedef uint8_t u8;
44typedef uint32_t u32;
45typedef int32_t s32;
46typedef uint64_t u64;
47
48/* Our field elements are represented as nine 32-bit limbs.
49 *
50 * The value of an felem (field element) is:
51 *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
52 *
53 * That is, each limb is alternately 29 or 28-bits wide in little-endian
54 * order.
55 *
56 * This means that an felem hits 2**257, rather than 2**256 as we would like. A
57 * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
58 * when multiplying as terms end up one bit short of a limb which would require
59 * much bit-shifting to correct.
60 *
61 * Finally, the values stored in an felem are in Montgomery form. So the value
62 * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
63 */
/* limb is one digit of a field element; NLIMBS limbs of alternating 29/28-bit
 * width (little-endian) make up one felem, per the representation described
 * above. */
typedef u32 limb;
#define NLIMBS 9
typedef limb felem[NLIMBS];

/* Masks for the two limb widths. */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words. (That is, the Montgomery form of 1, since R = 2**257.) */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
static const felem kZero = {0};
/* kP is the P-256 prime p in limb form. */
static const felem kP = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p in limb form. */
static const felem k2P = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
89/* kPrecomputed contains precomputed values to aid the calculation of scalar
90 * multiples of the base point, G. It's actually two, equal length, tables
91 * concatenated.
92 *
93 * The first table contains (x,y) felem pairs for 16 multiples of the base
94 * point, G.
95 *
96 *   Index  |  Index (binary) | Value
97 *       0  |           0000  | 0G (all zeros, omitted)
98 *       1  |           0001  | G
99 *       2  |           0010  | 2**64G
100 *       3  |           0011  | 2**64G + G
101 *       4  |           0100  | 2**128G
102 *       5  |           0101  | 2**128G + G
103 *       6  |           0110  | 2**128G + 2**64G
104 *       7  |           0111  | 2**128G + 2**64G + G
105 *       8  |           1000  | 2**192G
106 *       9  |           1001  | 2**192G + G
107 *      10  |           1010  | 2**192G + 2**64G
108 *      11  |           1011  | 2**192G + 2**64G + G
109 *      12  |           1100  | 2**192G + 2**128G
110 *      13  |           1101  | 2**192G + 2**128G + G
111 *      14  |           1110  | 2**192G + 2**128G + 2**64G
112 *      15  |           1111  | 2**192G + 2**128G + 2**64G + G
113 *
114 * The second table follows the same style, but the terms are 2**32G,
115 * 2**96G, 2**160G, 2**224G.
116 *
117 * This is ~2KB of data. */
118static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
119    0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
120    0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
121    0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
122    0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
123    0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
124    0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
125    0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
126    0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
127    0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
128    0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
129    0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
130    0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
131    0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
132    0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
133    0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
134    0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
135    0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
136    0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
137    0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
138    0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
139    0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
140    0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
141    0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
142    0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
143    0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
144    0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
145    0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
146    0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
147    0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
148    0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
149    0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
150    0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
151    0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
152    0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
153    0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
154    0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
155    0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
156    0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
157    0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
158    0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
159    0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
160    0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
161    0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
162    0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
163    0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
164    0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
165    0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
166    0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
167    0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
168    0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
169    0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
170    0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
171    0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
172    0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
173    0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
174    0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
175    0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
176    0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
177    0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
178    0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
179};
180
181
182/* Field element operations: */
183
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * This works because, for x in (0, 2**31], (x - 1) has a clear top bit, so
 * the shift produces 0 and the final subtraction wraps to 0xffffffff; for
 * x == 0 (or x > 2**31) the top bit of (x - 1) is set, giving 1 - 1 == 0.
 * There is no branch, so the mask is computed in constant time.
 *
 * x must be a u32 or an equivalent type such as limb. */
#define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
190
191/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
192 * which is a term at 2**257.
193 *
194 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
195 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29. */
static void felem_reduce_carry(felem inout, limb carry) {
  /* carry_mask is all ones iff carry != 0, computed without branching so the
   * conditional adjustment below is constant-time. */
  const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

  /* The additions and subtractions below add a value congruent to
   * carry * 2**257 (mod p), expressed in the 29/28-bit limb positions. The
   * masked constants contribute an extra multiple of p so that the
   * subtractions cannot underflow when carry is non-zero. */
  inout[0] += carry << 1;
  inout[3] += 0x10000000 & carry_mask;
  /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   * previous line therefore this doesn't underflow. */
  inout[3] -= carry << 11;
  inout[4] += (0x20000000 - 1) & carry_mask;
  inout[5] += (0x10000000 - 1) & carry_mask;
  inout[6] += (0x20000000 - 1) & carry_mask;
  inout[6] -= carry << 22;
  /* This may underflow if carry is non-zero but, if so, we'll fix it in the
   * next line. */
  inout[7] -= 1 & carry_mask;
  inout[7] += carry << 25;
}
213
214/* felem_sum sets out = in+in2.
215 *
216 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
217 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
218static void felem_sum(felem out, const felem in, const felem in2) {
219  limb carry = 0;
220  unsigned i;
221
222  for (i = 0;; i++) {
223    out[i] = in[i] + in2[i];
224    out[i] += carry;
225    carry = out[i] >> 29;
226    out[i] &= kBottom29Bits;
227
228    i++;
229    if (i == NLIMBS)
230      break;
231
232    out[i] = in[i] + in2[i];
233    out[i] += carry;
234    carry = out[i] >> 28;
235    out[i] &= kBottom28Bits;
236  }
237
238  felem_reduce_carry(out, carry);
239}
240
241#define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
242#define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
243#define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
244#define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
245#define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
246#define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
247
248/* zero31 is 0 mod p. */
249static const felem zero31 = { two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2 };
250
251/* felem_diff sets out = in-in2.
252 *
253 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
254 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
255 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
256static void felem_diff(felem out, const felem in, const felem in2) {
257  limb carry = 0;
258  unsigned i;
259
260   for (i = 0;; i++) {
261    out[i] = in[i] - in2[i];
262    out[i] += zero31[i];
263    out[i] += carry;
264    carry = out[i] >> 29;
265    out[i] &= kBottom29Bits;
266
267    i++;
268    if (i == NLIMBS)
269      break;
270
271    out[i] = in[i] - in2[i];
272    out[i] += zero31[i];
273    out[i] += carry;
274    carry = out[i] >> 28;
275    out[i] &= kBottom28Bits;
276  }
277
278  felem_reduce_carry(out, carry);
279}
280
281/* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
282 * with the same 29,28,... bit positions as an felem.
283 *
284 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
285 * Since we just multiplied two Montgomery values together, the result is
286 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
287 * in Montgomery form.
288 *
289 * On entry: tmp[i] < 2**64
290 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29 */
static void felem_reduce_degree(felem out, u64 tmp[17]) {
   /* The following table may be helpful when reading this code:
    *
    * Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
    * Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
    * Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
    *   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285 */
  limb tmp2[18], carry, x, xMask;
  unsigned i;

  /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
   * felem. So the top of an element of tmp might overlap with another
   * element two positions down. The following loop eliminates this
   * overlap. */
  tmp2[0] = (limb)(tmp[0] & kBottom29Bits);

  /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
   * and hint to the compiler that it can do a single-word shift by selecting
   * the right register rather than doing a double-word shift and truncating
   * afterwards. */
  tmp2[1] = ((limb) tmp[0]) >> 29;
  tmp2[1] |= (((limb)(tmp[0] >> 32)) << 3) & kBottom28Bits;
  tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
  carry = tmp2[1] >> 28;
  tmp2[1] &= kBottom28Bits;

  /* This loop alternates between 29-bit (even index) and 28-bit (odd index)
   * limbs, folding in the overlapping high bits of the two words to the
   * left. It runs a fixed number of iterations with no data-dependent
   * branches. */
  for (i = 2; i < 17; i++) {
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 28;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 29;
    tmp2[i] &= kBottom29Bits;

    i++;
    if (i == 17)
      break;
    tmp2[i] = ((limb)(tmp[i - 2] >> 32)) >> 25;
    tmp2[i] += ((limb)(tmp[i - 1])) >> 29;
    tmp2[i] += (((limb)(tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
    tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
    tmp2[i] += carry;
    carry = tmp2[i] >> 28;
    tmp2[i] &= kBottom28Bits;
  }

  tmp2[17] = ((limb)(tmp[15] >> 32)) >> 25;
  tmp2[17] += ((limb)(tmp[16])) >> 29;
  tmp2[17] += (((limb)(tmp[16] >> 32)) << 3);
  tmp2[17] += carry;

  /* Montgomery elimination of terms.
   *
   * Since R is 2**257, we can divide by R with a bitwise shift if we can
   * ensure that the right-most 257 bits are all zero. We can make that true by
   * adding multiplies of p without affecting the value.
   *
   * So we eliminate limbs from right to left. Since the bottom 29 bits of p
   * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   * We can do that for 8 further limbs and then right shift to eliminate the
   * extra factor of R. */
  for (i = 0;; i += 2) {
    tmp2[i + 1] += tmp2[i] >> 29;
    x = tmp2[i] & kBottom29Bits;
    /* xMask selects the compensating terms branch-free, keeping the
     * elimination constant-time regardless of the value of x. */
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i] = 0;

    /* The bounds calculations for this loop are tricky. Each iteration of
     * the loop eliminates two words by adding values to words to their
     * right.
     *
     * The following table contains the amounts added to each word (as an
     * offset from the value of i at the top of the loop). The amounts are
     * accounted for from the first and second half of the loop separately
     * and are written as, for example, 28 to mean a value <2**28.
     *
     * Word:                   3   4   5   6   7   8   9   10
     * Added in top half:     28  11      29  21  29  28
     *                                        28  29
     *                                            29
     * Added in bottom half:      29  10      28  21  28   28
     *                                            29
     *
     * The value that is currently offset 7 will be offset 5 for the next
     * iteration and then offset 3 for the iteration after that. Therefore
     * the total value added will be the values added at 7, 5 and 3.
     *
     * The following table accumulates these values. The sums at the bottom
     * are written as, for example, 29+28, to mean a value < 2**29+2**28.
     *
     * Word:                   3   4   5   6   7   8   9  10  11  12  13
     *                        28  11  10  29  21  29  28  28  28  28  28
     *                            29  28  11  28  29  28  29  28  29  28
     *                                    29  28  21  21  29  21  29  21
     *                                        10  29  28  21  28  21  28
     *                                        28  29  28  29  28  29  28
     *                                            11  10  29  10  29  10
     *                                            29  28  11  28  11
     *                                                    29      29
     *                        --------------------------------------------
     *                                                30+ 31+ 30+ 31+ 30+
     *                                                28+ 29+ 28+ 29+ 21+
     *                                                21+ 28+ 21+ 28+ 10
     *                                                10  21+ 10  21+
     *                                                    11      11
     *
     * So the greatest amount is added to tmp2[10] and tmp2[12]. If
     * tmp2[10/12] has an initial value of <2**29, then the maximum value
     * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
     * as required. */
    tmp2[i + 3] += (x << 10) & kBottom28Bits;
    tmp2[i + 4] += (x >> 18);

    tmp2[i + 6] += (x << 21) & kBottom29Bits;
    tmp2[i + 7] += x >> 8;

    /* At position 200, which is the starting bit position for word 7, we
     * have a factor of 0xf000000 = 2**28 - 2**24. */
    tmp2[i + 7] += 0x10000000 & xMask;
    /* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
    tmp2[i + 8] += (x - 1) & xMask;
    tmp2[i + 7] -= (x << 24) & kBottom28Bits;
    tmp2[i + 8] -= x >> 4;

    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 8] -= x;
    tmp2[i + 8] += (x << 28) & kBottom29Bits;
    tmp2[i + 9] += ((x >> 1) - 1) & xMask;

    if (i+1 == NLIMBS)
      break;
    tmp2[i + 2] += tmp2[i + 1] >> 28;
    x = tmp2[i + 1] & kBottom28Bits;
    xMask = NON_ZERO_TO_ALL_ONES(x);
    tmp2[i + 1] = 0;

    tmp2[i + 4] += (x << 11) & kBottom29Bits;
    tmp2[i + 5] += (x >> 18);

    tmp2[i + 7] += (x << 21) & kBottom28Bits;
    tmp2[i + 8] += x >> 7;

    /* At position 199, which is the starting bit of the 8th word when
     * dealing with a context starting on an odd word, we have a factor of
     * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
     * word from i+1 is i+8. */
    tmp2[i + 8] += 0x20000000 & xMask;
    tmp2[i + 9] += (x - 1) & xMask;
    tmp2[i + 8] -= (x << 25) & kBottom29Bits;
    tmp2[i + 9] -= x >> 4;

    tmp2[i + 9] += 0x10000000 & xMask;
    tmp2[i + 9] -= x;
    tmp2[i + 10] += (x - 1) & xMask;
  }

  /* We merge the right shift with a carry chain. The words above 2**257 have
   * widths of 28,29,... which we need to correct when copying them down.
   * (See the "odd phase" row of the table at the top: those words start one
   * bit lower, hence the << 28 and >> 1 corrections below.) */
  carry = 0;
  for (i = 0; i < 8; i++) {
    /* The maximum value of tmp2[i + 9] occurs on the first iteration and
     * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
     * therefore safe. */
    out[i] = tmp2[i + 9];
    out[i] += carry;
    out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
    carry = out[i] >> 29;
    out[i] &= kBottom29Bits;

    i++;
    out[i] = tmp2[i + 9] >> 1;
    out[i] += carry;
    carry = out[i] >> 28;
    out[i] &= kBottom28Bits;
  }

  out[8] = tmp2[17];
  out[8] += carry;
  carry = out[8] >> 29;
  out[8] &= kBottom29Bits;

  felem_reduce_carry(out, carry);
}
475
476/* felem_square sets out=in*in.
477 *
478 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
479 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_square(felem out, const felem in) {
  u64 tmp[17];

  /* tmp[k] accumulates the products in[i]*in[j] with i+j == k. The << 1
   * factors double the symmetric cross terms (in[i]*in[j] occurs for both
   * (i,j) and (j,i)); the << 2 factors additionally compensate for
   * odd-indexed limbs starting one bit lower (see the phase table in
   * felem_reduce_degree). */
  tmp[0] = ((u64) in[0]) * in[0];
  tmp[1] = ((u64) in[0]) * (in[1] << 1);
  tmp[2] = ((u64) in[0]) * (in[2] << 1) +
           ((u64) in[1]) * (in[1] << 1);
  tmp[3] = ((u64) in[0]) * (in[3] << 1) +
           ((u64) in[1]) * (in[2] << 1);
  tmp[4] = ((u64) in[0]) * (in[4] << 1) +
           ((u64) in[1]) * (in[3] << 2) + ((u64) in[2]) * in[2];
  tmp[5] = ((u64) in[0]) * (in[5] << 1) + ((u64) in[1]) *
           (in[4] << 1) + ((u64) in[2]) * (in[3] << 1);
  tmp[6] = ((u64) in[0]) * (in[6] << 1) + ((u64) in[1]) *
           (in[5] << 2) + ((u64) in[2]) * (in[4] << 1) +
           ((u64) in[3]) * (in[3] << 1);
  tmp[7] = ((u64) in[0]) * (in[7] << 1) + ((u64) in[1]) *
           (in[6] << 1) + ((u64) in[2]) * (in[5] << 1) +
           ((u64) in[3]) * (in[4] << 1);
  /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   * which is < 2**64 as required. */
  tmp[8] = ((u64) in[0]) * (in[8] << 1) + ((u64) in[1]) *
           (in[7] << 2) + ((u64) in[2]) * (in[6] << 1) +
           ((u64) in[3]) * (in[5] << 2) + ((u64) in[4]) * in[4];
  tmp[9] = ((u64) in[1]) * (in[8] << 1) + ((u64) in[2]) *
           (in[7] << 1) + ((u64) in[3]) * (in[6] << 1) +
           ((u64) in[4]) * (in[5] << 1);
  tmp[10] = ((u64) in[2]) * (in[8] << 1) + ((u64) in[3]) *
            (in[7] << 2) + ((u64) in[4]) * (in[6] << 1) +
            ((u64) in[5]) * (in[5] << 1);
  tmp[11] = ((u64) in[3]) * (in[8] << 1) + ((u64) in[4]) *
            (in[7] << 1) + ((u64) in[5]) * (in[6] << 1);
  tmp[12] = ((u64) in[4]) * (in[8] << 1) +
            ((u64) in[5]) * (in[7] << 2) + ((u64) in[6]) * in[6];
  tmp[13] = ((u64) in[5]) * (in[8] << 1) +
            ((u64) in[6]) * (in[7] << 1);
  tmp[14] = ((u64) in[6]) * (in[8] << 1) +
            ((u64) in[7]) * (in[7] << 1);
  tmp[15] = ((u64) in[7]) * (in[8] << 1);
  tmp[16] = ((u64) in[8]) * in[8];

  /* Divide out the extra factor of R introduced by multiplying two
   * Montgomery-form values, and reduce back to felem range. */
  felem_reduce_degree(out, tmp);
}
523
524/* felem_mul sets out=in*in2.
525 *
526 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
527 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
528 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_mul(felem out, const felem in, const felem in2) {
  u64 tmp[17];

  /* tmp[k] accumulates the products in[i]*in2[j] with i+j == k. The << 1
   * factors compensate for pairs of odd-indexed limbs starting one bit lower
   * (see the phase table in felem_reduce_degree); the no-op << 0 shifts are
   * written out to keep the pattern uniform and easy to audit. */
  tmp[0] = ((u64) in[0]) * in2[0];
  tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
           ((u64) in[1]) * (in2[0] << 0);
  tmp[2] = ((u64) in[0]) * (in2[2] << 0) + ((u64) in[1]) *
           (in2[1] << 1) + ((u64) in[2]) * (in2[0] << 0);
  tmp[3] = ((u64) in[0]) * (in2[3] << 0) + ((u64) in[1]) *
           (in2[2] << 0) + ((u64) in[2]) * (in2[1] << 0) +
           ((u64) in[3]) * (in2[0] << 0);
  tmp[4] = ((u64) in[0]) * (in2[4] << 0) + ((u64) in[1]) *
           (in2[3] << 1) + ((u64) in[2]) * (in2[2] << 0) +
           ((u64) in[3]) * (in2[1] << 1) +
           ((u64) in[4]) * (in2[0] << 0);
  tmp[5] = ((u64) in[0]) * (in2[5] << 0) + ((u64) in[1]) *
           (in2[4] << 0) + ((u64) in[2]) * (in2[3] << 0) +
           ((u64) in[3]) * (in2[2] << 0) + ((u64) in[4]) *
           (in2[1] << 0) + ((u64) in[5]) * (in2[0] << 0);
  tmp[6] = ((u64) in[0]) * (in2[6] << 0) + ((u64) in[1]) *
           (in2[5] << 1) + ((u64) in[2]) * (in2[4] << 0) +
           ((u64) in[3]) * (in2[3] << 1) + ((u64) in[4]) *
           (in2[2] << 0) + ((u64) in[5]) * (in2[1] << 1) +
           ((u64) in[6]) * (in2[0] << 0);
  tmp[7] = ((u64) in[0]) * (in2[7] << 0) + ((u64) in[1]) *
           (in2[6] << 0) + ((u64) in[2]) * (in2[5] << 0) +
           ((u64) in[3]) * (in2[4] << 0) + ((u64) in[4]) *
           (in2[3] << 0) + ((u64) in[5]) * (in2[2] << 0) +
           ((u64) in[6]) * (in2[1] << 0) +
           ((u64) in[7]) * (in2[0] << 0);
  /* tmp[8] has the greatest value but doesn't overflow. See logic in
   * felem_square. */
  tmp[8] = ((u64) in[0]) * (in2[8] << 0) + ((u64) in[1]) *
           (in2[7] << 1) + ((u64) in[2]) * (in2[6] << 0) +
           ((u64) in[3]) * (in2[5] << 1) + ((u64) in[4]) *
           (in2[4] << 0) + ((u64) in[5]) * (in2[3] << 1) +
           ((u64) in[6]) * (in2[2] << 0) + ((u64) in[7]) *
           (in2[1] << 1) + ((u64) in[8]) * (in2[0] << 0);
  tmp[9] = ((u64) in[1]) * (in2[8] << 0) + ((u64) in[2]) *
           (in2[7] << 0) + ((u64) in[3]) * (in2[6] << 0) +
           ((u64) in[4]) * (in2[5] << 0) + ((u64) in[5]) *
           (in2[4] << 0) + ((u64) in[6]) * (in2[3] << 0) +
           ((u64) in[7]) * (in2[2] << 0) +
           ((u64) in[8]) * (in2[1] << 0);
  tmp[10] = ((u64) in[2]) * (in2[8] << 0) + ((u64) in[3]) *
            (in2[7] << 1) + ((u64) in[4]) * (in2[6] << 0) +
            ((u64) in[5]) * (in2[5] << 1) + ((u64) in[6]) *
            (in2[4] << 0) + ((u64) in[7]) * (in2[3] << 1) +
            ((u64) in[8]) * (in2[2] << 0);
  tmp[11] = ((u64) in[3]) * (in2[8] << 0) + ((u64) in[4]) *
            (in2[7] << 0) + ((u64) in[5]) * (in2[6] << 0) +
            ((u64) in[6]) * (in2[5] << 0) + ((u64) in[7]) *
            (in2[4] << 0) + ((u64) in[8]) * (in2[3] << 0);
  tmp[12] = ((u64) in[4]) * (in2[8] << 0) + ((u64) in[5]) *
            (in2[7] << 1) + ((u64) in[6]) * (in2[6] << 0) +
            ((u64) in[7]) * (in2[5] << 1) +
            ((u64) in[8]) * (in2[4] << 0);
  tmp[13] = ((u64) in[5]) * (in2[8] << 0) + ((u64) in[6]) *
            (in2[7] << 0) + ((u64) in[7]) * (in2[6] << 0) +
            ((u64) in[8]) * (in2[5] << 0);
  tmp[14] = ((u64) in[6]) * (in2[8] << 0) + ((u64) in[7]) *
            (in2[7] << 1) + ((u64) in[8]) * (in2[6] << 0);
  tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
            ((u64) in[8]) * (in2[7] << 0);
  tmp[16] = ((u64) in[8]) * (in2[8] << 0);

  /* Divide out the extra factor of R introduced by multiplying two
   * Montgomery-form values, and reduce back to felem range. */
  felem_reduce_degree(out, tmp);
}
597
598static void felem_assign(felem out, const felem in) {
599  memcpy(out, in, sizeof(felem));
600}
601
602/* felem_inv calculates |out| = |in|^{-1}
603 *
604 * Based on Fermat's Little Theorem:
605 *   a^p = a (mod p)
606 *   a^{p-1} = 1 (mod p)
607 *   a^{p-2} = a^{-1} (mod p)
608 */
609static void felem_inv(felem out, const felem in) {
610  felem ftmp, ftmp2;
611  /* each e_I will hold |in|^{2^I - 1} */
612  felem e2, e4, e8, e16, e32, e64;
613  unsigned i;
614
615  felem_square(ftmp, in); /* 2^1 */
616  felem_mul(ftmp, in, ftmp); /* 2^2 - 2^0 */
617  felem_assign(e2, ftmp);
618  felem_square(ftmp, ftmp); /* 2^3 - 2^1 */
619  felem_square(ftmp, ftmp); /* 2^4 - 2^2 */
620  felem_mul(ftmp, ftmp, e2); /* 2^4 - 2^0 */
621  felem_assign(e4, ftmp);
622  felem_square(ftmp, ftmp); /* 2^5 - 2^1 */
623  felem_square(ftmp, ftmp); /* 2^6 - 2^2 */
624  felem_square(ftmp, ftmp); /* 2^7 - 2^3 */
625  felem_square(ftmp, ftmp); /* 2^8 - 2^4 */
626  felem_mul(ftmp, ftmp, e4); /* 2^8 - 2^0 */
627  felem_assign(e8, ftmp);
628  for (i = 0; i < 8; i++) {
629    felem_square(ftmp, ftmp);
630  } /* 2^16 - 2^8 */
631  felem_mul(ftmp, ftmp, e8); /* 2^16 - 2^0 */
632  felem_assign(e16, ftmp);
633  for (i = 0; i < 16; i++) {
634    felem_square(ftmp, ftmp);
635  } /* 2^32 - 2^16 */
636  felem_mul(ftmp, ftmp, e16); /* 2^32 - 2^0 */
637  felem_assign(e32, ftmp);
638  for (i = 0; i < 32; i++) {
639    felem_square(ftmp, ftmp);
640  } /* 2^64 - 2^32 */
641  felem_assign(e64, ftmp);
642  felem_mul(ftmp, ftmp, in); /* 2^64 - 2^32 + 2^0 */
643  for (i = 0; i < 192; i++) {
644    felem_square(ftmp, ftmp);
645  } /* 2^256 - 2^224 + 2^192 */
646
647  felem_mul(ftmp2, e64, e32); /* 2^64 - 2^0 */
648  for (i = 0; i < 16; i++) {
649    felem_square(ftmp2, ftmp2);
650  } /* 2^80 - 2^16 */
651  felem_mul(ftmp2, ftmp2, e16); /* 2^80 - 2^0 */
652  for (i = 0; i < 8; i++) {
653    felem_square(ftmp2, ftmp2);
654  } /* 2^88 - 2^8 */
655  felem_mul(ftmp2, ftmp2, e8); /* 2^88 - 2^0 */
656  for (i = 0; i < 4; i++) {
657    felem_square(ftmp2, ftmp2);
658  } /* 2^92 - 2^4 */
659  felem_mul(ftmp2, ftmp2, e4); /* 2^92 - 2^0 */
660  felem_square(ftmp2, ftmp2); /* 2^93 - 2^1 */
661  felem_square(ftmp2, ftmp2); /* 2^94 - 2^2 */
662  felem_mul(ftmp2, ftmp2, e2); /* 2^94 - 2^0 */
663  felem_square(ftmp2, ftmp2); /* 2^95 - 2^1 */
664  felem_square(ftmp2, ftmp2); /* 2^96 - 2^2 */
665  felem_mul(ftmp2, ftmp2, in); /* 2^96 - 3 */
666
667  felem_mul(out, ftmp2, ftmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
668}
669
/* felem_scalar_3 sets out=3*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_3(felem out) {
  limb carry = 0;
  unsigned i;

  /* Limbs alternate between 29 bits (even indices) and 28 bits (odd
   * indices), so the loop body is manually unrolled two at a time: the
   * first half multiplies an even limb and the second half an odd one. */
  for (i = 0;; i++) {
    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 29; /* carry out of the 29-bit even limb */
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    out[i] *= 3;
    out[i] += carry;
    carry = out[i] >> 28; /* carry out of the 28-bit odd limb */
    out[i] &= kBottom28Bits;
  }

  /* Fold the carry off the top limb back in, mod p. */
  felem_reduce_carry(out, carry);
}
696
/* felem_scalar_4 sets out=4*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_4(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  /* Multiplication by 4 is a left shift by two. As in felem_scalar_3, the
   * loop handles one 29-bit (even) and one 28-bit (odd) limb per pass. */
  for (i = 0;; i++) {
    /* The top two bits of the 29-bit limb would be shifted out by <<= 2,
     * so capture them first; they carry into the next limb. */
    next_carry = out[i] >> 27;
    out[i] <<= 2;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    /* Adding the incoming carry can overflow 29 bits again, so the
     * outgoing carry combines both sources. */
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    /* Same as above for the 28-bit odd limb: save the top two bits. */
    next_carry = out[i] >> 26;
    out[i] <<= 2;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  /* Fold the carry off the top limb back in, mod p. */
  felem_reduce_carry(out, carry);
}
727
/* felem_scalar_8 sets out=8*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29. */
static void felem_scalar_8(felem out) {
  limb carry = 0, next_carry;
  unsigned i;

  /* Multiplication by 8 is a left shift by three; structure mirrors
   * felem_scalar_4, alternating 29-bit (even) and 28-bit (odd) limbs. */
  for (i = 0;; i++) {
    /* Save the top three bits of the 29-bit limb before they are shifted
     * out by <<= 3; they carry into the next limb. */
    next_carry = out[i] >> 26;
    out[i] <<= 3;
    out[i] &= kBottom29Bits;
    out[i] += carry;
    /* Combine the shifted-out bits with any overflow from adding the
     * incoming carry. */
    carry = next_carry + (out[i] >> 29);
    out[i] &= kBottom29Bits;

    i++;
    if (i == NLIMBS)
      break;

    /* Same for the 28-bit odd limb: save its top three bits. */
    next_carry = out[i] >> 25;
    out[i] <<= 3;
    out[i] &= kBottom28Bits;
    out[i] += carry;
    carry = next_carry + (out[i] >> 28);
    out[i] &= kBottom28Bits;
  }

  /* Fold the carry off the top limb back in, mod p. */
  felem_reduce_carry(out, carry);
}
758
/* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
 * time depending on the value of |in|. */
static char felem_is_zero_vartime(const felem in) {
  limb carry;
  int i;
  limb tmp[NLIMBS];

  /* Work on a copy so |in| is left untouched. */
  felem_assign(tmp, in);

  /* First, reduce tmp to a minimal form. Repeatedly propagate carries
   * through the alternating 29/28-bit limbs until a full pass generates
   * no carry out of the top limb. */
  do {
    carry = 0;
    for (i = 0;; i++) {
      tmp[i] += carry;
      carry = tmp[i] >> 29;
      tmp[i] &= kBottom29Bits;

      i++;
      if (i == NLIMBS)
        break;

      tmp[i] += carry;
      carry = tmp[i] >> 28;
      tmp[i] &= kBottom28Bits;
    }

    felem_reduce_carry(tmp, carry);
  } while (carry);

  /* tmp < 2**257, so the only possible zero values are 0, p and 2p. */
  return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
         memcmp(tmp, kP, sizeof(tmp)) == 0 ||
         memcmp(tmp, k2P, sizeof(tmp)) == 0;
}
793
794
795/* Group operations:
796 *
797 * Elements of the elliptic curve group are represented in Jacobian
798 * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
799 * Jacobian form. */
800
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 *
 * The output arguments may alias the inputs. */
static void point_double(felem x_out, felem y_out, felem z_out, const felem x,
                         const felem y, const felem z) {
  felem delta, gamma, alpha, beta, tmp, tmp2;

  felem_square(delta, z);    /* delta = z^2 */
  felem_square(gamma, y);    /* gamma = y^2 */
  felem_mul(beta, x, gamma); /* beta = x * gamma */

  /* alpha = 3 * (x - delta) * (x + delta) */
  felem_sum(tmp, x, delta);
  felem_diff(tmp2, x, delta);
  felem_mul(alpha, tmp, tmp2);
  felem_scalar_3(alpha);

  /* z_out = (y + z)^2 - gamma - delta  (== 2*y*z) */
  felem_sum(tmp, y, z);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, gamma);
  felem_diff(z_out, tmp, delta);

  /* x_out = alpha^2 - 8 * beta */
  felem_scalar_4(beta);
  felem_square(x_out, alpha);
  felem_diff(x_out, x_out, beta);
  felem_diff(x_out, x_out, beta);

  /* y_out = alpha * (4*beta - x_out) - 8 * gamma^2 */
  felem_diff(tmp, beta, x_out);
  felem_mul(tmp, alpha, tmp);
  felem_square(tmp2, gamma);
  felem_scalar_8(tmp2);
  felem_diff(y_out, tmp, tmp2);
}
833
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
                            const felem x1, const felem y1, const felem z1,
                            const felem x2, const felem y2) {
  felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

  felem_square(z1z1, z1);   /* z1z1 = z1^2 */
  felem_sum(tmp, z1, z1);   /* tmp = 2*z1 */

  felem_mul(u2, x2, z1z1);     /* u2 = x2 * z1^2 */
  felem_mul(z1z1z1, z1, z1z1); /* z1z1z1 = z1^3 */
  felem_mul(s2, y2, z1z1z1);   /* s2 = y2 * z1^3 */
  felem_diff(h, u2, x1);       /* h = u2 - x1 */
  felem_sum(i, h, h);
  felem_square(i, i);          /* i = (2*h)^2 */
  felem_mul(j, h, i);          /* j = h * i */
  felem_diff(r, s2, y1);
  felem_sum(r, r, r);          /* r = 2 * (s2 - y1) */
  felem_mul(v, x1, i);         /* v = x1 * i */

  felem_mul(z_out, tmp, h);    /* z_out = 2 * z1 * h */
  felem_square(rr, r);
  /* x_out = r^2 - j - 2*v */
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r * (v - x_out) - 2 * y1 * j */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, y1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
872
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly. */
static void point_add(felem x_out, felem y_out, felem z_out, const felem x1,
                      const felem y1, const felem z1, const felem x2,
                      const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

  felem_square(z1z1, z1);  /* z1z1 = z1^2 */
  felem_square(z2z2, z2);  /* z2z2 = z2^2 */
  felem_mul(u1, x1, z2z2); /* u1 = x1 * z2^2 */

  /* tmp = (z1 + z2)^2 - z1^2 - z2^2 == 2 * z1 * z2 */
  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  felem_mul(z2z2z2, z2, z2z2); /* z2z2z2 = z2^3 */
  felem_mul(s1, y1, z2z2z2);   /* s1 = y1 * z2^3 */

  felem_mul(u2, x2, z1z1);     /* u2 = x2 * z1^2 */
  felem_mul(z1z1z1, z1, z1z1); /* z1z1z1 = z1^3 */
  felem_mul(s2, y2, z1z1z1);   /* s2 = y2 * z1^3 */
  felem_diff(h, u2, u1);       /* h = u2 - u1 */
  felem_sum(i, h, h);
  felem_square(i, i);          /* i = (2*h)^2 */
  felem_mul(j, h, i);          /* j = h * i */
  felem_diff(r, s2, s1);
  felem_sum(r, r, r);          /* r = 2 * (s2 - s1) */
  felem_mul(v, u1, i);         /* v = u1 * i */

  felem_mul(z_out, tmp, h);    /* z_out = 2 * z1 * z2 * h */
  felem_square(rr, r);
  /* x_out = r^2 - j - 2*v */
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  /* y_out = r * (v - x_out) - 2 * s1 * j */
  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
919
/* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
 *                                                        {x2,y2,z2}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * This function handles the case where {x1,y1,z1}={x2,y2,z2}. Unlike
 * point_add, it is NOT constant time: it branches on whether the two
 * inputs are equal, so it must only be used on public values. */
static void point_add_or_double_vartime(
    felem x_out, felem y_out, felem z_out, const felem x1, const felem y1,
    const felem z1, const felem x2, const felem y2, const felem z2) {
  felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
  char x_equal, y_equal;

  /* Same add-2007-bl sequence as point_add; see the comments there. */
  felem_square(z1z1, z1);
  felem_square(z2z2, z2);
  felem_mul(u1, x1, z2z2);

  felem_sum(tmp, z1, z2);
  felem_square(tmp, tmp);
  felem_diff(tmp, tmp, z1z1);
  felem_diff(tmp, tmp, z2z2);

  felem_mul(z2z2z2, z2, z2z2);
  felem_mul(s1, y1, z2z2z2);

  felem_mul(u2, x2, z1z1);
  felem_mul(z1z1z1, z1, z1z1);
  felem_mul(s2, y2, z1z1z1);
  felem_diff(h, u2, u1);
  /* h == 0 iff the affine x coordinates are equal. */
  x_equal = felem_is_zero_vartime(h);
  felem_sum(i, h, h);
  felem_square(i, i);
  felem_mul(j, h, i);
  felem_diff(r, s2, s1);
  /* r == 0 (before doubling) iff the affine y coordinates are equal. */
  y_equal = felem_is_zero_vartime(r);
  if (x_equal && y_equal) {
    /* Same point: the addition formula degenerates, so double instead. */
    point_double(x_out, y_out, z_out, x1, y1, z1);
    return;
  }
  felem_sum(r, r, r);
  felem_mul(v, u1, i);

  /* If x_equal but !y_equal, the points are inverses; h == 0 makes
   * z_out == 0, which correctly encodes the point at infinity. */
  felem_mul(z_out, tmp, h);
  felem_square(rr, r);
  felem_diff(x_out, rr, j);
  felem_diff(x_out, x_out, v);
  felem_diff(x_out, x_out, v);

  felem_diff(tmp, v, x_out);
  felem_mul(y_out, tmp, r);
  felem_mul(tmp, s1, j);
  felem_diff(y_out, y_out, tmp);
  felem_diff(y_out, y_out, tmp);
}
973
974/* copy_conditional sets out=in if mask = 0xffffffff in constant time.
975 *
976 * On entry: mask is either 0 or 0xffffffff. */
977static void copy_conditional(felem out, const felem in, limb mask) {
978  int i;
979
980  for (i = 0; i < NLIMBS; i++) {
981    const limb tmp = mask & (in[i] ^ out[i]);
982    out[i] ^= tmp;
983  }
984}
985
/* select_affine_point sets {out_x,out_y} to the index'th entry of table.
 * On entry: index < 16, table[0] must be zero.
 *
 * Every table entry is read regardless of index so that the memory access
 * pattern (and thus cache behavior) is independent of the secret index. */
static void select_affine_point(felem out_x, felem out_y, const limb* table,
                                limb index) {
  limb i, j;

  memset(out_x, 0, sizeof(felem));
  memset(out_y, 0, sizeof(felem));

  /* Entry 0 is all-zero by contract, so scanning starts at 1; the zeroed
   * outputs already represent entry 0. */
  for (i = 1; i < 16; i++) {
    /* Fold the (up to 4) bits of i ^ index down into bit 0, then turn it
     * into a mask: all-ones when i == index, zero otherwise. */
    limb mask = i ^ index;
    mask |= mask >> 2;
    mask |= mask >> 1;
    mask &= 1;
    mask--;
    for (j = 0; j < NLIMBS; j++, table++) {
      out_x[j] |= *table & mask;
    }
    for (j = 0; j < NLIMBS; j++, table++) {
      out_y[j] |= *table & mask;
    }
  }
}
1009
/* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
 * table. On entry: index < 16, table[0] must be zero.
 *
 * As with select_affine_point, every entry is read so the access pattern
 * does not leak the secret index through the cache. */
static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
                                  const limb* table, limb index) {
  limb i, j;

  memset(out_x, 0, sizeof(felem));
  memset(out_y, 0, sizeof(felem));
  memset(out_z, 0, sizeof(felem));

  /* The implicit value at index 0 is all zero. We don't need to perform that
   * iteration of the loop because we already set out_* to zero. */
  table += 3 * NLIMBS;

  // Hit all entries to obscure cache profiling.
  for (i = 1; i < 16; i++) {
    /* mask is all-ones when i == index and zero otherwise; see
     * select_affine_point for the bit-folding trick. */
    limb mask = i ^ index;
    mask |= mask >> 2;
    mask |= mask >> 1;
    mask &= 1;
    mask--;
    for (j = 0; j < NLIMBS; j++, table++) {
      out_x[j] |= *table & mask;
    }
    for (j = 0; j < NLIMBS; j++, table++) {
      out_y[j] |= *table & mask;
    }
    for (j = 0; j < NLIMBS; j++, table++) {
      out_z[j] |= *table & mask;
    }
  }
}
1042
/* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
 * number. Note that the value of scalar must be less than the order of the
 * group.
 *
 * This uses a comb technique: each iteration consumes one scalar bit from
 * each of eight positions (four per table lookup), so only 31 doublings and
 * 64 additions are needed. */
static void scalar_base_mult(felem nx, felem ny, felem nz,
                             const p256_int* scalar) {
  int i, j;
  /* n_is_infinity_mask is all-ones while {nx,ny,nz} is still the point at
   * infinity, zero afterwards. */
  limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
  u32 table_offset;

  felem px, py;
  felem tx, ty, tz;

  /* Start at the point at infinity (all-zero Jacobian coordinates). */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));

  /* The loop adds bits at positions 0, 64, 128 and 192, followed by
   * positions 32,96,160 and 224 and does this 32 times. */
  for (i = 0; i < 32; i++) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
    }
    table_offset = 0;
    for (j = 0; j <= 32; j += 32) {
      /* Gather four scalar bits, 64 positions apart, into a 4-bit index
       * into the precomputed table for this window. */
      char bit0 = p256_get_bit(scalar, 31 - i + j);
      char bit1 = p256_get_bit(scalar, 95 - i + j);
      char bit2 = p256_get_bit(scalar, 159 - i + j);
      char bit3 = p256_get_bit(scalar, 223 - i + j);
      limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);

      /* NOTE(review): kPrecomputed is assumed to hold, per window, the 15
       * nonzero affine combinations of the relevant powers-of-two multiples
       * of G (15 entries * 2 felems * NLIMBS limbs = 30*NLIMBS) — confirm
       * against the table generator. */
      select_affine_point(px, py, kPrecomputed + table_offset, index);
      table_offset += 30 * NLIMBS;

      /* Since scalar is less than the order of the group, we know that
       * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
       * below. */
      point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
      /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
       * (a.k.a.  the point at infinity). We handle that situation by
       * copying the point from the table. */
      copy_conditional(nx, px, n_is_infinity_mask);
      copy_conditional(ny, py, n_is_infinity_mask);
      copy_conditional(nz, kOne, n_is_infinity_mask);

      /* Equally, the result is also wrong if the point from the table is
       * zero, which happens when the index is zero. We handle that by
       * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0. */
      p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
      mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
      copy_conditional(nx, tx, mask);
      copy_conditional(ny, ty, mask);
      copy_conditional(nz, tz, mask);
      /* If p was not zero, then n is now non-zero. */
      n_is_infinity_mask &= ~p_is_noninfinite_mask;
    }
  }
}
1100
/* point_to_affine converts a Jacobian point to an affine point. If the input
 * is the point at infinity then it returns (0, 0) in constant time.
 *
 * Affine coordinates are x' = nx/nz^2, y' = ny/nz^3. */
static void point_to_affine(felem x_out, felem y_out, const felem nx,
                            const felem ny, const felem nz) {
  felem z_inv, z_inv_sq;
  felem_inv(z_inv, nz);             /* z_inv = 1/nz (0 maps to 0) */
  felem_square(z_inv_sq, z_inv);    /* z_inv_sq = 1/nz^2 */
  felem_mul(x_out, nx, z_inv_sq);   /* x_out = nx / nz^2 */
  felem_mul(z_inv, z_inv, z_inv_sq); /* z_inv = 1/nz^3 */
  felem_mul(y_out, ny, z_inv);      /* y_out = ny / nz^3 */
}
1112
/* scalar_mult sets {nx,ny,nz} = scalar*{x,y}, using a fixed 4-bit window
 * over a table of the first 16 multiples of {x,y}. */
static void scalar_mult(felem nx, felem ny, felem nz, const felem x,
                        const felem y, const p256_int* scalar) {
  int i;
  felem px, py, pz, tx, ty, tz;
  felem precomp[16][3];
  limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;

  /* We precompute 0,1,2,... times {x,y}. Only entry 0 needs explicit
   * zeroing (it is the point at infinity); the rest are filled below. */
  memset(precomp, 0, sizeof(felem) * 3);
  memcpy(&precomp[1][0], x, sizeof(felem));
  memcpy(&precomp[1][1], y, sizeof(felem));
  memcpy(&precomp[1][2], kOne, sizeof(felem));

  /* precomp[i] = 2*precomp[i/2] for even i, precomp[i-1] + {x,y} for odd. */
  for (i = 2; i < 16; i += 2) {
    point_double(precomp[i][0], precomp[i][1], precomp[i][2],
                 precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);

    point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
                    precomp[i][0], precomp[i][1], precomp[i][2], x, y);
  }

  /* Start at the point at infinity. */
  memset(nx, 0, sizeof(felem));
  memset(ny, 0, sizeof(felem));
  memset(nz, 0, sizeof(felem));
  n_is_infinity_mask = -1;

  /* We add in a window of four bits each iteration and do this 64 times. */
  for (i = 0; i < 256; i += 4) {
    if (i) {
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
      point_double(nx, ny, nz, nx, ny, nz);
    }

    /* Take the next four scalar bits, most-significant first. */
    index = (p256_get_bit(scalar, 255 - i - 0) << 3) |
            (p256_get_bit(scalar, 255 - i - 1) << 2) |
            (p256_get_bit(scalar, 255 - i - 2) << 1) |
            p256_get_bit(scalar, 255 - i - 3);

    /* See the comments in scalar_base_mult about handling infinities. */
    select_jacobian_point(px, py, pz, precomp[0][0], index);
    point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
    copy_conditional(nx, px, n_is_infinity_mask);
    copy_conditional(ny, py, n_is_infinity_mask);
    copy_conditional(nz, pz, n_is_infinity_mask);

    p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
    mask = p_is_noninfinite_mask & ~n_is_infinity_mask;

    copy_conditional(nx, tx, mask);
    copy_conditional(ny, ty, mask);
    copy_conditional(nz, tz, mask);
    n_is_infinity_mask &= ~p_is_noninfinite_mask;
  }
}
1170
1171#define kRDigits {2, 0, 0, 0xfffffffe, 0xffffffff, 0xffffffff, 0xfffffffd, 1} // 2^257 mod p256.p
1172
1173#define kRInvDigits {0x80000000, 1, 0xffffffff, 0, 0x80000001, 0xfffffffe, 1, 0x7fffffff}  // 1 / 2^257 mod p256.p
1174
1175static const p256_int kR = { kRDigits };
1176static const p256_int kRInv = { kRInvDigits };
1177
/* to_montgomery sets out = R*in, where R = 2^257.
 *
 * The product R*in mod p is then repacked into the felem limb format:
 * even-index limbs hold 29 bits, odd-index limbs 28 bits. */
static void to_montgomery(felem out, const p256_int* in) {
  p256_int in_shifted;
  int i;

  p256_init(&in_shifted);
  /* in_shifted = in * kR mod p, with kR = 2^257 mod p. */
  p256_modmul(&SECP256r1_p, in, 0, &kR, &in_shifted);

  /* Peel limbs off the bottom, alternating 29- and 28-bit widths. */
  for (i = 0; i < NLIMBS; i++) {
    if ((i & 1) == 0) {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom29Bits;
      p256_shr(&in_shifted, 29, &in_shifted);
    } else {
      out[i] = P256_DIGIT(&in_shifted, 0) & kBottom28Bits;
      p256_shr(&in_shifted, 28, &in_shifted);
    }
  }

  /* Scrub the intermediate value: it is derived from a potential secret. */
  p256_clear(&in_shifted);
}
1198
/* from_montgomery sets out=in/R, where R = 2^257.
 *
 * The felem limbs (alternating 29/28 bits) are first reassembled into a
 * p256_int, then multiplied by kRInv = 1/2^257 mod p. */
static void from_montgomery(p256_int* out, const felem in) {
  p256_int result, tmp;
  int i, top;

  p256_init(&result);
  p256_init(&tmp);

  /* Accumulate limbs from most-significant downwards, shifting by each
   * limb's width. |top| collects the bits that overflow 256 bits; they are
   * passed to p256_modmul as the high word. */
  p256_add_d(&tmp, in[NLIMBS - 1], &result);
  for (i = NLIMBS - 2; i >= 0; i--) {
    if ((i & 1) == 0) {
      top = p256_shl(&result, 29, &tmp);
    } else {
      top = p256_shl(&result, 28, &tmp);
    }
    top |= p256_add_d(&tmp, in[i], &result);
  }

  p256_modmul(&SECP256r1_p, &kRInv, top, &result, out);

  /* Scrub intermediates: they may be derived from secrets. */
  p256_clear(&result);
  p256_clear(&tmp);
}
1222
1223/* p256_base_point_mul sets {out_x,out_y} = nG, where n is < the
1224 * order of the group. */
1225void p256_base_point_mul(const p256_int* n, p256_int* out_x, p256_int* out_y) {
1226  felem x, y, z;
1227
1228  scalar_base_mult(x, y, z, n);
1229
1230  {
1231    felem x_affine, y_affine;
1232
1233    point_to_affine(x_affine, y_affine, x, y, z);
1234    from_montgomery(out_x, x_affine);
1235    from_montgomery(out_y, y_affine);
1236  }
1237}
1238
/* p256_points_mul_vartime sets {out_x,out_y} = n1*G + n2*{in_x,in_y}, where
 * n1 and n2 are < the order of the group.
 *
 * As indicated by the name, this function operates in variable time. This
 * is safe because it's used for signature validation which doesn't deal
 * with secrets. */
void p256_points_mul_vartime(
    const p256_int* n1, const p256_int* n2, const p256_int* in_x,
    const p256_int* in_y, p256_int* out_x, p256_int* out_y) {
  felem x1, y1, z1, x2, y2, z2, px, py;

  /* If both scalars are zero, then the result is the point at infinity. */
  if (p256_is_zero(n1) != 0 && p256_is_zero(n2) != 0) {
    p256_clear(out_x);
    p256_clear(out_y);
    return;
  }

  /* Convert the input point to Montgomery form and compute both scalar
   * multiples; {x1,y1,z1} = n1*G and {x2,y2,z2} = n2*{in_x,in_y}. */
  to_montgomery(px, in_x);
  to_montgomery(py, in_y);
  scalar_base_mult(x1, y1, z1, n1);
  scalar_mult(x2, y2, z2, px, py, n2);

  if (p256_is_zero(n2) != 0) {
    /* If n2 == 0, then {x2,y2,z2} is zero and the result is just
     * {x1,y1,z1}. */
  } else if (p256_is_zero(n1) != 0) {
    /* If n1 == 0, then {x1,y1,z1} is zero and the result is just
     * {x2,y2,z2}. */
    memcpy(x1, x2, sizeof(x2));
    memcpy(y1, y2, sizeof(y2));
    memcpy(z1, z2, sizeof(z2));
  } else {
    /* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
    point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
  }

  /* px/py are reused here as scratch for the affine result. */
  point_to_affine(px, py, x1, y1, z1);
  from_montgomery(out_x, px);
  from_montgomery(out_y, py);
}
1280