1/*
2 * Camellia Cipher Algorithm (x86_64)
3 *
4 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
19 * USA
20 *
21 */
22
23.file "camellia-x86_64-asm_64.S"
24.text
25
/*
 * Lookup tables, defined outside this file. xor2ror16() indexes them with a
 * single byte and a scale of 8, i.e. as arrays of 64-bit entries.
 * NOTE(review): the digit patterns in the names presumably describe the byte
 * layout of each entry -- confirm against the table definitions.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the tables, used by the round macros. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110
43
/* Size in bytes of the subkey table that sits at the start of the context. */
#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx: byte offsets of the fields used by this file.
 * key_table is read as an array of 8-byte subkeys; key_length follows it
 * directly and is only ever compared against 16 here.
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN
49
/*
 * register macros
 *
 * CTX is the context pointer (first argument); nothing in this file writes
 * to it. RIO is the pointer the in/out-pack macros load from and store to.
 * RIO and RT0 are both %rsi, so every round or FL macro destroys RIO; the
 * functions in this file therefore park dst in RDST and reload RIO before
 * writing the output.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/*
 * Cipher state: each 16-byte block lives in one RAB/RCD pair. Suffix 0 is
 * the first block, suffix 1 the second block of the 2-way code. All four are
 * legacy registers because xor2ror16() needs their bl and bh byte aliases.
 * Note that RCD0/RCD1 overlap the incoming %rcx/%rdx arguments and RAB1 is
 * the callee-saved %rbx.
 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

/* 32-bit views of the state registers. */
#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

/* Bits 0-7 of the state registers. */
#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

/* Bits 8-15 of the state registers. */
#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Scratch registers. RT0/RT1 receive the movzbl results in xor2ror16(); an
 * instruction that names %ah/%bh/%ch/%dh cannot carry a REX prefix, so these
 * two have to be non-REX registers. RT1 is the callee-saved %rbp and is
 * parked in RRBP by every function in this file.
 */
#define RT0 %rsi
#define RT1 %rbp
#define RT2 %r8

#define RT0d %esi
#define RT1d %ebp
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR: holds the "bool xor" argument in the encrypt functions; the decrypt
 *       functions reuse it as a temporary (the 2-way one also keeps %rbx
 *       in it).
 * RRBP: original %rbp while RT1 is in use.
 * RDST: dst pointer while %rsi is in use as RT0.
 */
#define RXOR %r9
#define RRBP %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b
91
/*
 * xor2ror16: fold two table lookups into dst and rotate ab by 16 bits.
 *
 *   tmp2 = ab & 0xff           (index into T0)
 *   tmp1 = (ab >> 8) & 0xff    (index into T1)
 *   ab   = ror64(ab, 16)
 *   dst ^= T0[tmp2] ^ T1[tmp1] (8-byte table entries)
 *
 * T0, T1:     table symbols, addressed absolutely with a scaled index.
 * tmp1, tmp2: scratch registers; their "d" (32-bit) forms are written.
 * ab:         state register with bl/bh aliases; rotated in place.
 * dst:        64-bit register that accumulates the lookups.
 *
 * Both bytes are extracted before the rotate. Four consecutive uses on the
 * same ab rotate it by 64 bits in total, i.e. leave it unchanged.
 * Clobbers tmp1, tmp2 and the flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;
98
99/**********************************************************************
100  1-way camellia
101 **********************************************************************/
/*
 * roundsm: one round on the single-block state.
 *
 *   cd0 ^= key_table[subkey] ^ (XOR of eight table lookups, one per byte
 *                               of ab0)
 *
 * ab, cd: register-pair prefixes (RAB or RCD); "0" is pasted on to select
 *         the first block.
 * subkey: index of the 8-byte subkey (byte offset subkey * 8 from CTX).
 *
 * RT2 starts out as the subkey and collects every second pair of lookups,
 * cd0 collects the others; the final xorq merges the two accumulators.
 * The four xor2ror16() steps rotate ab0 by 64 bits, so ab0 is unchanged on
 * exit. Clobbers RT0, RT1, RT2 and the flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;
111
/*
 * fls: key-dependent mixing of the two halves of each state register for
 * the single-block state. With lo32()/hi32() denoting the low/high 32 bits
 * and KL = key_table[kl], KR = key_table[kr], the steps are, in this order:
 *
 *   hi32(l0) ^= rol32(lo32(l0) & lo32(KL), 1)
 *   lo32(r0) ^= hi32(r0) | hi32(KR)
 *   lo32(l0) ^= hi32(l0) | hi32(KL)
 *   hi32(r0) ^= rol32(lo32(r0) & lo32(KR), 1)
 *
 * Each step sees the results of the earlier ones. This has the form of the
 * Camellia FL function applied to l0 and FL^-1 applied to r0, with the first
 * 32-bit word of each half kept in the low 32 bits.
 *
 * l, r:   register-pair prefixes (RAB / RCD).
 * kl, kr: subkey indices (byte offset index * 8 from CTX).
 * The movl loads zero-extend, so the shlq $32 leaves the low half clear.
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;
132
/*
 * enc_rounds: six consecutive rounds for encryption. The roles of RAB and
 * RCD alternate each round; subkeys i + 2 through i + 7 are used in
 * ascending order.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);
140
/*
 * enc_fls: FL layer between round groups for encryption; subkey i is used
 * for RAB and subkey i + 1 for RCD.
 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);
143
/*
 * enc_inpack: load one 16-byte block from (RIO) into RAB0/RCD0.
 *
 * Each 8-byte half is byte-swapped and then rotated by 32 bits, so the
 * first four input bytes, read as a big-endian word, end up in the low 32
 * bits and the next four in the high 32 bits. Only RAB0 is XORed with
 * key_table[0] here.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;
152
/*
 * enc_outunpack: undo the inpack transform and emit one 16-byte block.
 *
 * op:  mnemonic stem, "mov" to store or "xor" to XOR into the bytes already
 *      at (RIO); "q" is appended.
 * max: register holding the index of the subkey XORed into RCD0
 *      (key_table + max * 8).
 *
 * RCD0 is written to the first 8 bytes and RAB0 to the second 8 bytes, i.e.
 * the halves are swapped relative to how enc_inpack() loaded them.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);
161
/*
 * dec_rounds: six consecutive rounds for decryption. Same structure as
 * enc_rounds() but the subkeys i + 7 through i + 2 are used in descending
 * order.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);
169
/*
 * dec_fls: FL layer between round groups for decryption; compared with
 * enc_fls() the two subkeys are exchanged (i + 1 for RAB, i for RCD).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);
172
/*
 * dec_inpack: load one 16-byte block from (RIO) into RAB0/RCD0 for
 * decryption.
 *
 * Same byte-swap and 32-bit rotate as enc_inpack(), but RAB0 is XORed with
 * key_table[max] instead of key_table[0].
 *
 * max: register holding the subkey index (scaled by 8 in the address).
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;
181
/*
 * dec_outunpack: undo the inpack transform and store one 16-byte block at
 * (RIO). RCD0 is XORed with key_table[0] and stored first, RAB0 second.
 * Unlike enc_outunpack() the result is always stored, never XORed in.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);
190
/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * Reads 16 bytes from src, runs three groups of six rounds separated by FL
 * layers (plus a fourth group when key_length is not 16) and either stores
 * the 16-byte result at dst or XORs it into the bytes already at dst,
 * depending on the xor argument. The whole input is loaded before anything
 * is written.
 *
 * Leaf function; does not touch the stack. %rbp is used as scratch (RT1)
 * and is preserved through RRBP (%r10). Overwrites %rax, %rcx, %rdx, %rsi,
 * %r8-%r11 and the flags.
 */
.global __camellia_enc_blk;
.type   __camellia_enc_blk,@function;

__camellia_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %rbp, RRBP;	/* RT1 is %rbp; keep the original value */

	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* %rsi becomes RIO, then RT0 */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	/* set only after the rounds, which use RT1 as scratch */
	movl $24, RT1d; /* max */

	/* only the low byte of key_length is compared here */
	cmpb $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;
.size __camellia_enc_blk,.-__camellia_enc_blk;
239
/*
 * camellia_dec_blk: decrypt one 16-byte block.
 *
 * Reads 16 bytes from src, walks the subkeys in reverse order (three groups
 * of six rounds when key_length is 16, four groups otherwise) and stores
 * the 16-byte result at dst. The whole input is loaded before anything is
 * written.
 *
 * Leaf function; does not touch the stack. %rbp is used as scratch (RT1)
 * and is preserved through RRBP (%r10). Overwrites %rax, %rcx, %rdx, %rsi,
 * %r8-%r11 and the flags.
 */
.global camellia_dec_blk;
.type   camellia_dec_blk,@function;

camellia_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; the movl's keep the flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;	/* RT1 is %rbp; keep the original value */
	movq %rsi, RDST;	/* %rsi becomes RIO, then RT0 */
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* must be tested before the first round, which overwrites RT2 */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;
.size camellia_dec_blk,.-camellia_dec_blk;
279
280/**********************************************************************
281  2-way camellia
282 **********************************************************************/
/*
 * roundsm2: one round on both blocks of the 2-way state.
 *
 *   cd0 ^= key_table[subkey] ^ (eight table lookups on the bytes of ab0)
 *   cd1 ^= key_table[subkey] ^ (eight table lookups on the bytes of ab1)
 *
 * ab, cd: register-pair prefixes (RAB or RCD); "0"/"1" select the block.
 * subkey: index of the 8-byte subkey (byte offset subkey * 8 from CTX).
 *
 * For block 1 the subkey is XORed into cd1 up front and all lookups go
 * straight into cd1. For block 0, RT2 (initially the subkey) and cd0 are
 * used as two accumulators; they are merged by the xorq that is placed
 * between the block 1 lookups. ab0 and ab1 are each rotated by 64 bits in
 * total and therefore unchanged on exit.
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
297
/*
 * fls2: the fls() transformation applied to both blocks of the 2-way state.
 *
 * With KL = key_table[kl], KR = key_table[kr] and n = 0, 1, each block
 * goes through, in this order:
 *
 *   hi32(l_n) ^= rol32(lo32(l_n) & lo32(KL), 1)
 *   lo32(r_n) ^= hi32(r_n) | hi32(KR)
 *   lo32(l_n) ^= hi32(l_n) | hi32(KL)
 *   hi32(r_n) ^= rol32(lo32(r_n) & lo32(KR), 1)
 *
 * The first two steps of block 0 are followed by the first two steps of
 * block 1 (the doubly indented lines), then the last two steps of each.
 * The temporaries cycle through RT0/RT1/RT2 from one group to the next.
 *
 * l, r:   register-pair prefixes (RAB / RCD).
 * kl, kr: subkey indices (byte offset index * 8 from CTX).
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;
338
/*
 * enc_rounds2: six consecutive 2-way rounds for encryption. The roles of
 * RAB and RCD alternate each round; subkeys i + 2 through i + 7 are used in
 * ascending order.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);
346
/*
 * enc_fls2: 2-way FL layer between round groups for encryption; subkey i is
 * used for RAB and subkey i + 1 for RCD.
 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);
349
/*
 * enc_inpack2: load two consecutive 16-byte blocks (32 bytes at RIO) into
 * RAB0/RCD0 and RAB1/RCD1.
 *
 * Each 8-byte half is byte-swapped and rotated by 32 bits (rotating a
 * 64-bit register left or right by 32 gives the same result). RAB0 and
 * RAB1 are both XORed with key_table[0]. RCD1 is %rdx, so the source
 * pointer must already have been moved to RIO.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;
366
/*
 * enc_outunpack2: undo the inpack transform and emit two 16-byte blocks
 * (32 bytes at RIO).
 *
 * op:  mnemonic stem, "mov" to store or "xor" to XOR into the bytes already
 *      at (RIO); "q" is appended.
 * max: register holding the index of the subkey XORed into RCD0 and RCD1
 *      (key_table + max * 8).
 *
 * Per block, the RCD half is written first and the RAB half second.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);
383
/*
 * dec_rounds2: six consecutive 2-way rounds for decryption. Same structure
 * as enc_rounds2() but the subkeys i + 7 through i + 2 are used in
 * descending order.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);
391
/*
 * dec_fls2: 2-way FL layer between round groups for decryption; compared
 * with enc_fls2() the two subkeys are exchanged (i + 1 for RAB, i for RCD).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);
394
/*
 * dec_inpack2: load two consecutive 16-byte blocks (32 bytes at RIO) into
 * RAB0/RCD0 and RAB1/RCD1 for decryption.
 *
 * Same byte-swap and 32-bit rotate as enc_inpack2(), but RAB0 and RAB1 are
 * XORed with key_table[max] instead of key_table[0].
 *
 * max: register holding the subkey index (scaled by 8 in the address).
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;
411
/*
 * dec_outunpack2: undo the inpack transform and store two 16-byte blocks
 * (32 bytes at RIO). RCD0 and RCD1 are XORed with key_table[0]; per block,
 * the RCD half is stored first and the RAB half second. The result is
 * always stored, never XORed in.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);
428
/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src, runs both blocks through three groups of six
 * rounds separated by FL layers (plus a fourth group when key_length is not
 * 16) and either stores the 32-byte result at dst or XORs it into the bytes
 * already at dst, depending on the xor argument. The whole input is loaded
 * before anything is written.
 *
 * %rbx (RAB1) is saved on the stack; %rbp (RT1) is preserved through RRBP
 * (%r10). Overwrites %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
 */
.global __camellia_enc_blk_2way;
.type   __camellia_enc_blk_2way,@function;

__camellia_enc_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;		/* RAB1 is the callee-saved %rbx */

	movq %rbp, RRBP;	/* RT1 is %rbp; keep the original value */
	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* %rsi becomes RIO, then RT0 */
	movq %rdx, RIO;		/* %rdx becomes RCD1 */

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* set only after the rounds, which use RT2 as scratch */
	movl $24, RT2d; /* max */

	/* only the low byte of key_length is compared here */
	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* movq leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
.size __camellia_enc_blk_2way,.-__camellia_enc_blk_2way;
479
/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src, walks the subkeys in reverse order for both
 * blocks (three groups of six rounds when key_length is 16, four groups
 * otherwise) and stores the 32-byte result at dst. The whole input is
 * loaded before anything is written.
 *
 * Leaf function; does not touch the stack. %rbx (RAB1) is preserved through
 * RXOR (%r9) and %rbp (RT1) through RRBP (%r10). Overwrites %rax, %rcx,
 * %rdx, %rsi, %r8-%r11 and the flags.
 */
.global camellia_dec_blk_2way;
.type   camellia_dec_blk_2way,@function;

camellia_dec_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; the movl's keep the flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;	/* RXOR is free again; keep %rbx (RAB1) in it */
	movq %rbp, RRBP;	/* RT1 is %rbp; keep the original value */
	movq %rsi, RDST;	/* %rsi becomes RIO, then RT0 */
	movq %rdx, RIO;		/* %rdx becomes RCD1 */

	dec_inpack2(RT2);

	/* must be tested before the first round, which overwrites RT2 */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
.size camellia_dec_blk_2way,.-camellia_dec_blk_2way;
521