1#!/usr/bin/env perl
2
3# Copyright (c) 2017, Shay Gueron.
4# Copyright (c) 2017, Google Inc.
5#
6# Permission to use, copy, modify, and/or distribute this software for any
7# purpose with or without fee is hereby granted, provided that the above
8# copyright notice and this permission notice appear in all copies.
9#
10# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
13# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
15# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
18use warnings FATAL => 'all';
19
20$flavour = shift;
21$output  = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
32*STDOUT=*OUT;
33
34$code.=<<___;
35.data
36
37.align 16
38one:
39.quad 1,0
40two:
41.quad 2,0
42three:
43.quad 3,0
44four:
45.quad 4,0
46five:
47.quad 5,0
48six:
49.quad 6,0
50seven:
51.quad 7,0
52eight:
53.quad 8,0
54
55OR_MASK:
56.long 0x00000000,0x00000000,0x00000000,0x80000000
57poly:
58.quad 0x1, 0xc200000000000000
59mask:
60.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
61con1:
62.long 1,1,1,1
63con2:
64.long 0x1b,0x1b,0x1b,0x1b
65con3:
66.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
67and_mask:
68.long 0,0xffffffff, 0xffffffff, 0xffffffff
69___
70
71$code.=<<___;
72.text
73___
74
75sub gfmul {
76  #########################
77  # a = T
78  # b = TMP0 - remains unchanged
79  # res = T
80  # uses also TMP1,TMP2,TMP3,TMP4
81  # __m128i GFMUL(__m128i A, __m128i B);
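  #
  # The multiplication is done in POLYVAL's representation of GF(2^128): a
  # 256-bit carry-less product is formed with four VPCLMULQDQs and then folded
  # twice against the constant at poly (x^128 + x^127 + x^126 + x^121 + 1), so
  # the result is effectively a*b*x^-128, i.e. the dot operation of RFC 8452.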
82
83  my $T = "%xmm0";
84  my $TMP0 = "%xmm1";
85  my $TMP1 = "%xmm2";
86  my $TMP2 = "%xmm3";
87  my $TMP3 = "%xmm4";
88  my $TMP4 = "%xmm5";
89
90  $code.=<<___;
91.type GFMUL,\@abi-omnipotent
92.align 16
93GFMUL:
94.cfi_startproc
95    vpclmulqdq  \$0x00, $TMP0, $T, $TMP1
96    vpclmulqdq  \$0x11, $TMP0, $T, $TMP4
97    vpclmulqdq  \$0x10, $TMP0, $T, $TMP2
98    vpclmulqdq  \$0x01, $TMP0, $T, $TMP3
99    vpxor       $TMP3, $TMP2, $TMP2
100    vpslldq     \$8, $TMP2, $TMP3
101    vpsrldq     \$8, $TMP2, $TMP2
102    vpxor       $TMP3, $TMP1, $TMP1
103    vpxor       $TMP2, $TMP4, $TMP4
104
105    vpclmulqdq  \$0x10, poly(%rip), $TMP1, $TMP2
106    vpshufd     \$78, $TMP1, $TMP3
107    vpxor       $TMP3, $TMP2, $TMP1
108
109    vpclmulqdq  \$0x10, poly(%rip), $TMP1, $TMP2
110    vpshufd     \$78, $TMP1, $TMP3
111    vpxor       $TMP3, $TMP2, $TMP1
112
113    vpxor       $TMP4, $TMP1, $T
114    ret
115.cfi_endproc
116.size GFMUL, .-GFMUL
117___
118}
119gfmul();
120
121sub aesgcmsiv_htable_init {
122  # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
123  # |out_htable|.
124  # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
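  #
  # The table holds H^1..H^8 (lowest power first, 16 bytes each), which lets
  # aesgcmsiv_htable_polyval fold eight blocks per polynomial reduction using
  # the aggregated ("schoolbook") method.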
125
126  my $Htbl = "%rdi";
127  my $H = "%rsi";
128  my $T = "%xmm0";
129  my $TMP0 = "%xmm1";
130
131$code.=<<___;
132.globl aesgcmsiv_htable_init
133.type aesgcmsiv_htable_init,\@function,2
134.align 16
135aesgcmsiv_htable_init:
136.cfi_startproc
137    vmovdqa ($H), $T
138    vmovdqa $T, $TMP0
139    vmovdqa $T, ($Htbl)      # H
140    call GFMUL
141    vmovdqa $T, 16($Htbl)    # H^2
142    call GFMUL
143    vmovdqa $T, 32($Htbl)    # H^3
144    call GFMUL
145    vmovdqa $T, 48($Htbl)    # H^4
146    call GFMUL
147    vmovdqa $T, 64($Htbl)    # H^5
148    call GFMUL
149    vmovdqa $T, 80($Htbl)    # H^6
150    call GFMUL
151    vmovdqa $T, 96($Htbl)    # H^7
152    call GFMUL
153    vmovdqa $T, 112($Htbl)   # H^8
154    ret
155.cfi_endproc
156.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
157___
158}
159aesgcmsiv_htable_init();
160
161sub aesgcmsiv_htable6_init {
162  # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
163  # |out_htable|.
164  # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
165  #
166  my $Htbl = "%rdi";
167  my $H = "%rsi";
168  my $T = "%xmm0";
169  my $TMP0 = "%xmm1";
170
171  $code.=<<___;
172.globl aesgcmsiv_htable6_init
173.type aesgcmsiv_htable6_init,\@function,2
174.align 16
175aesgcmsiv_htable6_init:
176.cfi_startproc
177    vmovdqa ($H), $T
178    vmovdqa $T, $TMP0
179    vmovdqa $T, ($Htbl)      # H
180    call GFMUL
181    vmovdqa $T, 16($Htbl)    # H^2
182    call GFMUL
183    vmovdqa $T, 32($Htbl)    # H^3
184    call GFMUL
185    vmovdqa $T, 48($Htbl)    # H^4
186    call GFMUL
187    vmovdqa $T, 64($Htbl)    # H^5
188    call GFMUL
189    vmovdqa $T, 80($Htbl)    # H^6
190    ret
191.cfi_endproc
192.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
193___
194}
195aesgcmsiv_htable6_init();
196
197sub aesgcmsiv_htable_polyval {
198  # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
199  # parameter 1: %rdi     Htable  - pointer to Htable
200  # parameter 2: %rsi     INp     - pointer to input
201  # parameter 3: %rdx     LEN     - length of BUFFER in bytes
202  # parameter 4: %rcx     T       - pointer to POLYVAL output
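  #
  # The main loop hashes eight blocks per iteration: each block is carry-less
  # multiplied by the matching power of H from Htable and the partial products
  # are accumulated, so only one polynomial reduction is needed per eight
  # blocks. Any leading LEN%128 bytes are hashed first, one block at a time.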
203
204  my $DATA = "%xmm0";
205  my $hlp0 = "%r11";
206  my $Htbl = "%rdi";
207  my $inp = "%rsi";
208  my $len = "%rdx";
209  my $TMP0 = "%xmm3";
210  my $TMP1 = "%xmm4";
211  my $TMP2 = "%xmm5";
212  my $TMP3 = "%xmm6";
213  my $TMP4 = "%xmm7";
214  my $Tp = "%rcx";
215  my $T = "%xmm1";
216  my $Xhi = "%xmm9";
217
218  my $SCHOOLBOOK_AAD = sub {
219    my ($i)=@_;
220    return <<___;
221    vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
222    vpxor $TMP3, $TMP2, $TMP2
223    vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
224    vpxor $TMP3, $TMP0, $TMP0
225    vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
226    vpxor $TMP3, $TMP1, $TMP1
227    vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
228    vpxor $TMP3, $TMP2, $TMP2
229___
230  };
231
232  $code.=<<___;
233.globl aesgcmsiv_htable_polyval
234.type aesgcmsiv_htable_polyval,\@function,4
235.align 16
236aesgcmsiv_htable_polyval:
237.cfi_startproc
238    test  $len, $len
239    jnz   .Lhtable_polyval_start
240    ret
241
242.Lhtable_polyval_start:
243    vzeroall
244
245    # We hash 8 blocks each iteration. If the total number of blocks is not a
246    # multiple of 8, we first hash the leading n%8 blocks.
247    movq $len, $hlp0
248    andq \$127, $hlp0
249
250    jz .Lhtable_polyval_no_prefix
251
252    vpxor $Xhi, $Xhi, $Xhi
253    vmovdqa ($Tp), $T
254    sub $hlp0, $len
255
256    sub \$16, $hlp0
257
258    # hash first prefix block
259    vmovdqu ($inp), $DATA
260    vpxor $T, $DATA, $DATA
261
262    vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
263    vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
264    vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
265    vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
266    vpxor $TMP3, $TMP2, $TMP2
267
268    lea 16($inp), $inp
269    test $hlp0, $hlp0
270    jnz .Lhtable_polyval_prefix_loop
271    jmp .Lhtable_polyval_prefix_complete
272
    # hash remaining prefix blocks (up to 7 total prefix blocks)
274.align 64
275.Lhtable_polyval_prefix_loop:
276    sub \$16, $hlp0
277
278    vmovdqu ($inp), $DATA           # next data block
279
280    vpclmulqdq  \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
281    vpxor       $TMP3, $TMP0, $TMP0
282    vpclmulqdq  \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
283    vpxor       $TMP3, $TMP1, $TMP1
284    vpclmulqdq  \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
285    vpxor       $TMP3, $TMP2, $TMP2
286    vpclmulqdq  \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
287    vpxor       $TMP3, $TMP2, $TMP2
288
289    test $hlp0, $hlp0
290
291    lea 16($inp), $inp
292
293    jnz .Lhtable_polyval_prefix_loop
294
295.Lhtable_polyval_prefix_complete:
296    vpsrldq \$8, $TMP2, $TMP3
297    vpslldq \$8, $TMP2, $TMP2
298
299    vpxor $TMP3, $TMP1, $Xhi
300    vpxor $TMP2, $TMP0, $T
301
302    jmp .Lhtable_polyval_main_loop
303
304.Lhtable_polyval_no_prefix:
305    # At this point we know the number of blocks is a multiple of 8. However,
306    # the reduction in the main loop includes a multiplication by x^(-128). In
    # order to counter this, the existing tag needs to be multiplied by x^128.
308    # In practice, this just means that it is loaded into $Xhi, not $T.
309    vpxor $T, $T, $T
310    vmovdqa ($Tp), $Xhi
311
312.align 64
313.Lhtable_polyval_main_loop:
314    sub \$0x80, $len
315    jb .Lhtable_polyval_out
316
317    vmovdqu 16*7($inp), $DATA      # Ii
318
319    vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
320    vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
321    vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
322    vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
323    vpxor $TMP3, $TMP2, $TMP2
324
325    #########################################################
326    vmovdqu 16*6($inp), $DATA
327    ${\$SCHOOLBOOK_AAD->(1)}
328
329    #########################################################
330    vmovdqu 16*5($inp), $DATA
331
332    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 1a
333    vpalignr \$8, $T, $T, $T
334
335    ${\$SCHOOLBOOK_AAD->(2)}
336
337    vpxor $TMP4, $T, $T                              # reduction stage 1b
338    #########################################################
339    vmovdqu     16*4($inp), $DATA
340
341    ${\$SCHOOLBOOK_AAD->(3)}
342    #########################################################
343    vmovdqu     16*3($inp), $DATA
344
345    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 2a
346    vpalignr \$8, $T, $T, $T
347
348    ${\$SCHOOLBOOK_AAD->(4)}
349
350    vpxor $TMP4, $T, $T                              # reduction stage 2b
351    #########################################################
352    vmovdqu 16*2($inp), $DATA
353
354    ${\$SCHOOLBOOK_AAD->(5)}
355
356    vpxor $Xhi, $T, $T                               # reduction finalize
357    #########################################################
358    vmovdqu 16*1($inp), $DATA
359
360    ${\$SCHOOLBOOK_AAD->(6)}
361    #########################################################
362    vmovdqu 16*0($inp), $DATA
363    vpxor $T, $DATA, $DATA
364
365    ${\$SCHOOLBOOK_AAD->(7)}
366    #########################################################
367    vpsrldq \$8, $TMP2, $TMP3
368    vpslldq \$8, $TMP2, $TMP2
369
370    vpxor $TMP3, $TMP1, $Xhi
371    vpxor $TMP2, $TMP0, $T
372
373    lea 16*8($inp), $inp
374    jmp .Lhtable_polyval_main_loop
375
376    #########################################################
377
378.Lhtable_polyval_out:
379    vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
380    vpalignr    \$8, $T, $T, $T
381    vpxor       $TMP3, $T, $T
382
383    vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
384    vpalignr    \$8, $T, $T, $T
385    vpxor       $TMP3, $T, $T
386    vpxor       $Xhi, $T, $T
387
388    vmovdqu $T, ($Tp)
389    vzeroupper
390    ret
391.cfi_endproc
392.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
393___
394}
395aesgcmsiv_htable_polyval();
396
397sub aesgcmsiv_polyval_horner {
398  #void aesgcmsiv_polyval_horner(unsigned char T[16],  // output
399  #      const unsigned char* H, // H
400  #      unsigned char* BUF,  // Buffer
  #      unsigned int blocks);  // number of 16-byte blocks
402  #
  # parameter 1: %rdi T - pointer to POLYVAL output
404  # parameter 2: %rsi Hp - pointer to H (user key)
405  # parameter 3: %rdx INp - pointer to input
406  # parameter 4: %rcx L - total number of blocks in input BUFFER
407  #
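  # Horner evaluation of POLYVAL over the buffer; roughly, as a C sketch
  # (illustrative only, not part of the generated code):
  #
  #   for (size_t i = 0; i < blocks; i++)
  #     T = dot(T ^ BUF[i], H);   // dot() as in RFC 8452, computed by GFMUL
  #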
408  my $T = "%rdi";
409  my $Hp = "%rsi";
410  my $INp = "%rdx";
411  my $L = "%rcx";
412  my $LOC = "%r10";
413  my $LEN = "%eax";
414  my $H = "%xmm1";
415  my $RES = "%xmm0";
416
417  $code.=<<___;
418.globl aesgcmsiv_polyval_horner
419.type aesgcmsiv_polyval_horner,\@function,4
420.align 16
421aesgcmsiv_polyval_horner:
422.cfi_startproc
423    test $L, $L
424    jnz .Lpolyval_horner_start
425    ret
426
427.Lpolyval_horner_start:
    # Compute POLYVAL over the buffer with L GFMUL operations (Horner's rule):
    # RES = GFMUL(RES xor X_i, H)
430
431    xorq $LOC, $LOC
432    shlq \$4, $L    # L contains number of bytes to process
433
434    vmovdqa ($Hp), $H
435    vmovdqa ($T), $RES
436
437.Lpolyval_horner_loop:
438    vpxor ($INp,$LOC), $RES, $RES  # RES = RES + Xi
439    call GFMUL  # RES = RES * H
440
441    add \$16, $LOC
442    cmp $LOC, $L
443    jne .Lpolyval_horner_loop
444
445    # calculation of T is complete. RES=T
446    vmovdqa $RES, ($T)
447    ret
448.cfi_endproc
449.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
450___
451}
452aesgcmsiv_polyval_horner();
453
454# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
455# parameter 1: %rdi
456# parameter 2: %rsi
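#
# The schedule is computed without AESKEYGENASSIST: VPSHUFB rotates and
# broadcasts the previous round key's last word, VAESENCLAST applies the AES
# S-box and XORs in the round constant held in xmm0 (doubled each round with
# VPSLLD; ShiftRows is a no-op because all four columns are equal), and the
# VPSLLDQ/VPXOR chain propagates the words of the new round key.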
457$code.=<<___;
458.globl aes128gcmsiv_aes_ks
459.type aes128gcmsiv_aes_ks,\@function,2
460.align 16
461aes128gcmsiv_aes_ks:
462.cfi_startproc
463    vmovdqu (%rdi), %xmm1           # xmm1 = user key
464    vmovdqa %xmm1, (%rsi)           # rsi points to output
465
466    vmovdqa con1(%rip), %xmm0
467    vmovdqa mask(%rip), %xmm15
468
469    movq \$8, %rax
470
471.Lks128_loop:
472    addq \$16, %rsi                 # rsi points for next key
473    subq \$1, %rax
474    vpshufb %xmm15, %xmm1, %xmm2    # xmm2 = shuffled user key
475    vaesenclast %xmm0, %xmm2, %xmm2
476    vpslld \$1, %xmm0, %xmm0
477    vpslldq \$4, %xmm1, %xmm3
478    vpxor %xmm3, %xmm1, %xmm1
479    vpslldq \$4, %xmm3, %xmm3
480    vpxor %xmm3, %xmm1, %xmm1
481    vpslldq \$4, %xmm3, %xmm3
482    vpxor %xmm3, %xmm1, %xmm1
483    vpxor %xmm2, %xmm1, %xmm1
484    vmovdqa %xmm1, (%rsi)
485    jne .Lks128_loop
486
487    vmovdqa con2(%rip), %xmm0
488    vpshufb %xmm15, %xmm1, %xmm2
489    vaesenclast %xmm0, %xmm2, %xmm2
490    vpslld \$1, %xmm0, %xmm0
491    vpslldq \$4, %xmm1, %xmm3
492    vpxor %xmm3, %xmm1, %xmm1
493    vpslldq \$4, %xmm3, %xmm3
494    vpxor %xmm3, %xmm1, %xmm1
495    vpslldq \$4, %xmm3, %xmm3
496    vpxor %xmm3, %xmm1, %xmm1
497    vpxor %xmm2, %xmm1, %xmm1
498    vmovdqa %xmm1, 16(%rsi)
499
500    vpshufb %xmm15, %xmm1, %xmm2
501    vaesenclast %xmm0, %xmm2, %xmm2
502    vpslldq \$4, %xmm1, %xmm3
503    vpxor %xmm3, %xmm1, %xmm1
504    vpslldq \$4, %xmm3, %xmm3
505    vpxor %xmm3, %xmm1, %xmm1
506    vpslldq \$4, %xmm3, %xmm3
507    vpxor %xmm3, %xmm1, %xmm1
508    vpxor %xmm2, %xmm1, %xmm1
509    vmovdqa %xmm1, 32(%rsi)
510    ret
511.cfi_endproc
512.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
513___
514
515# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
516# parameter 1: %rdi
517# parameter 2: %rsi
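#
# AES-256 schedule: each .Lks256_loop iteration emits two round keys — the
# first derived with SubWord+RotWord+Rcon (VAESENCLAST against the con1
# constant), the second with SubWord only (VAESENCLAST against the zeroed
# xmm14 register).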
518$code.=<<___;
519.globl aes256gcmsiv_aes_ks
520.type aes256gcmsiv_aes_ks,\@function,2
521.align 16
522aes256gcmsiv_aes_ks:
523.cfi_startproc
524    vmovdqu (%rdi), %xmm1
525    vmovdqu 16(%rdi), %xmm3
526    vmovdqa %xmm1, (%rsi)
527    vmovdqa %xmm3, 16(%rsi)
528    vmovdqa con1(%rip), %xmm0
529    vmovdqa mask(%rip), %xmm15
530    vpxor %xmm14, %xmm14, %xmm14
531    mov \$6, %rax
532
533.Lks256_loop:
534    add \$32, %rsi
535    subq \$1, %rax
536    vpshufb %xmm15, %xmm3, %xmm2
537    vaesenclast %xmm0, %xmm2, %xmm2
538    vpslld \$1, %xmm0, %xmm0
539    vpsllq \$32, %xmm1, %xmm4
540    vpxor %xmm4, %xmm1, %xmm1
541    vpshufb con3(%rip), %xmm1,  %xmm4
542    vpxor %xmm4, %xmm1, %xmm1
543    vpxor %xmm2, %xmm1, %xmm1
544    vmovdqa %xmm1, (%rsi)
545    vpshufd \$0xff, %xmm1, %xmm2
546    vaesenclast %xmm14, %xmm2, %xmm2
547    vpsllq \$32, %xmm3, %xmm4
548    vpxor %xmm4, %xmm3, %xmm3
549    vpshufb con3(%rip), %xmm3,  %xmm4
550    vpxor %xmm4, %xmm3, %xmm3
551    vpxor %xmm2, %xmm3, %xmm3
552    vmovdqa %xmm3, 16(%rsi)
553    jne .Lks256_loop
554
555    vpshufb %xmm15, %xmm3, %xmm2
556    vaesenclast %xmm0, %xmm2, %xmm2
557    vpsllq \$32, %xmm1, %xmm4
558    vpxor %xmm4, %xmm1, %xmm1
559    vpshufb con3(%rip), %xmm1,  %xmm4
560    vpxor %xmm4, %xmm1, %xmm1
561    vpxor %xmm2, %xmm1, %xmm1
562    vmovdqa %xmm1, 32(%rsi)
563    ret
.cfi_endproc
.size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
___
566
567sub aes128gcmsiv_aes_ks_enc_x1 {
568  my $KS1_REGA = "%xmm1";
569  my $KS1_REGB = "%xmm2";
570  my $BLOCK1 = "%xmm4";
571  my $AUXREG = "%xmm3";
572
573  my $KS_BLOCK = sub {
574    my ($reg, $reg2, $auxReg) = @_;
575    return <<___;
576    vpsllq \$32, $reg, $auxReg         #!!saving mov instruction to xmm3
577    vpxor $auxReg, $reg, $reg
578    vpshufb con3(%rip), $reg,  $auxReg
579    vpxor $auxReg, $reg, $reg
580    vpxor $reg2, $reg, $reg
581___
582  };
583
584  my $round = sub {
585    my ($i, $j) = @_;
586    return <<___;
587    vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
588    vaesenclast %xmm0, %xmm2, %xmm2
589    vpslld \$1, %xmm0, %xmm0
590    ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
591    vaesenc %xmm1, $BLOCK1, $BLOCK1
592    vmovdqa %xmm1, ${\eval(16*$i)}($j)
593___
594  };
595
596  my $roundlast = sub {
597    my ($i, $j) = @_;
598    return <<___;
599    vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
600    vaesenclast %xmm0, %xmm2, %xmm2
601    ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
602    vaesenclast %xmm1, $BLOCK1, $BLOCK1
603    vmovdqa %xmm1, ${\eval(16*$i)}($j)
604___
605  };
606
# parameter 1: %rdi                         Pointer to PT
# parameter 2: %rsi                         Pointer to CT
# parameter 3: %rdx                         Pointer to keys
# parameter 4: %rcx                         Pointer to initial key
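#
# Expands the 16-byte key into the full schedule at %rdx while encrypting the
# single block at %rdi with it on the fly; the resulting ciphertext block is
# written to %rsi and the schedule can be reused by the other routines.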
611  $code.=<<___;
612.globl aes128gcmsiv_aes_ks_enc_x1
613.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
614.align 16
615aes128gcmsiv_aes_ks_enc_x1:
616.cfi_startproc
617    vmovdqa (%rcx), %xmm1                 # xmm1 = first 16 bytes of random key
618    vmovdqa 0*16(%rdi), $BLOCK1
619
620    vmovdqa %xmm1, (%rdx)                 # KEY[0] = first 16 bytes of random key
621    vpxor %xmm1, $BLOCK1, $BLOCK1
622
623    vmovdqa con1(%rip), %xmm0             # xmm0  = 1,1,1,1
624    vmovdqa mask(%rip), %xmm15            # xmm15 = mask
625
626    ${\$round->(1, "%rdx")}
627    ${\$round->(2, "%rdx")}
628    ${\$round->(3, "%rdx")}
629    ${\$round->(4, "%rdx")}
630    ${\$round->(5, "%rdx")}
631    ${\$round->(6, "%rdx")}
632    ${\$round->(7, "%rdx")}
633    ${\$round->(8, "%rdx")}
634
635    vmovdqa con2(%rip), %xmm0
636
637    ${\$round->(9, "%rdx")}
638    ${\$roundlast->(10, "%rdx")}
639
640    vmovdqa $BLOCK1, 0*16(%rsi)
641    ret
642.cfi_endproc
643.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
644___
645}
646aes128gcmsiv_aes_ks_enc_x1();
647
648sub aes128gcmsiv_kdf {
649  my $BLOCK1 = "%xmm9";
650  my $BLOCK2 = "%xmm10";
651  my $BLOCK3 = "%xmm11";
652  my $BLOCK4 = "%xmm12";
653  my $BLOCK5 = "%xmm13";
654  my $BLOCK6 = "%xmm14";
655  my $ONE = "%xmm13";
656  my $KSp = "%rdx";
657  my $STATE_1 = "%xmm1";
658
659  my $enc_roundx4 = sub {
660    my ($i, $j) = @_;
661    return <<___;
662    vmovdqa ${\eval($i*16)}(%rdx), $j
663    vaesenc $j, $BLOCK1, $BLOCK1
664    vaesenc $j, $BLOCK2, $BLOCK2
665    vaesenc $j, $BLOCK3, $BLOCK3
666    vaesenc $j, $BLOCK4, $BLOCK4
667___
668  };
669
670  my $enc_roundlastx4 = sub {
671    my ($i, $j) = @_;
672    return <<___;
673    vmovdqa ${\eval($i*16)}(%rdx), $j
674    vaesenclast $j, $BLOCK1, $BLOCK1
675    vaesenclast $j, $BLOCK2, $BLOCK2
676    vaesenclast $j, $BLOCK3, $BLOCK3
677    vaesenclast $j, $BLOCK4, $BLOCK4
678___
679  };
680
681# void aes128gcmsiv_kdf(const uint8_t nonce[16],
682#                       uint8_t *out_key_material,
683#                       const uint8_t *key_schedule);
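#
# The four blocks encrypted here follow the AES-GCM-SIV KDF layout of
# RFC 8452; roughly (illustrative sketch only, not part of the generated code):
#
#   block[i] = AES_encrypt(le32(i) || nonce, key_schedule);   // i = 0..3
#
# The caller assembles the message-authentication and message-encryption keys
# from halves of these outputs.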
684  $code.=<<___;
685.globl aes128gcmsiv_kdf
686.type aes128gcmsiv_kdf,\@function,3
687.align 16
688aes128gcmsiv_kdf:
689.cfi_startproc
690# parameter 1: %rdi                         Pointer to NONCE
691# parameter 2: %rsi                         Pointer to CT
# parameter 3: %rdx                         Pointer to keys
693
694    vmovdqa (%rdx), %xmm1                  # xmm1 = first 16 bytes of random key
695    vmovdqa 0*16(%rdi), $BLOCK1
696    vmovdqa and_mask(%rip), $BLOCK4
697    vmovdqa one(%rip), $ONE
698    vpshufd \$0x90, $BLOCK1, $BLOCK1
699    vpand $BLOCK4, $BLOCK1, $BLOCK1
700    vpaddd $ONE, $BLOCK1, $BLOCK2
701    vpaddd $ONE, $BLOCK2, $BLOCK3
702    vpaddd $ONE, $BLOCK3, $BLOCK4
703
704    vpxor %xmm1, $BLOCK1, $BLOCK1
705    vpxor %xmm1, $BLOCK2, $BLOCK2
706    vpxor %xmm1, $BLOCK3, $BLOCK3
707    vpxor %xmm1, $BLOCK4, $BLOCK4
708
709    ${\$enc_roundx4->(1, "%xmm1")}
710    ${\$enc_roundx4->(2, "%xmm2")}
711    ${\$enc_roundx4->(3, "%xmm1")}
712    ${\$enc_roundx4->(4, "%xmm2")}
713    ${\$enc_roundx4->(5, "%xmm1")}
714    ${\$enc_roundx4->(6, "%xmm2")}
715    ${\$enc_roundx4->(7, "%xmm1")}
716    ${\$enc_roundx4->(8, "%xmm2")}
717    ${\$enc_roundx4->(9, "%xmm1")}
718    ${\$enc_roundlastx4->(10, "%xmm2")}
719
720    vmovdqa $BLOCK1, 0*16(%rsi)
721    vmovdqa $BLOCK2, 1*16(%rsi)
722    vmovdqa $BLOCK3, 2*16(%rsi)
723    vmovdqa $BLOCK4, 3*16(%rsi)
724    ret
725.cfi_endproc
726.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
727___
728}
729aes128gcmsiv_kdf();
730
731sub aes128gcmsiv_enc_msg_x4 {
732  my $CTR1 = "%xmm0";
733  my $CTR2 = "%xmm1";
734  my $CTR3 = "%xmm2";
735  my $CTR4 = "%xmm3";
736  my $ADDER = "%xmm4";
737
738  my $STATE1 = "%xmm5";
739  my $STATE2 = "%xmm6";
740  my $STATE3 = "%xmm7";
741  my $STATE4 = "%xmm8";
742
743  my $TMP = "%xmm12";
744  my $TMP2 = "%xmm13";
745  my $TMP3 = "%xmm14";
746  my $IV = "%xmm15";
747
748  my $PT = "%rdi";
749  my $CT = "%rsi";
750  my $TAG = "%rdx";
751  my $KS = "%rcx";
752  my $LEN = "%r8";
753
754  my $aes_round = sub {
755    my ($i) = @_;
756    return <<___;
757    vmovdqu ${\eval($i*16)}($KS), $TMP
758    vaesenc $TMP, $STATE1, $STATE1
759    vaesenc $TMP, $STATE2, $STATE2
760    vaesenc $TMP, $STATE3, $STATE3
761    vaesenc $TMP, $STATE4, $STATE4
762___
763  };
764
765  my $aes_lastround = sub {
766    my ($i) = @_;
767    return <<___;
768    vmovdqu ${\eval($i*16)}($KS), $TMP
769    vaesenclast $TMP, $STATE1, $STATE1
770    vaesenclast $TMP, $STATE2, $STATE2
771    vaesenclast $TMP, $STATE3, $STATE3
772    vaesenclast $TMP, $STATE4, $STATE4
773___
774  };
775
776# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
777#                              unsigned char* TAG, unsigned char* KS,
778#                              size_t byte_len);
779# parameter 1: %rdi     #PT
780# parameter 2: %rsi     #CT
781# parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
782# parameter 4: %rcx     #KS
783# parameter 5: %r8      #LEN MSG_length in bytes
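#
# The initial counter block is the tag with its most significant bit forced to
# 1 (OR_MASK); the low 32 bits act as the counter and are incremented with
# VPADDD. Four blocks are processed per iteration, then any remaining whole
# blocks one at a time; a trailing partial block, if any, is presumably
# handled by the caller.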
784  $code.=<<___;
785.globl aes128gcmsiv_enc_msg_x4
786.type aes128gcmsiv_enc_msg_x4,\@function,5
787.align 16
788aes128gcmsiv_enc_msg_x4:
789.cfi_startproc
790    test $LEN, $LEN
791    jnz .L128_enc_msg_x4_start
792    ret
793
794.L128_enc_msg_x4_start:
795    pushq %r12
796.cfi_push %r12
797    pushq %r13
798.cfi_push %r13
799
800    shrq \$4, $LEN      # LEN = num of blocks
801    movq $LEN, %r10
802    shlq \$62, %r10
803    shrq \$62, %r10
804
805    # make IV from TAG
806    vmovdqa ($TAG), $IV
807    vpor OR_MASK(%rip), $IV, $IV  #IV = [1]TAG[126...32][00..00]
808
809    vmovdqu four(%rip), $ADDER     # Register to increment counters
810    vmovdqa $IV, $CTR1             # CTR1 = TAG[1][127...32][00..00]
811    vpaddd one(%rip), $IV, $CTR2   # CTR2 = TAG[1][127...32][00..01]
812    vpaddd two(%rip), $IV, $CTR3   # CTR3 = TAG[1][127...32][00..02]
813    vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
814
815    shrq \$2, $LEN
816    je .L128_enc_msg_x4_check_remainder
817
818    subq \$64, $CT
819    subq \$64, $PT
820
821.L128_enc_msg_x4_loop1:
822    addq \$64, $CT
823    addq \$64, $PT
824
825    vmovdqa $CTR1, $STATE1
826    vmovdqa $CTR2, $STATE2
827    vmovdqa $CTR3, $STATE3
828    vmovdqa $CTR4, $STATE4
829
830    vpxor ($KS), $STATE1, $STATE1
831    vpxor ($KS), $STATE2, $STATE2
832    vpxor ($KS), $STATE3, $STATE3
833    vpxor ($KS), $STATE4, $STATE4
834
835    ${\$aes_round->(1)}
836    vpaddd $ADDER, $CTR1, $CTR1
837    ${\$aes_round->(2)}
838    vpaddd $ADDER, $CTR2, $CTR2
839    ${\$aes_round->(3)}
840    vpaddd $ADDER, $CTR3, $CTR3
841    ${\$aes_round->(4)}
842    vpaddd $ADDER, $CTR4, $CTR4
843
844    ${\$aes_round->(5)}
845    ${\$aes_round->(6)}
846    ${\$aes_round->(7)}
847    ${\$aes_round->(8)}
848    ${\$aes_round->(9)}
849    ${\$aes_lastround->(10)}
850
851    # XOR with Plaintext
852    vpxor 0*16($PT), $STATE1, $STATE1
853    vpxor 1*16($PT), $STATE2, $STATE2
854    vpxor 2*16($PT), $STATE3, $STATE3
855    vpxor 3*16($PT), $STATE4, $STATE4
856
857    subq \$1, $LEN
858
859    vmovdqu $STATE1, 0*16($CT)
860    vmovdqu $STATE2, 1*16($CT)
861    vmovdqu $STATE3, 2*16($CT)
862    vmovdqu $STATE4, 3*16($CT)
863
864    jne .L128_enc_msg_x4_loop1
865
866    addq \$64,$CT
867    addq \$64,$PT
868
869.L128_enc_msg_x4_check_remainder:
870    cmpq \$0, %r10
871    je .L128_enc_msg_x4_out
872
873.L128_enc_msg_x4_loop2:
874    # enc each block separately
875    # CTR1 is the highest counter (even if no LOOP done)
876    vmovdqa $CTR1, $STATE1
877    vpaddd one(%rip), $CTR1, $CTR1  # inc counter
878
879    vpxor ($KS), $STATE1, $STATE1
880    vaesenc 16($KS), $STATE1, $STATE1
881    vaesenc 32($KS), $STATE1, $STATE1
882    vaesenc 48($KS), $STATE1, $STATE1
883    vaesenc 64($KS), $STATE1, $STATE1
884    vaesenc 80($KS), $STATE1, $STATE1
885    vaesenc 96($KS), $STATE1, $STATE1
886    vaesenc 112($KS), $STATE1, $STATE1
887    vaesenc 128($KS), $STATE1, $STATE1
888    vaesenc 144($KS), $STATE1, $STATE1
889    vaesenclast 160($KS), $STATE1, $STATE1
890
891    # XOR with plaintext
892    vpxor ($PT), $STATE1, $STATE1
893    vmovdqu $STATE1, ($CT)
894
895    addq \$16, $PT
896    addq \$16, $CT
897
898    subq \$1, %r10
899    jne .L128_enc_msg_x4_loop2
900
901.L128_enc_msg_x4_out:
902    popq %r13
903.cfi_pop %r13
904    popq %r12
905.cfi_pop %r12
906    ret
907.cfi_endproc
908.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
909___
910}
911aes128gcmsiv_enc_msg_x4();
912
913sub aes128gcmsiv_enc_msg_x8 {
914  my $STATE1 = "%xmm1";
915  my $STATE2 = "%xmm2";
916  my $STATE3 = "%xmm3";
917  my $STATE4 = "%xmm4";
918  my $STATE5 = "%xmm5";
919  my $STATE6 = "%xmm6";
920  my $STATE7 = "%xmm7";
921  my $STATE8 = "%xmm8";
922
923  my $CTR1 = "%xmm0";
924  my $CTR2 = "%xmm9";
925  my $CTR3 = "%xmm10";
926  my $CTR4 = "%xmm11";
927  my $CTR5 = "%xmm12";
928  my $CTR6 = "%xmm13";
929  my $CTR7 = "%xmm14";
930  my $SCHED = "%xmm15";
931
932  my $TMP1 = "%xmm1";
933  my $TMP2 = "%xmm2";
934
935  my $PT = "%rdi";
936  my $CT = "%rsi";
937  my $TAG = "%rdx";
938  my $KS = "%rcx";
939  my $LEN = "%r8";
940
941  my $aes_round8 = sub {
942    my ($i) = @_;
943    return <<___;
944    vmovdqu ${\eval($i*16)}($KS), $SCHED
945    vaesenc $SCHED, $STATE1, $STATE1
946    vaesenc $SCHED, $STATE2, $STATE2
947    vaesenc $SCHED, $STATE3, $STATE3
948    vaesenc $SCHED, $STATE4, $STATE4
949    vaesenc $SCHED, $STATE5, $STATE5
950    vaesenc $SCHED, $STATE6, $STATE6
951    vaesenc $SCHED, $STATE7, $STATE7
952    vaesenc $SCHED, $STATE8, $STATE8
953___
954  };
955
956  my $aes_lastround8 = sub {
957    my ($i) = @_;
958    return <<___;
959    vmovdqu ${\eval($i*16)}($KS), $SCHED
960    vaesenclast $SCHED, $STATE1, $STATE1
961    vaesenclast $SCHED, $STATE2, $STATE2
962    vaesenclast $SCHED, $STATE3, $STATE3
963    vaesenclast $SCHED, $STATE4, $STATE4
964    vaesenclast $SCHED, $STATE5, $STATE5
965    vaesenclast $SCHED, $STATE6, $STATE6
966    vaesenclast $SCHED, $STATE7, $STATE7
967    vaesenclast $SCHED, $STATE8, $STATE8
968___
969  };
970
971# void ENC_MSG_x8(unsigned char* PT,
972#                 unsigned char* CT,
973#                 unsigned char* TAG,
974#                 unsigned char* KS,
975#                 size_t byte_len);
976# parameter 1: %rdi     #PT
977# parameter 2: %rsi     #CT
978# parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
979# parameter 4: %rcx     #KS
980# parameter 5: %r8      #LEN MSG_length in bytes
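#
# Same construction as the x4 variant but with eight parallel streams; the
# eighth counter is kept in a 64-byte-aligned stack slot because the xmm
# registers are exhausted by the eight states, seven counters and the round
# key.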
981  $code.=<<___;
982.globl aes128gcmsiv_enc_msg_x8
983.type aes128gcmsiv_enc_msg_x8,\@function,5
984.align 16
985aes128gcmsiv_enc_msg_x8:
986.cfi_startproc
987    test $LEN, $LEN
988    jnz .L128_enc_msg_x8_start
989    ret
990
991.L128_enc_msg_x8_start:
992    pushq %r12
993.cfi_push %r12
994    pushq %r13
995.cfi_push %r13
996    pushq %rbp
997.cfi_push %rbp
998    movq %rsp, %rbp
999.cfi_def_cfa_register rbp
1000
    # Reserve a 64-byte-aligned scratch area on the stack
1002    subq \$128, %rsp
1003    andq \$-64, %rsp
1004
1005    shrq \$4, $LEN  # LEN = num of blocks
1006    movq $LEN, %r10
1007    shlq \$61, %r10
1008    shrq \$61, %r10
1009
1010    # make IV from TAG
1011    vmovdqu ($TAG), $TMP1
1012    vpor OR_MASK(%rip), $TMP1, $TMP1  # TMP1= IV = [1]TAG[126...32][00..00]
1013
    # store counter8 on the stack
1015    vpaddd seven(%rip), $TMP1, $CTR1
1016    vmovdqu $CTR1, (%rsp)             # CTR8 = TAG[127...32][00..07]
1017    vpaddd one(%rip), $TMP1, $CTR2    # CTR2 = TAG[127...32][00..01]
1018    vpaddd two(%rip), $TMP1, $CTR3    # CTR3 = TAG[127...32][00..02]
1019    vpaddd three(%rip), $TMP1, $CTR4  # CTR4 = TAG[127...32][00..03]
1020    vpaddd four(%rip), $TMP1, $CTR5   # CTR5 = TAG[127...32][00..04]
1021    vpaddd five(%rip), $TMP1, $CTR6   # CTR6 = TAG[127...32][00..05]
1022    vpaddd six(%rip), $TMP1, $CTR7    # CTR7 = TAG[127...32][00..06]
1023    vmovdqa $TMP1, $CTR1              # CTR1 = TAG[127...32][00..00]
1024
1025    shrq \$3, $LEN
1026    je .L128_enc_msg_x8_check_remainder
1027
1028    subq \$128, $CT
1029    subq \$128, $PT
1030
1031.L128_enc_msg_x8_loop1:
1032    addq \$128, $CT
1033    addq \$128, $PT
1034
1035    vmovdqa $CTR1, $STATE1
1036    vmovdqa $CTR2, $STATE2
1037    vmovdqa $CTR3, $STATE3
1038    vmovdqa $CTR4, $STATE4
1039    vmovdqa $CTR5, $STATE5
1040    vmovdqa $CTR6, $STATE6
1041    vmovdqa $CTR7, $STATE7
1042    # move from stack
1043    vmovdqu (%rsp), $STATE8
1044
1045    vpxor ($KS), $STATE1, $STATE1
1046    vpxor ($KS), $STATE2, $STATE2
1047    vpxor ($KS), $STATE3, $STATE3
1048    vpxor ($KS), $STATE4, $STATE4
1049    vpxor ($KS), $STATE5, $STATE5
1050    vpxor ($KS), $STATE6, $STATE6
1051    vpxor ($KS), $STATE7, $STATE7
1052    vpxor ($KS), $STATE8, $STATE8
1053
1054    ${\$aes_round8->(1)}
1055    vmovdqu (%rsp), $CTR7  # deal with CTR8
1056    vpaddd eight(%rip), $CTR7, $CTR7
1057    vmovdqu $CTR7, (%rsp)
1058    ${\$aes_round8->(2)}
1059    vpsubd one(%rip), $CTR7, $CTR7
1060    ${\$aes_round8->(3)}
1061    vpaddd eight(%rip), $CTR1, $CTR1
1062    ${\$aes_round8->(4)}
1063    vpaddd eight(%rip), $CTR2, $CTR2
1064    ${\$aes_round8->(5)}
1065    vpaddd eight(%rip), $CTR3, $CTR3
1066    ${\$aes_round8->(6)}
1067    vpaddd eight(%rip), $CTR4, $CTR4
1068    ${\$aes_round8->(7)}
1069    vpaddd eight(%rip), $CTR5, $CTR5
1070    ${\$aes_round8->(8)}
1071    vpaddd eight(%rip), $CTR6, $CTR6
1072    ${\$aes_round8->(9)}
1073    ${\$aes_lastround8->(10)}
1074
1075    # XOR with Plaintext
1076    vpxor 0*16($PT), $STATE1, $STATE1
1077    vpxor 1*16($PT), $STATE2, $STATE2
1078    vpxor 2*16($PT), $STATE3, $STATE3
1079    vpxor 3*16($PT), $STATE4, $STATE4
1080    vpxor 4*16($PT), $STATE5, $STATE5
1081    vpxor 5*16($PT), $STATE6, $STATE6
1082    vpxor 6*16($PT), $STATE7, $STATE7
1083    vpxor 7*16($PT), $STATE8, $STATE8
1084
1085    dec $LEN
1086
1087    vmovdqu $STATE1, 0*16($CT)
1088    vmovdqu $STATE2, 1*16($CT)
1089    vmovdqu $STATE3, 2*16($CT)
1090    vmovdqu $STATE4, 3*16($CT)
1091    vmovdqu $STATE5, 4*16($CT)
1092    vmovdqu $STATE6, 5*16($CT)
1093    vmovdqu $STATE7, 6*16($CT)
1094    vmovdqu $STATE8, 7*16($CT)
1095
1096    jne .L128_enc_msg_x8_loop1
1097
1098    addq \$128, $CT
1099    addq \$128, $PT
1100
1101.L128_enc_msg_x8_check_remainder:
1102    cmpq \$0, %r10
1103    je .L128_enc_msg_x8_out
1104
1105.L128_enc_msg_x8_loop2:
1106    # enc each block separately
1107    # CTR1 is the highest counter (even if no LOOP done)
1108    vmovdqa $CTR1, $STATE1
1109    vpaddd one(%rip), $CTR1, $CTR1  # inc counter
1110
1111    vpxor ($KS), $STATE1, $STATE1
1112    vaesenc 16($KS), $STATE1, $STATE1
1113    vaesenc 32($KS), $STATE1, $STATE1
1114    vaesenc 48($KS), $STATE1, $STATE1
1115    vaesenc 64($KS), $STATE1, $STATE1
1116    vaesenc 80($KS), $STATE1, $STATE1
1117    vaesenc 96($KS), $STATE1, $STATE1
1118    vaesenc 112($KS), $STATE1, $STATE1
1119    vaesenc 128($KS), $STATE1, $STATE1
1120    vaesenc 144($KS), $STATE1, $STATE1
1121    vaesenclast 160($KS), $STATE1, $STATE1
1122
1123    # XOR with Plaintext
1124    vpxor ($PT), $STATE1, $STATE1
1125
1126    vmovdqu $STATE1, ($CT)
1127
1128    addq \$16, $PT
1129    addq \$16, $CT
1130
1131    decq %r10
1132    jne .L128_enc_msg_x8_loop2
1133
1134.L128_enc_msg_x8_out:
1135    movq %rbp, %rsp
1136.cfi_def_cfa_register %rsp
1137    popq %rbp
1138.cfi_pop %rbp
1139    popq %r13
1140.cfi_pop %r13
1141    popq %r12
1142.cfi_pop %r12
1143    ret
1144.cfi_endproc
1145.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
1146___
1147}
1148aes128gcmsiv_enc_msg_x8();
1149
1150sub aesgcmsiv_dec {
1151  my ($aes256) = @_;
1152
1153  my $T = "%xmm0";
1154  my $TMP0 = "%xmm1";
1155  my $TMP1 = "%xmm2";
1156  my $TMP2 = "%xmm3";
1157  my $TMP3 = "%xmm4";
1158  my $TMP4 = "%xmm5";
1159  my $TMP5 = "%xmm6";
1160  my $CTR1 = "%xmm7";
1161  my $CTR2 = "%xmm8";
1162  my $CTR3 = "%xmm9";
1163  my $CTR4 = "%xmm10";
1164  my $CTR5 = "%xmm11";
1165  my $CTR6 = "%xmm12";
1166  my $CTR = "%xmm15";
1167  my $CT = "%rdi";
1168  my $PT = "%rsi";
1169  my $POL = "%rdx";
1170  my $Htbl = "%rcx";
1171  my $KS = "%r8";
1172  my $LEN = "%r9";
1173  my $secureBuffer = "%rax";
1174  my $HTABLE_ROUNDS = "%xmm13";
1175
1176  my $labelPrefix = "128";
1177  if ($aes256) {
1178    $labelPrefix = "256";
1179  }
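
  # Decryption and tag computation are interleaved: while the next six counter
  # blocks are run through AES, the six plaintext blocks produced by the
  # previous iteration (spilled to a scratch area immediately after the
  # POLYVAL value, which the caller is expected to provide) are folded into
  # the POLYVAL accumulator with the precomputed powers of H, using one
  # reduction per six blocks.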
1180
1181  my $aes_round_dec = sub {
1182    my ($i) = @_;
1183    return <<___;
1184    vmovdqu ${\eval($i*16)}($KS), $TMP3
1185    vaesenc $TMP3, $CTR1, $CTR1
1186    vaesenc $TMP3, $CTR2, $CTR2
1187    vaesenc $TMP3, $CTR3, $CTR3
1188    vaesenc $TMP3, $CTR4, $CTR4
1189    vaesenc $TMP3, $CTR5, $CTR5
1190    vaesenc $TMP3, $CTR6, $CTR6
1191___
1192  };
1193
1194  my $aes_lastround_dec = sub {
1195    my ($i) = @_;
1196    return <<___;
1197    vmovdqu ${\eval($i*16)}($KS), $TMP3
1198    vaesenclast $TMP3, $CTR1, $CTR1
1199    vaesenclast $TMP3, $CTR2, $CTR2
1200    vaesenclast $TMP3, $CTR3, $CTR3
1201    vaesenclast $TMP3, $CTR4, $CTR4
1202    vaesenclast $TMP3, $CTR5, $CTR5
1203    vaesenclast $TMP3, $CTR6, $CTR6
1204___
1205  };
1206
1207  my $schoolbook = sub {
1208    my ($i) = @_;
1209    return <<___;
1210    vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
1211    vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
1212
1213    vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
1214    vpxor $TMP3, $TMP0, $TMP0
1215    vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
1216    vpxor $TMP3, $TMP1, $TMP1
1217    vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
1218    vpxor $TMP3, $TMP2, $TMP2
1219    vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
1220    vpxor $TMP3, $TMP0, $TMP0
1221___
1222  };
1223
1224  if ($aes256) {
1225    $code.=<<___;
1226.globl aes256gcmsiv_dec
1227.type aes256gcmsiv_dec,\@function,6
1228.align 16
1229aes256gcmsiv_dec:
1230___
1231  } else {
1232    $code.=<<___;
1233.globl aes128gcmsiv_dec
1234.type aes128gcmsiv_dec,\@function,6
1235.align 16
1236aes128gcmsiv_dec:
1237___
1238  }
1239
1240  $code.=<<___;
1241.cfi_startproc
1242    test \$~15, $LEN
1243    jnz .L${labelPrefix}_dec_start
1244    ret
1245
1246.L${labelPrefix}_dec_start:
1247    vzeroupper
1248    vmovdqa ($POL), $T
1249    movq $POL, $secureBuffer
1250
1251    leaq 32($secureBuffer), $secureBuffer
1252    leaq 32($Htbl), $Htbl
1253
1254    # make CTRBLKs from given tag.
1255    vmovdqu ($CT,$LEN), $CTR
1256    vpor OR_MASK(%rip), $CTR, $CTR      # CTR = [1]TAG[126...32][00..00]
1257    andq \$~15, $LEN
1258
    # If fewer than 6 blocks remain, process them one at a time
1260    cmp \$96, $LEN
1261    jb .L${labelPrefix}_dec_loop2
1262
1263    # Decrypt the first six blocks
1264    sub \$96, $LEN
1265    vmovdqa $CTR, $CTR1
1266    vpaddd one(%rip), $CTR1, $CTR2
1267    vpaddd two(%rip), $CTR1, $CTR3
1268    vpaddd one(%rip), $CTR3, $CTR4
1269    vpaddd two(%rip), $CTR3, $CTR5
1270    vpaddd one(%rip), $CTR5, $CTR6
1271    vpaddd two(%rip), $CTR5, $CTR
1272
1273    vpxor ($KS), $CTR1, $CTR1
1274    vpxor ($KS), $CTR2, $CTR2
1275    vpxor ($KS), $CTR3, $CTR3
1276    vpxor ($KS), $CTR4, $CTR4
1277    vpxor ($KS), $CTR5, $CTR5
1278    vpxor ($KS), $CTR6, $CTR6
1279
1280    ${\$aes_round_dec->(1)}
1281    ${\$aes_round_dec->(2)}
1282    ${\$aes_round_dec->(3)}
1283    ${\$aes_round_dec->(4)}
1284    ${\$aes_round_dec->(5)}
1285    ${\$aes_round_dec->(6)}
1286    ${\$aes_round_dec->(7)}
1287    ${\$aes_round_dec->(8)}
1288    ${\$aes_round_dec->(9)}
1289___
1290
1291if ($aes256) {
1292$code.=<<___;
1293    ${\$aes_round_dec->(10)}
1294    ${\$aes_round_dec->(11)}
1295    ${\$aes_round_dec->(12)}
1296    ${\$aes_round_dec->(13)}
1297    ${\$aes_lastround_dec->(14)}
1298___
1299} else {
1300$code.=<<___;
1301    ${\$aes_lastround_dec->(10)}
1302___
1303}
1304
1305$code.=<<___;
1306    # XOR with CT
1307    vpxor 0*16($CT), $CTR1, $CTR1
1308    vpxor 1*16($CT), $CTR2, $CTR2
1309    vpxor 2*16($CT), $CTR3, $CTR3
1310    vpxor 3*16($CT), $CTR4, $CTR4
1311    vpxor 4*16($CT), $CTR5, $CTR5
1312    vpxor 5*16($CT), $CTR6, $CTR6
1313
1314    vmovdqu $CTR1, 0*16($PT)
1315    vmovdqu $CTR2, 1*16($PT)
1316    vmovdqu $CTR3, 2*16($PT)
1317    vmovdqu $CTR4, 3*16($PT)
1318    vmovdqu $CTR5, 4*16($PT)
1319    vmovdqu $CTR6, 5*16($PT)
1320
1321    addq \$96, $CT
1322    addq \$96, $PT
1323    jmp .L${labelPrefix}_dec_loop1
1324
1325# Decrypt 6 blocks each time while hashing previous 6 blocks
1326.align 64
1327.L${labelPrefix}_dec_loop1:
1328    cmp \$96, $LEN
1329    jb .L${labelPrefix}_dec_finish_96
1330    sub \$96, $LEN
1331
1332    vmovdqa $CTR6, $TMP5
1333    vmovdqa $CTR5, 1*16-32($secureBuffer)
1334    vmovdqa $CTR4, 2*16-32($secureBuffer)
1335    vmovdqa $CTR3, 3*16-32($secureBuffer)
1336    vmovdqa $CTR2, 4*16-32($secureBuffer)
1337    vmovdqa $CTR1, 5*16-32($secureBuffer)
1338
1339    vmovdqa $CTR, $CTR1
1340    vpaddd one(%rip), $CTR1, $CTR2
1341    vpaddd two(%rip), $CTR1, $CTR3
1342    vpaddd one(%rip), $CTR3, $CTR4
1343    vpaddd two(%rip), $CTR3, $CTR5
1344    vpaddd one(%rip), $CTR5, $CTR6
1345    vpaddd two(%rip), $CTR5, $CTR
1346
1347    vmovdqa ($KS), $TMP3
1348    vpxor $TMP3, $CTR1, $CTR1
1349    vpxor $TMP3, $CTR2, $CTR2
1350    vpxor $TMP3, $CTR3, $CTR3
1351    vpxor $TMP3, $CTR4, $CTR4
1352    vpxor $TMP3, $CTR5, $CTR5
1353    vpxor $TMP3, $CTR6, $CTR6
1354
1355    vmovdqu 0*16-32($Htbl), $TMP3
1356    vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
1357    vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
1358    vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
1359    vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
1360    vpxor $TMP3, $TMP0, $TMP0
1361
1362    ${\$aes_round_dec->(1)}
1363    ${\$schoolbook->(1)}
1364
1365    ${\$aes_round_dec->(2)}
1366    ${\$schoolbook->(2)}
1367
1368    ${\$aes_round_dec->(3)}
1369    ${\$schoolbook->(3)}
1370
1371    ${\$aes_round_dec->(4)}
1372    ${\$schoolbook->(4)}
1373
1374    ${\$aes_round_dec->(5)}
1375    ${\$aes_round_dec->(6)}
1376    ${\$aes_round_dec->(7)}
1377
1378    vmovdqa 5*16-32($secureBuffer), $TMP5
1379    vpxor $T, $TMP5, $TMP5
1380    vmovdqu 5*16-32($Htbl), $TMP4
1381
1382    vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
1383    vpxor $TMP3, $TMP0, $TMP0
1384    vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
1385    vpxor $TMP3, $TMP1, $TMP1
1386    vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
1387    vpxor $TMP3, $TMP2, $TMP2
1388    vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
1389    vpxor $TMP3, $TMP0, $TMP0
1390
1391    ${\$aes_round_dec->(8)}
1392
1393    vpsrldq \$8, $TMP0, $TMP3
1394    vpxor $TMP3, $TMP1, $TMP4
1395    vpslldq \$8, $TMP0, $TMP3
1396    vpxor $TMP3, $TMP2, $T
1397
1398    vmovdqa poly(%rip), $TMP2
1399
1400    ${\$aes_round_dec->(9)}
1401___
1402
1403if ($aes256) {
1404$code.=<<___;
1405    ${\$aes_round_dec->(10)}
1406    ${\$aes_round_dec->(11)}
1407    ${\$aes_round_dec->(12)}
1408    ${\$aes_round_dec->(13)}
1409    vmovdqu 14*16($KS), $TMP5
1410___
1411} else {
1412$code.=<<___;
1413    vmovdqu 10*16($KS), $TMP5
1414___
1415}
1416
1417$code.=<<___;
1418    vpalignr \$8, $T, $T, $TMP1
1419    vpclmulqdq \$0x10, $TMP2, $T, $T
1420    vpxor $T, $TMP1, $T
1421
1422    vpxor 0*16($CT), $TMP5, $TMP3
1423    vaesenclast $TMP3, $CTR1, $CTR1
1424    vpxor 1*16($CT), $TMP5, $TMP3
1425    vaesenclast $TMP3, $CTR2, $CTR2
1426    vpxor 2*16($CT), $TMP5, $TMP3
1427    vaesenclast $TMP3, $CTR3, $CTR3
1428    vpxor 3*16($CT), $TMP5, $TMP3
1429    vaesenclast $TMP3, $CTR4, $CTR4
1430    vpxor 4*16($CT), $TMP5, $TMP3
1431    vaesenclast $TMP3, $CTR5, $CTR5
1432    vpxor 5*16($CT), $TMP5, $TMP3
1433    vaesenclast $TMP3, $CTR6, $CTR6
1434
1435    vpalignr \$8, $T, $T, $TMP1
1436    vpclmulqdq \$0x10, $TMP2, $T, $T
1437    vpxor $T, $TMP1, $T
1438
1439    vmovdqu $CTR1, 0*16($PT)
1440    vmovdqu $CTR2, 1*16($PT)
1441    vmovdqu $CTR3, 2*16($PT)
1442    vmovdqu $CTR4, 3*16($PT)
1443    vmovdqu $CTR5, 4*16($PT)
1444    vmovdqu $CTR6, 5*16($PT)
1445
1446    vpxor $TMP4, $T, $T
1447
1448    lea 96($CT), $CT
1449    lea 96($PT), $PT
1450    jmp .L${labelPrefix}_dec_loop1
1451
1452.L${labelPrefix}_dec_finish_96:
1453    vmovdqa $CTR6, $TMP5
1454    vmovdqa $CTR5, 1*16-32($secureBuffer)
1455    vmovdqa $CTR4, 2*16-32($secureBuffer)
1456    vmovdqa $CTR3, 3*16-32($secureBuffer)
1457    vmovdqa $CTR2, 4*16-32($secureBuffer)
1458    vmovdqa $CTR1, 5*16-32($secureBuffer)
1459
1460    vmovdqu 0*16-32($Htbl), $TMP3
1461    vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
1462    vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
1463    vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
1464    vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
1465    vpxor $TMP3, $TMP0, $TMP0
1466
1467    ${\$schoolbook->(1)}
1468    ${\$schoolbook->(2)}
1469    ${\$schoolbook->(3)}
1470    ${\$schoolbook->(4)}
1471
1472    vmovdqu 5*16-32($secureBuffer), $TMP5
1473    vpxor $T, $TMP5, $TMP5
1474    vmovdqu 5*16-32($Htbl), $TMP4
1475    vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
1476    vpxor $TMP3, $TMP1, $TMP1
1477    vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
1478    vpxor $TMP3, $TMP2, $TMP2
1479    vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
1480    vpxor $TMP3, $TMP0, $TMP0
1481    vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
1482    vpxor $TMP3, $TMP0, $TMP0
1483
1484    vpsrldq \$8, $TMP0, $TMP3
1485    vpxor $TMP3, $TMP1, $TMP4
1486    vpslldq \$8, $TMP0, $TMP3
1487    vpxor $TMP3, $TMP2, $T
1488
1489    vmovdqa poly(%rip), $TMP2
1490
1491    vpalignr \$8, $T, $T, $TMP1
1492    vpclmulqdq \$0x10, $TMP2, $T, $T
1493    vpxor $T, $TMP1, $T
1494
1495    vpalignr \$8, $T, $T, $TMP1
1496    vpclmulqdq \$0x10, $TMP2, $T, $T
1497    vpxor $T, $TMP1, $T
1498
1499    vpxor $TMP4, $T, $T
1500
1501.L${labelPrefix}_dec_loop2:
    # Decrypt any remaining whole blocks one at a time (CTR mode, so this is
    # the same keystream computation as encryption), folding each plaintext
    # block into the POLYVAL accumulator.

    # if there are no whole blocks left, we are done
1505    cmp \$16, $LEN
1506    jb .L${labelPrefix}_dec_out
1507    sub \$16, $LEN
1508
1509    vmovdqa $CTR, $TMP1
1510    vpaddd one(%rip), $CTR, $CTR
1511
1512    vpxor 0*16($KS), $TMP1, $TMP1
1513    vaesenc 1*16($KS), $TMP1, $TMP1
1514    vaesenc 2*16($KS), $TMP1, $TMP1
1515    vaesenc 3*16($KS), $TMP1, $TMP1
1516    vaesenc 4*16($KS), $TMP1, $TMP1
1517    vaesenc 5*16($KS), $TMP1, $TMP1
1518    vaesenc 6*16($KS), $TMP1, $TMP1
1519    vaesenc 7*16($KS), $TMP1, $TMP1
1520    vaesenc 8*16($KS), $TMP1, $TMP1
1521    vaesenc 9*16($KS), $TMP1, $TMP1
1522___
1523if ($aes256) {
1524$code.=<<___;
1525    vaesenc 10*16($KS), $TMP1, $TMP1
1526    vaesenc 11*16($KS), $TMP1, $TMP1
1527    vaesenc 12*16($KS), $TMP1, $TMP1
1528    vaesenc 13*16($KS), $TMP1, $TMP1
1529    vaesenclast 14*16($KS), $TMP1, $TMP1
1530___
1531} else {
1532$code.=<<___;
1533    vaesenclast 10*16($KS), $TMP1, $TMP1
1534___
1535}
1536
1537$code.=<<___;
1538    vpxor ($CT), $TMP1, $TMP1
1539    vmovdqu $TMP1, ($PT)
1540    addq \$16, $CT
1541    addq \$16, $PT
1542
1543    vpxor $TMP1, $T, $T
1544    vmovdqa -32($Htbl), $TMP0
1545    call GFMUL
1546
1547    jmp .L${labelPrefix}_dec_loop2
1548
1549.L${labelPrefix}_dec_out:
1550    vmovdqu $T, ($POL)
1551    ret
1552.cfi_endproc
1553___
1554
1555  if ($aes256) {
1556    $code.=<<___;
1557.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
1558___
1559  } else {
1560    $code.=<<___;
1561.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
1562___
1563  }
1564}
1565
1566aesgcmsiv_dec(0);  # emit 128-bit version
1567
1568sub aes128gcmsiv_ecb_enc_block {
1569  my $STATE_1 = "%xmm1";
1570  my $KSp = "%rdx";
1571
1572  # parameter 1: PT            %rdi    (pointer to 128 bit)
1573  # parameter 2: CT            %rsi    (pointer to 128 bit)
1574  # parameter 3: ks            %rdx    (pointer to ks)
1575  $code.=<<___;
1576.globl aes128gcmsiv_ecb_enc_block
1577.type aes128gcmsiv_ecb_enc_block,\@function,3
1578.align 16
1579aes128gcmsiv_ecb_enc_block:
1580.cfi_startproc
1581    vmovdqa (%rdi), $STATE_1
1582
1583    vpxor       ($KSp), $STATE_1, $STATE_1
1584    vaesenc 1*16($KSp), $STATE_1, $STATE_1
1585    vaesenc 2*16($KSp), $STATE_1, $STATE_1
1586    vaesenc 3*16($KSp), $STATE_1, $STATE_1
1587    vaesenc 4*16($KSp), $STATE_1, $STATE_1
1588    vaesenc 5*16($KSp), $STATE_1, $STATE_1
1589    vaesenc 6*16($KSp), $STATE_1, $STATE_1
1590    vaesenc 7*16($KSp), $STATE_1, $STATE_1
1591    vaesenc 8*16($KSp), $STATE_1, $STATE_1
1592    vaesenc 9*16($KSp), $STATE_1, $STATE_1
1593    vaesenclast 10*16($KSp), $STATE_1, $STATE_1    # STATE_1 == IV
1594
1595    vmovdqa $STATE_1, (%rsi)
1596
1597    ret
1598.cfi_endproc
1599.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
1600___
1601}
1602aes128gcmsiv_ecb_enc_block();
1603
1604sub aes256gcmsiv_aes_ks_enc_x1 {
1605  my $KS = "%rdx";
1606  my $KEYp = "%rcx";
1607  my $CON_MASK = "%xmm0";
1608  my $MASK_256 = "%xmm15";
1609  my $KEY_1 = "%xmm1";
1610  my $KEY_2 = "%xmm3";
1611  my $BLOCK1 = "%xmm8";
1612  my $AUX_REG = "%xmm14";
1613  my $PT = "%rdi";
1614  my $CT = "%rsi";
1615
1616  my $round_double = sub {
1617    my ($i, $j) = @_;
1618    return <<___;
1619    vpshufb %xmm15, %xmm3, %xmm2
1620    vaesenclast %xmm0, %xmm2, %xmm2
1621    vpslld \$1, %xmm0, %xmm0
1622    vpslldq \$4, %xmm1, %xmm4
1623    vpxor %xmm4, %xmm1, %xmm1
1624    vpslldq \$4, %xmm4, %xmm4
1625    vpxor %xmm4, %xmm1, %xmm1
1626    vpslldq \$4, %xmm4, %xmm4
1627    vpxor %xmm4, %xmm1, %xmm1
1628    vpxor %xmm2, %xmm1, %xmm1
1629    vaesenc %xmm1, $BLOCK1, $BLOCK1
1630    vmovdqu %xmm1, ${\eval(16*$i)}($KS)
1631
1632    vpshufd \$0xff, %xmm1, %xmm2
1633    vaesenclast %xmm14, %xmm2, %xmm2
1634    vpslldq \$4, %xmm3, %xmm4
1635    vpxor %xmm4, %xmm3, %xmm3
1636    vpslldq \$4, %xmm4, %xmm4
1637    vpxor %xmm4, %xmm3, %xmm3
1638    vpslldq \$4, %xmm4, %xmm4
1639    vpxor %xmm4, %xmm3, %xmm3
1640    vpxor %xmm2, %xmm3, %xmm3
1641    vaesenc %xmm3, $BLOCK1, $BLOCK1
1642    vmovdqu %xmm3, ${\eval(16*$j)}($KS)
1643___
1644  };
1645
1646  my $round_last = sub {
1647    my ($i) = @_;
1648    return <<___;
1649    vpshufb %xmm15, %xmm3, %xmm2
1650    vaesenclast %xmm0, %xmm2, %xmm2
1651    vpslldq \$4, %xmm1, %xmm4
1652    vpxor %xmm4, %xmm1, %xmm1
1653    vpslldq \$4, %xmm4, %xmm4
1654    vpxor %xmm4, %xmm1, %xmm1
1655    vpslldq \$4, %xmm4, %xmm4
1656    vpxor %xmm4, %xmm1, %xmm1
1657    vpxor %xmm2, %xmm1, %xmm1
1658    vaesenclast %xmm1, $BLOCK1, $BLOCK1
1659    vmovdqu %xmm1, ${\eval(16*$i)}($KS)
1660___
1661  };
1662
1663  # parameter 1: %rdi         Pointer to PT1
1664  # parameter 2: %rsi         Pointer to CT1
1665  # parameter 3: %rdx         Pointer to KS
1666  # parameter 4: %rcx         Pointer to initial key
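  #
  # Expands the 32-byte key into the 15-entry schedule at %rdx while
  # encrypting the block at %rdi on the fly; each $round_double emits two
  # round keys (SubWord+RotWord+Rcon for the first, SubWord only for the
  # second) and applies both to the block.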
1667  $code.=<<___;
1668.globl aes256gcmsiv_aes_ks_enc_x1
1669.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
1670.align 16
1671aes256gcmsiv_aes_ks_enc_x1:
1672.cfi_startproc
1673    vmovdqa con1(%rip), $CON_MASK    # CON_MASK  = 1,1,1,1
1674    vmovdqa mask(%rip), $MASK_256    # MASK_256
1675    vmovdqa ($PT), $BLOCK1
1676    vmovdqa ($KEYp), $KEY_1          # KEY_1 || KEY_2 [0..7] = user key
1677    vmovdqa 16($KEYp), $KEY_2
1678    vpxor $KEY_1, $BLOCK1, $BLOCK1
1679    vaesenc $KEY_2, $BLOCK1, $BLOCK1
1680    vmovdqu $KEY_1, ($KS)            # First round key
1681    vmovdqu $KEY_2, 16($KS)
1682    vpxor $AUX_REG, $AUX_REG, $AUX_REG
1683
1684    ${\$round_double->(2, 3)}
1685    ${\$round_double->(4, 5)}
1686    ${\$round_double->(6, 7)}
1687    ${\$round_double->(8, 9)}
1688    ${\$round_double->(10, 11)}
1689    ${\$round_double->(12, 13)}
1690    ${\$round_last->(14)}
1691    vmovdqa $BLOCK1, ($CT)
1692    ret
1693.cfi_endproc
1694.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
1695___
1696}
1697aes256gcmsiv_aes_ks_enc_x1();
1698
1699sub aes256gcmsiv_ecb_enc_block {
1700  my $STATE_1 = "%xmm1";
1701  my $PT = "%rdi";
1702  my $CT = "%rsi";
1703  my $KSp = "%rdx";
1704
1705  # parameter 1: PT            %rdi    (pointer to 128 bit)
1706  # parameter 2: CT            %rsi    (pointer to 128 bit)
1707  # parameter 3: ks            %rdx    (pointer to ks)
1708  $code.=<<___;
1709.globl aes256gcmsiv_ecb_enc_block
1710.type aes256gcmsiv_ecb_enc_block,\@function,3
1711.align 16
1712aes256gcmsiv_ecb_enc_block:
1713.cfi_startproc
1714    vmovdqa (%rdi), $STATE_1
1715    vpxor ($KSp), $STATE_1, $STATE_1
1716    vaesenc 1*16($KSp), $STATE_1, $STATE_1
1717    vaesenc 2*16($KSp), $STATE_1, $STATE_1
1718    vaesenc 3*16($KSp), $STATE_1, $STATE_1
1719    vaesenc 4*16($KSp), $STATE_1, $STATE_1
1720    vaesenc 5*16($KSp), $STATE_1, $STATE_1
1721    vaesenc 6*16($KSp), $STATE_1, $STATE_1
1722    vaesenc 7*16($KSp), $STATE_1, $STATE_1
1723    vaesenc 8*16($KSp), $STATE_1, $STATE_1
1724    vaesenc 9*16($KSp), $STATE_1, $STATE_1
1725    vaesenc 10*16($KSp), $STATE_1, $STATE_1
1726    vaesenc 11*16($KSp), $STATE_1, $STATE_1
1727    vaesenc 12*16($KSp), $STATE_1, $STATE_1
1728    vaesenc 13*16($KSp), $STATE_1, $STATE_1
1729    vaesenclast 14*16($KSp), $STATE_1, $STATE_1    # $STATE_1 == IV
1730    vmovdqa $STATE_1, (%rsi)
1731    ret
1732.cfi_endproc
1733.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
1734___
1735}
1736aes256gcmsiv_ecb_enc_block();
1737
1738sub aes256gcmsiv_enc_msg_x4 {
1739  my $CTR1 = "%xmm0";
1740  my $CTR2 = "%xmm1";
1741  my $CTR3 = "%xmm2";
1742  my $CTR4 = "%xmm3";
1743  my $ADDER = "%xmm4";
1744
1745  my $STATE1 = "%xmm5";
1746  my $STATE2 = "%xmm6";
1747  my $STATE3 = "%xmm7";
1748  my $STATE4 = "%xmm8";
1749
1750  my $TMP = "%xmm12";
1751  my $TMP2 = "%xmm13";
1752  my $TMP3 = "%xmm14";
1753  my $IV = "%xmm15";
1754
1755  my $PT = "%rdi";
1756  my $CT = "%rsi";
1757  my $TAG = "%rdx";
1758  my $KS = "%rcx";
1759  my $LEN = "%r8";
1760
1761  my $aes_round = sub {
1762    my ($i) = @_;
1763    return <<___;
1764    vmovdqu ${\eval($i*16)}($KS), $TMP
1765    vaesenc $TMP, $STATE1, $STATE1
1766    vaesenc $TMP, $STATE2, $STATE2
1767    vaesenc $TMP, $STATE3, $STATE3
1768    vaesenc $TMP, $STATE4, $STATE4
1769___
1770  };
1771
1772  my $aes_lastround = sub {
1773    my ($i) = @_;
1774    return <<___;
1775    vmovdqu ${\eval($i*16)}($KS), $TMP
1776    vaesenclast $TMP, $STATE1, $STATE1
1777    vaesenclast $TMP, $STATE2, $STATE2
1778    vaesenclast $TMP, $STATE3, $STATE3
1779    vaesenclast $TMP, $STATE4, $STATE4
1780___
1781  };
1782
1783  # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
1784  #                              unsigned char* TAG, unsigned char* KS,
1785  #                              size_t byte_len);
1786  # parameter 1: %rdi     #PT
1787  # parameter 2: %rsi     #CT
1788  # parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
1789  # parameter 4: %rcx     #KS
1790  # parameter 5: %r8      #LEN MSG_length in bytes
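  #
  # Note that, unlike the 128-bit x4 routine, the block count here is rounded
  # up when the length is not a multiple of 16, so a trailing partial block is
  # encrypted in full; the caller is presumably expected to provide padded
  # buffers.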
1791  $code.=<<___;
1792.globl aes256gcmsiv_enc_msg_x4
1793.type aes256gcmsiv_enc_msg_x4,\@function,5
1794.align 16
1795aes256gcmsiv_enc_msg_x4:
1796.cfi_startproc
1797    test $LEN, $LEN
1798    jnz .L256_enc_msg_x4_start
1799    ret
1800
1801.L256_enc_msg_x4_start:
1802    movq $LEN, %r10
1803    shrq \$4, $LEN                       # LEN = num of blocks
1804    shlq \$60, %r10
1805    jz .L256_enc_msg_x4_start2
1806    addq \$1, $LEN
1807
1808.L256_enc_msg_x4_start2:
1809    movq $LEN, %r10
1810    shlq \$62, %r10
1811    shrq \$62, %r10
1812
1813    # make IV from TAG
1814    vmovdqa ($TAG), $IV
1815    vpor OR_MASK(%rip), $IV, $IV        # IV = [1]TAG[126...32][00..00]
1816
1817    vmovdqa four(%rip), $ADDER          # Register to increment counters
1818    vmovdqa $IV, $CTR1                  # CTR1 = TAG[1][127...32][00..00]
1819    vpaddd one(%rip), $IV, $CTR2        # CTR2 = TAG[1][127...32][00..01]
1820    vpaddd two(%rip), $IV, $CTR3        # CTR3 = TAG[1][127...32][00..02]
1821    vpaddd three(%rip), $IV, $CTR4      # CTR4 = TAG[1][127...32][00..03]
1822
1823    shrq \$2, $LEN
1824    je .L256_enc_msg_x4_check_remainder
1825
1826    subq \$64, $CT
1827    subq \$64, $PT
1828
1829.L256_enc_msg_x4_loop1:
1830    addq \$64, $CT
1831    addq \$64, $PT
1832
1833    vmovdqa $CTR1, $STATE1
1834    vmovdqa $CTR2, $STATE2
1835    vmovdqa $CTR3, $STATE3
1836    vmovdqa $CTR4, $STATE4
1837
1838    vpxor ($KS), $STATE1, $STATE1
1839    vpxor ($KS), $STATE2, $STATE2
1840    vpxor ($KS), $STATE3, $STATE3
1841    vpxor ($KS), $STATE4, $STATE4
1842
1843    ${\$aes_round->(1)}
1844    vpaddd $ADDER, $CTR1, $CTR1
1845    ${\$aes_round->(2)}
1846    vpaddd $ADDER, $CTR2, $CTR2
1847    ${\$aes_round->(3)}
1848    vpaddd $ADDER, $CTR3, $CTR3
1849    ${\$aes_round->(4)}
1850    vpaddd $ADDER, $CTR4, $CTR4
1851
1852    ${\$aes_round->(5)}
1853    ${\$aes_round->(6)}
1854    ${\$aes_round->(7)}
1855    ${\$aes_round->(8)}
1856    ${\$aes_round->(9)}
1857    ${\$aes_round->(10)}
1858    ${\$aes_round->(11)}
1859    ${\$aes_round->(12)}
1860    ${\$aes_round->(13)}
1861    ${\$aes_lastround->(14)}
1862
1863    # XOR with Plaintext
1864    vpxor 0*16($PT), $STATE1, $STATE1
1865    vpxor 1*16($PT), $STATE2, $STATE2
1866    vpxor 2*16($PT), $STATE3, $STATE3
1867    vpxor 3*16($PT), $STATE4, $STATE4
1868
1869    subq \$1, $LEN
1870
1871    vmovdqu $STATE1, 0*16($CT)
1872    vmovdqu $STATE2, 1*16($CT)
1873    vmovdqu $STATE3, 2*16($CT)
1874    vmovdqu $STATE4, 3*16($CT)
1875
1876    jne .L256_enc_msg_x4_loop1
1877
1878    addq \$64, $CT
1879    addq \$64, $PT
1880
1881.L256_enc_msg_x4_check_remainder:
1882    cmpq \$0, %r10
1883    je .L256_enc_msg_x4_out
1884
1885.L256_enc_msg_x4_loop2:
1886    # encrypt each block separately
1887    # CTR1 is the highest counter (even if no LOOP done)
1888
1889    vmovdqa $CTR1, $STATE1
1890    vpaddd one(%rip), $CTR1, $CTR1      # inc counter
1891    vpxor ($KS), $STATE1, $STATE1
1892    vaesenc 16($KS), $STATE1, $STATE1
1893    vaesenc 32($KS), $STATE1, $STATE1
1894    vaesenc 48($KS), $STATE1, $STATE1
1895    vaesenc 64($KS), $STATE1, $STATE1
1896    vaesenc 80($KS), $STATE1, $STATE1
1897    vaesenc 96($KS), $STATE1, $STATE1
1898    vaesenc 112($KS), $STATE1, $STATE1
1899    vaesenc 128($KS), $STATE1, $STATE1
1900    vaesenc 144($KS), $STATE1, $STATE1
1901    vaesenc 160($KS), $STATE1, $STATE1
1902    vaesenc 176($KS), $STATE1, $STATE1
1903    vaesenc 192($KS), $STATE1, $STATE1
1904    vaesenc 208($KS), $STATE1, $STATE1
1905    vaesenclast 224($KS), $STATE1, $STATE1
1906
1907    # XOR with Plaintext
1908    vpxor ($PT), $STATE1, $STATE1
1909
1910    vmovdqu $STATE1, ($CT)
1911
1912    addq \$16, $PT
1913    addq \$16, $CT
1914
1915    subq \$1, %r10
1916    jne .L256_enc_msg_x4_loop2
1917
1918.L256_enc_msg_x4_out:
1919    ret
1920.cfi_endproc
1921.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
1922___
1923}
1924aes256gcmsiv_enc_msg_x4();
1925
sub aes256gcmsiv_enc_msg_x8 {
  my $STATE1 = "%xmm1";
  my $STATE2 = "%xmm2";
  my $STATE3 = "%xmm3";
  my $STATE4 = "%xmm4";
  my $STATE5 = "%xmm5";
  my $STATE6 = "%xmm6";
  my $STATE7 = "%xmm7";
  my $STATE8 = "%xmm8";
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm9";
  my $CTR3 = "%xmm10";
  my $CTR4 = "%xmm11";
  my $CTR5 = "%xmm12";
  my $CTR6 = "%xmm13";
  my $CTR7 = "%xmm14";
  my $TMP1 = "%xmm1";
  my $TMP2 = "%xmm2";
  my $KS = "%rcx";
  my $LEN = "%r8";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $SCHED = "%xmm15";

  my $aes_round8 = sub {
    my ($i) = @_;
    return <<___;
    vmovdqu ${\eval($i*16)}($KS), $SCHED
    vaesenc $SCHED, $STATE1, $STATE1
    vaesenc $SCHED, $STATE2, $STATE2
    vaesenc $SCHED, $STATE3, $STATE3
    vaesenc $SCHED, $STATE4, $STATE4
    vaesenc $SCHED, $STATE5, $STATE5
    vaesenc $SCHED, $STATE6, $STATE6
    vaesenc $SCHED, $STATE7, $STATE7
    vaesenc $SCHED, $STATE8, $STATE8
___
  };

  my $aes_lastround8 = sub {
    my ($i) = @_;
    return <<___;
    vmovdqu ${\eval($i*16)}($KS), $SCHED
    vaesenclast $SCHED, $STATE1, $STATE1
    vaesenclast $SCHED, $STATE2, $STATE2
    vaesenclast $SCHED, $STATE3, $STATE3
    vaesenclast $SCHED, $STATE4, $STATE4
    vaesenclast $SCHED, $STATE5, $STATE5
    vaesenclast $SCHED, $STATE6, $STATE6
    vaesenclast $SCHED, $STATE7, $STATE7
    vaesenclast $SCHED, $STATE8, $STATE8
___
  };

  # void aes256gcmsiv_enc_msg_x8(unsigned char* PT,
  #                              unsigned char* CT,
  #                              unsigned char* TAG,
  #                              unsigned char* KS,
  #                              size_t byte_len);
  # parameter 1: %rdi     #PT
  # parameter 2: %rsi     #CT
  # parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
  # parameter 4: %rcx     #KS
  # parameter 5: %r8      #LEN MSG_length in bytes
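  # As a rough reference for what the code below computes (this is not part of
  # the exported API), the whole routine is plain AES-256 CTR-mode encryption,
  # eight blocks per iteration of the main loop. A minimal C sketch, assuming
  # a hypothetical single-block helper aes256_encrypt_block(ks, in, out) and
  # ignoring the rounding-up of a partial final block:
  #
  #   static void ctr_ref(const uint8_t *PT, uint8_t *CT, const uint8_t TAG[16],
  #                       const uint8_t *KS, size_t byte_len) {
  #     uint8_t ctr[16];
  #     memcpy(ctr, TAG, 16);
  #     ctr[15] |= 0x80;                             // OR_MASK: force the top bit
  #     for (size_t i = 0; i < byte_len / 16; i++) {
  #       uint8_t keystream[16];
  #       aes256_encrypt_block(KS, ctr, keystream);  // 14-round AES-256
  #       for (size_t j = 0; j < 16; j++)
  #         CT[16 * i + j] = PT[16 * i + j] ^ keystream[j];
  #       uint32_t c;                                // increment only the low 32
  #       memcpy(&c, ctr, 4);                        // bits, little-endian,
  #       c++;                                       // wrapping mod 2^32
  #       memcpy(ctr, &c, 4);
  #     }
  #   }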
  $code.=<<___;
.globl aes256gcmsiv_enc_msg_x8
.type aes256gcmsiv_enc_msg_x8,\@function,5
.align 16
aes256gcmsiv_enc_msg_x8:
.cfi_startproc
    test $LEN, $LEN
    jnz .L256_enc_msg_x8_start
    ret

.L256_enc_msg_x8_start:
    # carve out a 64-byte-aligned, 16-byte scratch slot below the stack pointer
    # (it will hold the eighth counter, CTR8)
    movq %rsp, %r11
    subq \$16, %r11
    andq \$-64, %r11

    movq $LEN, %r10
    shrq \$4, $LEN                       # LEN = number of complete 16-byte blocks
    shlq \$60, %r10                      # non-zero iff there is a partial final block
    jz .L256_enc_msg_x8_start2
    addq \$1, $LEN                       # round the block count up

.L256_enc_msg_x8_start2:
    movq $LEN, %r10
    shlq \$61, %r10
    shrq \$61, %r10                      # %r10 = LEN mod 8 (single blocks after the x8 loop)
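    # Worked example (assumed input): for byte_len = 150, LEN = (150 >> 4) + 1
    # = 10 blocks (the low four bits are non-zero, so the count is rounded up)
    # and 10 mod 8 = 2, so one pass of the eight-block loop is followed by two
    # iterations of the single-block loop.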

    # Make IV from TAG
    vmovdqa ($TAG), $TMP1
    vpor OR_MASK(%rip), $TMP1, $TMP1    # TMP1 = IV = [1]TAG[126...32][00..00]

    # store counter8 on the stack
    vpaddd seven(%rip), $TMP1, $CTR1
    vmovdqa $CTR1, (%r11)                # CTR8 = TAG[127...32][00..07]
    vpaddd one(%rip), $TMP1, $CTR2       # CTR2 = TAG[127...32][00..01]
    vpaddd two(%rip), $TMP1, $CTR3       # CTR3 = TAG[127...32][00..02]
    vpaddd three(%rip), $TMP1, $CTR4     # CTR4 = TAG[127...32][00..03]
    vpaddd four(%rip), $TMP1, $CTR5      # CTR5 = TAG[127...32][00..04]
    vpaddd five(%rip), $TMP1, $CTR6      # CTR6 = TAG[127...32][00..05]
    vpaddd six(%rip), $TMP1, $CTR7       # CTR7 = TAG[127...32][00..06]
    vmovdqa $TMP1, $CTR1                 # CTR1 = TAG[127...32][00..00]
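    # STATE1-8, CTR1-7 and the round-key scratch register SCHED already occupy
    # all sixteen xmm registers, so the eighth counter is kept in the stack
    # slot at (%r11) and reloaded on every iteration of the loop below.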

    shrq \$3, $LEN                       # LEN = number of eight-block iterations
    jz .L256_enc_msg_x8_check_remainder

    subq \$128, $CT
    subq \$128, $PT

.L256_enc_msg_x8_loop1:
    addq \$128, $CT
    addq \$128, $PT

    vmovdqa $CTR1, $STATE1
    vmovdqa $CTR2, $STATE2
    vmovdqa $CTR3, $STATE3
    vmovdqa $CTR4, $STATE4
    vmovdqa $CTR5, $STATE5
    vmovdqa $CTR6, $STATE6
    vmovdqa $CTR7, $STATE7
    # load the eighth counter (CTR8) from the stack slot
    vmovdqa (%r11), $STATE8

    vpxor ($KS), $STATE1, $STATE1
    vpxor ($KS), $STATE2, $STATE2
    vpxor ($KS), $STATE3, $STATE3
    vpxor ($KS), $STATE4, $STATE4
    vpxor ($KS), $STATE5, $STATE5
    vpxor ($KS), $STATE6, $STATE6
    vpxor ($KS), $STATE7, $STATE7
    vpxor ($KS), $STATE8, $STATE8

    ${\$aes_round8->(1)}
    vmovdqa (%r11), $CTR7                # CTR8: load it,
    vpaddd eight(%rip), $CTR7, $CTR7     # advance it by eight
    vmovdqa $CTR7, (%r11)                # and store it back
    ${\$aes_round8->(2)}
    vpsubd one(%rip), $CTR7, $CTR7       # CTR7 = CTR8 - 1 (i.e. old CTR7 + 8)
    ${\$aes_round8->(3)}
    vpaddd eight(%rip), $CTR1, $CTR1
    ${\$aes_round8->(4)}
    vpaddd eight(%rip), $CTR2, $CTR2
    ${\$aes_round8->(5)}
    vpaddd eight(%rip), $CTR3, $CTR3
    ${\$aes_round8->(6)}
    vpaddd eight(%rip), $CTR4, $CTR4
    ${\$aes_round8->(7)}
    vpaddd eight(%rip), $CTR5, $CTR5
    ${\$aes_round8->(8)}
    vpaddd eight(%rip), $CTR6, $CTR6
    ${\$aes_round8->(9)}
    ${\$aes_round8->(10)}
    ${\$aes_round8->(11)}
    ${\$aes_round8->(12)}
    ${\$aes_round8->(13)}
    ${\$aes_lastround8->(14)}

    # XOR with Plaintext
    vpxor 0*16($PT), $STATE1, $STATE1
    vpxor 1*16($PT), $STATE2, $STATE2
    vpxor 2*16($PT), $STATE3, $STATE3
    vpxor 3*16($PT), $STATE4, $STATE4
    vpxor 4*16($PT), $STATE5, $STATE5
    vpxor 5*16($PT), $STATE6, $STATE6
    vpxor 6*16($PT), $STATE7, $STATE7
    vpxor 7*16($PT), $STATE8, $STATE8

    subq \$1, $LEN

    vmovdqu $STATE1, 0*16($CT)
    vmovdqu $STATE2, 1*16($CT)
    vmovdqu $STATE3, 2*16($CT)
    vmovdqu $STATE4, 3*16($CT)
    vmovdqu $STATE5, 4*16($CT)
    vmovdqu $STATE6, 5*16($CT)
    vmovdqu $STATE7, 6*16($CT)
    vmovdqu $STATE8, 7*16($CT)

    jne .L256_enc_msg_x8_loop1

    addq \$128, $CT
    addq \$128, $PT

.L256_enc_msg_x8_check_remainder:
    cmpq \$0, %r10
    je .L256_enc_msg_x8_out

.L256_enc_msg_x8_loop2:
    # encrypt any remaining blocks one at a time
    # CTR1 holds the next unused counter value (even if the x8 loop was not entered)
    vmovdqa $CTR1, $STATE1
    vpaddd one(%rip), $CTR1, $CTR1

    vpxor ($KS), $STATE1, $STATE1
    vaesenc 16($KS), $STATE1, $STATE1
    vaesenc 32($KS), $STATE1, $STATE1
    vaesenc 48($KS), $STATE1, $STATE1
    vaesenc 64($KS), $STATE1, $STATE1
    vaesenc 80($KS), $STATE1, $STATE1
    vaesenc 96($KS), $STATE1, $STATE1
    vaesenc 112($KS), $STATE1, $STATE1
    vaesenc 128($KS), $STATE1, $STATE1
    vaesenc 144($KS), $STATE1, $STATE1
    vaesenc 160($KS), $STATE1, $STATE1
    vaesenc 176($KS), $STATE1, $STATE1
    vaesenc 192($KS), $STATE1, $STATE1
    vaesenc 208($KS), $STATE1, $STATE1
    vaesenclast 224($KS), $STATE1, $STATE1

    # XOR with Plaintext
    vpxor ($PT), $STATE1, $STATE1

    vmovdqu $STATE1, ($CT)

    addq \$16, $PT
    addq \$16, $CT
    subq \$1, %r10
    jnz .L256_enc_msg_x8_loop2

.L256_enc_msg_x8_out:
    ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
___
}
aes256gcmsiv_enc_msg_x8();
aesgcmsiv_dec(1);

sub aes256gcmsiv_kdf {
  my $ONE = "%xmm8";
  my $BLOCK1 = "%xmm4";
  my $BLOCK2 = "%xmm6";
  my $BLOCK3 = "%xmm7";
  my $BLOCK4 = "%xmm11";
  my $BLOCK5 = "%xmm12";
  my $BLOCK6 = "%xmm13";

  my $enc_roundx6 = sub {
    my ($i, $j) = @_;
    return <<___;
    vmovdqa ${\eval($i*16)}(%rdx), $j
    vaesenc $j, $BLOCK1, $BLOCK1
    vaesenc $j, $BLOCK2, $BLOCK2
    vaesenc $j, $BLOCK3, $BLOCK3
    vaesenc $j, $BLOCK4, $BLOCK4
    vaesenc $j, $BLOCK5, $BLOCK5
    vaesenc $j, $BLOCK6, $BLOCK6
___
  };

  my $enc_roundlastx6 = sub {
    my ($i, $j) = @_;
    return <<___;
    vmovdqa ${\eval($i*16)}(%rdx), $j
    vaesenclast $j, $BLOCK1, $BLOCK1
    vaesenclast $j, $BLOCK2, $BLOCK2
    vaesenclast $j, $BLOCK3, $BLOCK3
    vaesenclast $j, $BLOCK4, $BLOCK4
    vaesenclast $j, $BLOCK5, $BLOCK5
    vaesenclast $j, $BLOCK6, $BLOCK6
___
  };

  # void aes256gcmsiv_kdf(const uint8_t nonce[16],
  #                       uint8_t *out_key_material,
  #                       const uint8_t *key_schedule);
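  # The routine writes six whole 16-byte AES-256 outputs (96 bytes) to
  # out_key_material. A hedged sketch of how a caller would then split that
  # buffer, following RFC 8452's derive_keys (the caller itself is not part of
  # this file, and the variable names below are illustrative): the 16-byte
  # record-authentication key is the first 8 bytes of blocks 0 and 1, and the
  # 32-byte record-encryption key is the first 8 bytes of blocks 2, 3, 4 and 5.
  #
  #   uint8_t auth_key[16], enc_key[32];
  #   memcpy(auth_key + 0, out_key_material + 0 * 16, 8);
  #   memcpy(auth_key + 8, out_key_material + 1 * 16, 8);
  #   for (int i = 0; i < 4; i++)
  #     memcpy(enc_key + 8 * i, out_key_material + (2 + i) * 16, 8);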
  $code.=<<___;
.globl aes256gcmsiv_kdf
.type aes256gcmsiv_kdf,\@function,3
.align 16
aes256gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi                         Pointer to NONCE
# parameter 2: %rsi                         Pointer to output key material
# parameter 3: %rdx                         Pointer to the AES-256 key schedule

    vmovdqa (%rdx), %xmm1                  # xmm1 = round key 0 (first 16 key bytes)
    vmovdqa 0*16(%rdi), $BLOCK1            # load the 16-byte nonce buffer
    vmovdqa and_mask(%rip), $BLOCK4
    vmovdqa one(%rip), $ONE
    vpshufd \$0x90, $BLOCK1, $BLOCK1
    vpand $BLOCK4, $BLOCK1, $BLOCK1        # BLOCK1 = [32-bit counter = 0 | 96-bit nonce]
    vpaddd $ONE, $BLOCK1, $BLOCK2          # BLOCK2..BLOCK6 use counters 1..5
    vpaddd $ONE, $BLOCK2, $BLOCK3
    vpaddd $ONE, $BLOCK3, $BLOCK4
    vpaddd $ONE, $BLOCK4, $BLOCK5
    vpaddd $ONE, $BLOCK5, $BLOCK6

    vpxor %xmm1, $BLOCK1, $BLOCK1
    vpxor %xmm1, $BLOCK2, $BLOCK2
    vpxor %xmm1, $BLOCK3, $BLOCK3
    vpxor %xmm1, $BLOCK4, $BLOCK4
    vpxor %xmm1, $BLOCK5, $BLOCK5
    vpxor %xmm1, $BLOCK6, $BLOCK6

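    # Round keys 1..14 are loaded alternately into %xmm1 and %xmm2 below,
    # presumably so the load of the next round key does not have to wait for
    # the six vaesenc instructions that are still reading the other register.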
    ${\$enc_roundx6->(1, "%xmm1")}
    ${\$enc_roundx6->(2, "%xmm2")}
    ${\$enc_roundx6->(3, "%xmm1")}
    ${\$enc_roundx6->(4, "%xmm2")}
    ${\$enc_roundx6->(5, "%xmm1")}
    ${\$enc_roundx6->(6, "%xmm2")}
    ${\$enc_roundx6->(7, "%xmm1")}
    ${\$enc_roundx6->(8, "%xmm2")}
    ${\$enc_roundx6->(9, "%xmm1")}
    ${\$enc_roundx6->(10, "%xmm2")}
    ${\$enc_roundx6->(11, "%xmm1")}
    ${\$enc_roundx6->(12, "%xmm2")}
    ${\$enc_roundx6->(13, "%xmm1")}
    ${\$enc_roundlastx6->(14, "%xmm2")}

    vmovdqa $BLOCK1, 0*16(%rsi)
    vmovdqa $BLOCK2, 1*16(%rsi)
    vmovdqa $BLOCK3, 2*16(%rsi)
    vmovdqa $BLOCK4, 3*16(%rsi)
    vmovdqa $BLOCK5, 4*16(%rsi)
    vmovdqa $BLOCK6, 5*16(%rsi)
    ret
.cfi_endproc
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
___
}
aes256gcmsiv_kdf();

print $code;

close STDOUT or die "error closing STDOUT: $!";