#!/usr/bin/env perl

# Copyright (c) 2017, Shay Gueron.
# Copyright (c) 2017, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
use warnings FATAL => 'all';

# Command line: [flavour] output-file
#
# The flavour (assembler dialect handed to x86_64-xlate.pl) is optional. When
# the first argument contains a dot it is taken to be the output file name and
# no flavour is in effect.
$flavour = shift;
$output  = shift;
if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }
die "no output file given\n" unless defined $output;

# Every warning is fatal in this script, so an absent flavour must never reach
# a pattern match or a string interpolation as undef. $flavour itself is left
# undefined so that any later defined() test keeps its meaning.
my $flavour_arg = defined $flavour ? $flavour : "";

$win64=0; $win64=1 if ($flavour_arg =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's path, including the trailing separator. It
# is the empty string when $0 carries no directory (script run from its own
# directory), rather than undef or a stale capture from an earlier match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator,
# which writes the final assembly to $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour_arg \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Constant pool (.data), followed by the switch to .text that the routines
# emitted after this point are placed in.
{
  # one .. eight: each label is followed by two quadwords, k and then 0.
  my @counter_labels = qw(one two three four five six seven eight);

  $code .= ".data\n\n.align 16\n";
  for my $k (1 .. scalar @counter_labels) {
    $code .= "$counter_labels[$k - 1]:\n.quad $k,0\n";
  }

  # poly is the memory operand of the vpclmulqdq reduction steps in the
  # POLYVAL routines. The remaining masks/constants are presumably consumed
  # by routines further down the file -- confirm against their users.
  $code .= <<'___';

OR_MASK:
.long 0x00000000,0x00000000,0x00000000,0x80000000
poly:
.quad 0x1, 0xc200000000000000
mask:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
.long 1,1,1,1
con2:
.long 0x1b,0x1b,0x1b,0x1b
con3:
.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
.long 0,0xffffffff, 0xffffffff, 0xffffffff
.text
___
}
748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub gfmul {
  # Appends the internal (non-exported) GFMUL routine to $code.
  #
  #   __m128i GFMUL(__m128i A, __m128i B);
  #
  # Register contract of the emitted routine:
  #   %xmm0        in: A, out: the product
  #   %xmm1        in: B, left unchanged
  #   %xmm2-%xmm5  scratch
  # It is declared abi-omnipotent and takes its operands directly in vector
  # registers, so it is only meant to be reached via `call GFMUL` from the
  # other routines in this file.

  my ($acc, $rhs, $lo, $mid, $tmp, $hi) = map { "%xmm$_" } 0 .. 5;

  # Four 64x64 carry-less partial products: low*low goes to $lo, high*high to
  # $hi, and the two cross terms are summed in $mid and then split across
  # $lo/$hi.
  my $multiply = <<___;
.type GFMUL,\@abi-omnipotent
.align 16
GFMUL:
.cfi_startproc
    vpclmulqdq  \$0x00, $rhs, $acc, $lo
    vpclmulqdq  \$0x11, $rhs, $acc, $hi
    vpclmulqdq  \$0x10, $rhs, $acc, $mid
    vpclmulqdq  \$0x01, $rhs, $acc, $tmp
    vpxor       $tmp, $mid, $mid
    vpslldq     \$8, $mid, $tmp
    vpsrldq     \$8, $mid, $mid
    vpxor       $tmp, $lo, $lo
    vpxor       $mid, $hi, $hi

___

  # One reduction round: multiply by the poly constant and combine with the
  # half-swapped value (vpshufd \$78 exchanges the two 64-bit halves). The
  # routine performs this round twice.
  my $fold = <<___;
    vpclmulqdq  \$0x10, poly(%rip), $lo, $mid
    vpshufd     \$78, $lo, $tmp
    vpxor       $tmp, $mid, $lo

___

  # Merge the reduced low half with the high half and return it in %xmm0.
  my $finish = <<___;
    vpxor       $hi, $lo, $acc
    ret
.cfi_endproc
.size GFMUL, .-GFMUL
___

  $code .= $multiply . ($fold x 2) . $finish;
}
gfmul();
1208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub aesgcmsiv_htable_init {
  # Appends aesgcmsiv_htable_init to $code. The emitted routine fills
  # |out_htable| with eight 16-byte entries: H, H^2, ..., H^8.
  #
  #   void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
  #
  # Both buffers are accessed with vmovdqa, the aligned move, so they need to
  # be 16-byte aligned. Each further power is obtained by calling GFMUL, which
  # multiplies %xmm0 by %xmm1 and leaves %xmm1 (H) untouched.

  my ($table, $key)  = ("%rdi", "%rsi");
  my ($power, $base) = ("%xmm0", "%xmm1");

  my $body = <<___;
.globl aesgcmsiv_htable_init
.type aesgcmsiv_htable_init,\@function,2
.align 16
aesgcmsiv_htable_init:
.cfi_startproc
    vmovdqa ($key), $power
    vmovdqa $power, $base
    vmovdqa $power, ($table)      # H
___

  # Entry for H^e lives at byte offset 16*(e-1). The operand is padded to 12
  # columns so the trailing comments line up in the generated assembly.
  for my $exp (2 .. 8) {
    my $slot = 16 * ($exp - 1) . "($table)";
    $body .= "    call GFMUL\n";
    $body .= sprintf("    vmovdqa %s, %-12s# H^%d\n", $power, $slot, $exp);
  }

  $body .= <<___;
    ret
.cfi_endproc
.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
___

  $code .= $body;
}
aesgcmsiv_htable_init();
1608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub aesgcmsiv_htable6_init {
  # Appends aesgcmsiv_htable6_init to $code. The emitted routine fills
  # |out_htable| with six 16-byte entries: H, H^2, ..., H^6.
  #
  #   void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
  #
  # Loads and stores use vmovdqa (aligned move), so both pointers need to be
  # 16-byte aligned. GFMUL multiplies %xmm0 by %xmm1 in place, which turns
  # the running value into the next power of H on every call.

  my ($out, $in) = ("%rdi", "%rsi");
  my ($acc, $h)  = ("%xmm0", "%xmm1");

  my $prologue = <<___;
.globl aesgcmsiv_htable6_init
.type aesgcmsiv_htable6_init,\@function,2
.align 16
aesgcmsiv_htable6_init:
.cfi_startproc
    vmovdqa ($in), $acc
    vmovdqa $acc, $h
    vmovdqa $acc, ($out)      # H
___

  # One multiply-and-store pair per remaining power; H^e is written at byte
  # offset 16*(e-1), with the operand padded so the comments stay aligned.
  my $powers = join "", map {
    sprintf("    call GFMUL\n    vmovdqa %s, %-12s# H^%d\n",
            $acc, 16 * ($_ - 1) . "($out)", $_)
  } 2 .. 6;

  my $epilogue = <<___;
    ret
.cfi_endproc
.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
___

  $code .= $prologue . $powers . $epilogue;
}
aesgcmsiv_htable6_init();
1968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub aesgcmsiv_htable_polyval {
  # Appends aesgcmsiv_htable_polyval to $code. The emitted routine folds LEN
  # bytes of input into the running POLYVAL value stored at T, using the
  # table of powers of H in Htbl.
  #
  # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
  # parameter 1: %rdi     Htable  - pointer to Htable
  # parameter 2: %rsi     INp     - pointer to input
  # parameter 3: %rdx     LEN     - length of BUFFER in bytes
  # parameter 4: %rcx     T       - pointer to POLYVAL output
  #
  # Notes on the emitted code:
  #  - LEN == 0 returns immediately without touching T.
  #  - LEN is expected to be a multiple of 16: the prefix loop counts the
  #    remainder (LEN mod 128) down in steps of 16 and only stops at exactly
  #    zero.
  #  - T is read with vmovdqa (aligned move) and so must be 16-byte aligned;
  #    the input is read with vmovdqu and may be unaligned.
  #  - Products are accumulated unreduced: $TMP0 collects the low*low terms,
  #    $TMP1 the high*high terms and $TMP2 the cross terms. After each batch
  #    they are combined into the pair $T (low half) / $Xhi (high half).
  #  - NOTE(review): the routine executes vzeroall and uses %xmm6, %xmm7 and
  #    %xmm9 without saving them. Those registers are callee-saved in the
  #    Win64 ABI -- confirm this file is never built for Windows.

  my $DATA = "%xmm0";
  my $hlp0 = "%r11";
  my $Htbl = "%rdi";
  my $inp = "%rsi";
  my $len = "%rdx";
  my $TMP0 = "%xmm3";
  my $TMP1 = "%xmm4";
  my $TMP2 = "%xmm5";
  my $TMP3 = "%xmm6";
  my $TMP4 = "%xmm7";
  my $Tp = "%rcx";
  my $T = "%xmm1";
  my $Xhi = "%xmm9";

  # Returns the text that multiplies $DATA by table entry $i (byte offset
  # 16*$i in Htbl) and adds the four partial products to the accumulators.
  # The offset is computed with plain arithmetic; no string eval is needed.
  my $SCHOOLBOOK_AAD = sub {
    my ($i)=@_;
    my $off = 16*$i;
    return <<___;
    vpclmulqdq \$0x01, ${off}($Htbl), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2
    vpclmulqdq \$0x00, ${off}($Htbl), $DATA, $TMP3
    vpxor $TMP3, $TMP0, $TMP0
    vpclmulqdq \$0x11, ${off}($Htbl), $DATA, $TMP3
    vpxor $TMP3, $TMP1, $TMP1
    vpclmulqdq \$0x10, ${off}($Htbl), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2
___
  };

  # The main loop consumes eight blocks per iteration. Block 7 is paired with
  # table entry 0 and block 0 with entry 7, and the two-step reduction of the
  # previous iteration's $T is interleaved with the multiplications.
  $code.=<<___;
.globl aesgcmsiv_htable_polyval
.type aesgcmsiv_htable_polyval,\@function,4
.align 16
aesgcmsiv_htable_polyval:
.cfi_startproc
    test  $len, $len
    jnz   .Lhtable_polyval_start
    ret

.Lhtable_polyval_start:
    vzeroall

    # We hash 8 blocks each iteration. If the total number of blocks is not a
    # multiple of 8, we first hash the leading n%8 blocks.
    movq $len, $hlp0
    andq \$127, $hlp0

    jz .Lhtable_polyval_no_prefix

    vpxor $Xhi, $Xhi, $Xhi
    vmovdqa ($Tp), $T
    sub $hlp0, $len

    sub \$16, $hlp0

    # hash first prefix block
    vmovdqu ($inp), $DATA
    vpxor $T, $DATA, $DATA

    vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
    vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
    vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
    vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2

    lea 16($inp), $inp
    test $hlp0, $hlp0
    jnz .Lhtable_polyval_prefix_loop
    jmp .Lhtable_polyval_prefix_complete

    # hash remaining prefix bocks (up to 7 total prefix blocks)
.align 64
.Lhtable_polyval_prefix_loop:
    sub \$16, $hlp0

    vmovdqu ($inp), $DATA           # next data block

    vpclmulqdq  \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor       $TMP3, $TMP0, $TMP0
    vpclmulqdq  \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor       $TMP3, $TMP1, $TMP1
    vpclmulqdq  \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor       $TMP3, $TMP2, $TMP2
    vpclmulqdq  \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor       $TMP3, $TMP2, $TMP2

    test $hlp0, $hlp0

    lea 16($inp), $inp

    jnz .Lhtable_polyval_prefix_loop

.Lhtable_polyval_prefix_complete:
    vpsrldq \$8, $TMP2, $TMP3
    vpslldq \$8, $TMP2, $TMP2

    vpxor $TMP3, $TMP1, $Xhi
    vpxor $TMP2, $TMP0, $T

    jmp .Lhtable_polyval_main_loop

.Lhtable_polyval_no_prefix:
    # At this point we know the number of blocks is a multiple of 8. However,
    # the reduction in the main loop includes a multiplication by x^(-128). In
    # order to counter this, the existing tag needs to be multipled by x^128.
    # In practice, this just means that it is loaded into $Xhi, not $T.
    vpxor $T, $T, $T
    vmovdqa ($Tp), $Xhi

.align 64
.Lhtable_polyval_main_loop:
    sub \$0x80, $len
    jb .Lhtable_polyval_out

    vmovdqu 16*7($inp), $DATA      # Ii

    vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
    vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
    vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
    vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2

    #########################################################
    vmovdqu 16*6($inp), $DATA
    ${\$SCHOOLBOOK_AAD->(1)}

    #########################################################
    vmovdqu 16*5($inp), $DATA

    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 1a
    vpalignr \$8, $T, $T, $T

    ${\$SCHOOLBOOK_AAD->(2)}

    vpxor $TMP4, $T, $T                              # reduction stage 1b
    #########################################################
    vmovdqu     16*4($inp), $DATA

    ${\$SCHOOLBOOK_AAD->(3)}
    #########################################################
    vmovdqu     16*3($inp), $DATA

    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 2a
    vpalignr \$8, $T, $T, $T

    ${\$SCHOOLBOOK_AAD->(4)}

    vpxor $TMP4, $T, $T                              # reduction stage 2b
    #########################################################
    vmovdqu 16*2($inp), $DATA

    ${\$SCHOOLBOOK_AAD->(5)}

    vpxor $Xhi, $T, $T                               # reduction finalize
    #########################################################
    vmovdqu 16*1($inp), $DATA

    ${\$SCHOOLBOOK_AAD->(6)}
    #########################################################
    vmovdqu 16*0($inp), $DATA
    vpxor $T, $DATA, $DATA

    ${\$SCHOOLBOOK_AAD->(7)}
    #########################################################
    vpsrldq \$8, $TMP2, $TMP3
    vpslldq \$8, $TMP2, $TMP2

    vpxor $TMP3, $TMP1, $Xhi
    vpxor $TMP2, $TMP0, $T

    lea 16*8($inp), $inp
    jmp .Lhtable_polyval_main_loop

    #########################################################

.Lhtable_polyval_out:
    vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
    vpalignr    \$8, $T, $T, $T
    vpxor       $TMP3, $T, $T

    vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
    vpalignr    \$8, $T, $T, $T
    vpxor       $TMP3, $T, $T
    vpxor       $Xhi, $T, $T

    vmovdqu $T, ($Tp)
    vzeroupper
    ret
.cfi_endproc
.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
___
}
aesgcmsiv_htable_polyval();
3968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub aesgcmsiv_polyval_horner {
  # Emits aesgcmsiv_polyval_horner, which folds a buffer of 16-byte blocks
  # into a POLYVAL accumulator one block at a time (Horner's rule):
  #
  #     for each block Xi:  T = (T xor Xi) * H
  #
  # The multiplication is delegated to GFMUL, which is defined elsewhere in
  # this file. The loop relies on GFMUL multiplying RES (%xmm0) by H (%xmm1)
  # in place and on it preserving %rdi, %rdx, %rcx and %r10 -- confirm
  # against GFMUL before changing the register assignment below.
  #
  #void aesgcmsiv_polyval_horner(unsigned char T[16],  // in/out accumulator
  #      const unsigned char* H, // H
  #      unsigned char* BUF,  // Buffer
  #      unsigned int blocks);  // Len2
  #
  # parameter 1: %rdi T - pointer to the POLYVAL accumulator (read, then
  #                       overwritten with the result)
  # parameter 2: %rsi Hp - pointer to H (user key)
  # parameter 3: %rdx INp - pointer to input
  # parameter 4: %rcx L - total number of blocks in input BUFFER
  #
  # Notes:
  #  * T and H are accessed with vmovdqa and therefore must be 16-byte
  #    aligned.
  #  * The block count is tested, shifted and compared as the full 64-bit
  #    %rcx, so the caller has to pass a value whose upper 32 bits are
  #    meaningful (i.e. a 64-bit count), despite the "unsigned int" in the
  #    prototype comment above.
  #  * A count of zero returns immediately and leaves T untouched.
  my $T = "%rdi";
  my $Hp = "%rsi";
  my $INp = "%rdx";
  my $L = "%rcx";
  my $LOC = "%r10";    # byte offset of the current block within the input
  my $H = "%xmm1";
  my $RES = "%xmm0";

  $code.=<<___;
.globl aesgcmsiv_polyval_horner
.type aesgcmsiv_polyval_horner,\@function,4
.align 16
aesgcmsiv_polyval_horner:
.cfi_startproc
    test $L, $L
    jnz .Lpolyval_horner_start
    ret

.Lpolyval_horner_start:
    # We will start with L GFMULS for POLYVAL(BIG_BUFFER)
    # RES = GFMUL(RES, H)

    xorq $LOC, $LOC
    shlq \$4, $L    # L contains number of bytes to process

    vmovdqa ($Hp), $H
    vmovdqa ($T), $RES

.Lpolyval_horner_loop:
    vpxor ($INp,$LOC), $RES, $RES  # RES = RES + Xi
    call GFMUL  # RES = RES * H

    add \$16, $LOC
    cmp $LOC, $L
    jne .Lpolyval_horner_loop

    # calculation of T is complete. RES=T
    vmovdqa $RES, ($T)
    ret
.cfi_endproc
.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
___
}
aesgcmsiv_polyval_horner();
4538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi - 16-byte user key (read with an unaligned load)
# parameter 2: %rsi - output key schedule (written with aligned stores)
#
# Round key 0 is a copy of the user key. The loop produces round keys 1..8
# using the constant loaded from con1 (doubled after every round); the two
# unrolled tails produce round keys 9 and 10 starting from con2. Eleven
# 16-byte round keys are written in total.
#
# The loop counter is decremented at the top of the loop body and the jne at
# the bottom consumes the resulting flags: every instruction in between is a
# vector instruction that leaves EFLAGS alone.
{
  # Running XOR of the four 32-bit words of %xmm1 (w0, w0^w1, w0^w1^w2, ...)
  # followed by the XOR with the transformed word held in %xmm2. The same
  # seven instructions are needed in the loop and in both unrolled tails.
  my $prefix_xor = join "\n",
    '    vpslldq $4, %xmm1, %xmm3',
    '    vpxor %xmm3, %xmm1, %xmm1',
    '    vpslldq $4, %xmm3, %xmm3',
    '    vpxor %xmm3, %xmm1, %xmm1',
    '    vpslldq $4, %xmm3, %xmm3',
    '    vpxor %xmm3, %xmm1, %xmm1',
    '    vpxor %xmm2, %xmm1, %xmm1';

  $code.=<<___;
.globl aes128gcmsiv_aes_ks
.type aes128gcmsiv_aes_ks,\@function,2
.align 16
aes128gcmsiv_aes_ks:
.cfi_startproc
    vmovdqu (%rdi), %xmm1           # xmm1 = user key
    vmovdqa %xmm1, (%rsi)           # rsi points to output

    vmovdqa con1(%rip), %xmm0
    vmovdqa mask(%rip), %xmm15

    movq \$8, %rax

.Lks128_loop:
    addq \$16, %rsi                 # rsi points for next key
    subq \$1, %rax
    vpshufb %xmm15, %xmm1, %xmm2    # xmm2 = shuffled user key
    vaesenclast %xmm0, %xmm2, %xmm2
    vpslld \$1, %xmm0, %xmm0
$prefix_xor
    vmovdqa %xmm1, (%rsi)
    jne .Lks128_loop

    vmovdqa con2(%rip), %xmm0
    vpshufb %xmm15, %xmm1, %xmm2
    vaesenclast %xmm0, %xmm2, %xmm2
    vpslld \$1, %xmm0, %xmm0
$prefix_xor
    vmovdqa %xmm1, 16(%rsi)

    vpshufb %xmm15, %xmm1, %xmm2
    vaesenclast %xmm0, %xmm2, %xmm2
$prefix_xor
    vmovdqa %xmm1, 32(%rsi)
    ret
.cfi_endproc
.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
___
}
5148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi - 32-byte user key (read with unaligned loads)
# parameter 2: %rsi - output key schedule (written with aligned stores, so
#                     it must be 16-byte aligned)
#
# The two halves of the user key are stored as round keys 0 and 1. Each of
# the six loop iterations advances the output pointer by 32 bytes and writes
# two further round keys; the unrolled tail writes one last round key.
# Fifteen 16-byte round keys are written in total.
#
# The loop counter is decremented at the top of the loop body and the jne at
# the bottom consumes the resulting flags: every instruction in between is a
# vector instruction that leaves EFLAGS alone.
$code.=<<___;
.globl aes256gcmsiv_aes_ks
.type aes256gcmsiv_aes_ks,\@function,2
.align 16
aes256gcmsiv_aes_ks:
.cfi_startproc
    vmovdqu (%rdi), %xmm1           # xmm1 = low half of user key
    vmovdqu 16(%rdi), %xmm3         # xmm3 = high half of user key
    vmovdqa %xmm1, (%rsi)
    vmovdqa %xmm3, 16(%rsi)
    vmovdqa con1(%rip), %xmm0
    vmovdqa mask(%rip), %xmm15
    vpxor %xmm14, %xmm14, %xmm14    # xmm14 = 0, used as an all-zero round key
    mov \$6, %rax

.Lks256_loop:
    add \$32, %rsi                  # rsi points to the next pair of keys
    subq \$1, %rax                  # flags are consumed by the jne below
    vpshufb %xmm15, %xmm3, %xmm2
    vaesenclast %xmm0, %xmm2, %xmm2
    vpslld \$1, %xmm0, %xmm0
    vpsllq \$32, %xmm1, %xmm4
    vpxor %xmm4, %xmm1, %xmm1
    vpshufb con3(%rip), %xmm1,  %xmm4
    vpxor %xmm4, %xmm1, %xmm1
    vpxor %xmm2, %xmm1, %xmm1
    vmovdqa %xmm1, (%rsi)
    vpshufd \$0xff, %xmm1, %xmm2
    vaesenclast %xmm14, %xmm2, %xmm2
    vpsllq \$32, %xmm3, %xmm4
    vpxor %xmm4, %xmm3, %xmm3
    vpshufb con3(%rip), %xmm3,  %xmm4
    vpxor %xmm4, %xmm3, %xmm3
    vpxor %xmm2, %xmm3, %xmm3
    vmovdqa %xmm3, 16(%rsi)
    jne .Lks256_loop

    vpshufb %xmm15, %xmm3, %xmm2
    vaesenclast %xmm0, %xmm2, %xmm2
    vpsllq \$32, %xmm1, %xmm4
    vpxor %xmm4, %xmm1, %xmm1
    vpshufb con3(%rip), %xmm1,  %xmm4
    vpxor %xmm4, %xmm1, %xmm1
    vpxor %xmm2, %xmm1, %xmm1
    vmovdqa %xmm1, 32(%rsi)
    ret
.cfi_endproc
.size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
___
5668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub aes128gcmsiv_aes_ks_enc_x1 {
  # Emits aes128gcmsiv_aes_ks_enc_x1, which expands an AES-128 key and
  # encrypts one 16-byte block with it in the same pass: every round key is
  # applied to the block as soon as it has been derived and is then stored
  # to the key-schedule buffer.
  #
  # All four pointers are accessed with vmovdqa and therefore must be
  # 16-byte aligned. Eleven round keys (offsets 0..160) are written.
  my $KS1_REGA = "%xmm1";   # current round key
  my $KS1_REGB = "%xmm2";   # transformed last word of the previous round key
  my $BLOCK1 = "%xmm4";     # AES state of the block being encrypted
  my $AUXREG = "%xmm3";     # scratch

  # Turns the round key in $reg into the next one: running XOR over its four
  # 32-bit words, then XOR with $reg2. $auxReg is clobbered.
  my $KS_BLOCK = sub {
    my ($reg, $reg2, $auxReg) = @_;
    return <<___;
    vpsllq \$32, $reg, $auxReg         #!!saving mov instruction to xmm3
    vpxor $auxReg, $reg, $reg
    vpshufb con3(%rip), $reg,  $auxReg
    vpxor $auxReg, $reg, $reg
    vpxor $reg2, $reg, $reg
___
  };

  # One key-schedule step plus one AES round. $round_no selects the slot
  # (16 * $round_no bytes from $ks_ptr) that receives the new round key.
  # The round constant in %xmm0 is doubled for the following round.
  my $round = sub {
    my ($round_no, $ks_ptr) = @_;
    return <<___;
    vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
    vaesenclast %xmm0, %xmm2, %xmm2
    vpslld \$1, %xmm0, %xmm0
    ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
    vaesenc %xmm1, $BLOCK1, $BLOCK1
    vmovdqa %xmm1, @{[16 * $round_no]}($ks_ptr)
___
  };

  # Same as $round, but for the final AES round: vaesenclast instead of
  # vaesenc, and the round constant is not advanced any further.
  my $roundlast = sub {
    my ($round_no, $ks_ptr) = @_;
    return <<___;
    vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
    vaesenclast %xmm0, %xmm2, %xmm2
    ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
    vaesenclast %xmm1, $BLOCK1, $BLOCK1
    vmovdqa %xmm1, @{[16 * $round_no]}($ks_ptr)
___
  };

  # parameter 1: %rdi                         Pointer to PT
  # parameter 2: %rsi                         Pointer to CT
  # parameter 3: %rdx                         Pointer to keys
  # parameter 4: %rcx                         Pointer to initial key
  $code.=<<___;
.globl aes128gcmsiv_aes_ks_enc_x1
.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes128gcmsiv_aes_ks_enc_x1:
.cfi_startproc
    vmovdqa (%rcx), %xmm1                 # xmm1 = first 16 bytes of random key
    vmovdqa 0*16(%rdi), $BLOCK1

    vmovdqa %xmm1, (%rdx)                 # KEY[0] = first 16 bytes of random key
    vpxor %xmm1, $BLOCK1, $BLOCK1

    vmovdqa con1(%rip), %xmm0             # xmm0  = 1,1,1,1
    vmovdqa mask(%rip), %xmm15            # xmm15 = mask

    ${\$round->(1, "%rdx")}
    ${\$round->(2, "%rdx")}
    ${\$round->(3, "%rdx")}
    ${\$round->(4, "%rdx")}
    ${\$round->(5, "%rdx")}
    ${\$round->(6, "%rdx")}
    ${\$round->(7, "%rdx")}
    ${\$round->(8, "%rdx")}

    vmovdqa con2(%rip), %xmm0

    ${\$round->(9, "%rdx")}
    ${\$roundlast->(10, "%rdx")}

    vmovdqa $BLOCK1, 0*16(%rsi)
    ret
.cfi_endproc
.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
___
}
aes128gcmsiv_aes_ks_enc_x1();
6478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
sub aes128gcmsiv_kdf {
  # Emits aes128gcmsiv_kdf, which builds four counter blocks from the nonce
  # and encrypts all four with an already expanded AES-128 key schedule.
  #
  # The nonce block is rearranged with vpshufd 0x90 (dwords 0,0,1,2 of the
  # input land in dwords 0,1,2,3) and masked with and_mask; blocks 2..4 are
  # obtained by repeatedly adding the constant `one`. and_mask and one are
  # defined elsewhere in this file -- presumably they clear, respectively
  # increment, the low dword that serves as the counter; verify there.
  #
  # The nonce, the output buffer and the key schedule are all accessed with
  # vmovdqa and therefore must be 16-byte aligned. 64 bytes are written to
  # the output buffer.
  my $BLOCK1 = "%xmm9";
  my $BLOCK2 = "%xmm10";
  my $BLOCK3 = "%xmm11";
  my $BLOCK4 = "%xmm12";
  my $ONE = "%xmm13";
  my $KSp = "%rdx";

  # One middle AES round over all four blocks: loads round key $round_no
  # from the key schedule into $key_reg and applies it.
  my $enc_roundx4 = sub {
    my ($round_no, $key_reg) = @_;
    return <<___;
    vmovdqa @{[$round_no * 16]}($KSp), $key_reg
    vaesenc $key_reg, $BLOCK1, $BLOCK1
    vaesenc $key_reg, $BLOCK2, $BLOCK2
    vaesenc $key_reg, $BLOCK3, $BLOCK3
    vaesenc $key_reg, $BLOCK4, $BLOCK4
___
  };

  # The final AES round over all four blocks.
  my $enc_roundlastx4 = sub {
    my ($round_no, $key_reg) = @_;
    return <<___;
    vmovdqa @{[$round_no * 16]}($KSp), $key_reg
    vaesenclast $key_reg, $BLOCK1, $BLOCK1
    vaesenclast $key_reg, $BLOCK2, $BLOCK2
    vaesenclast $key_reg, $BLOCK3, $BLOCK3
    vaesenclast $key_reg, $BLOCK4, $BLOCK4
___
  };

# void aes128gcmsiv_kdf(const uint8_t nonce[16],
#                       uint8_t *out_key_material,
#                       const uint8_t *key_schedule);
  $code.=<<___;
.globl aes128gcmsiv_kdf
.type aes128gcmsiv_kdf,\@function,3
.align 16
aes128gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi                         Pointer to NONCE
# parameter 2: %rsi                         Pointer to output key material
# parameter 3: %rdx                         Pointer to keys

    vmovdqa ($KSp), %xmm1                  # xmm1 = round key 0
    vmovdqa 0*16(%rdi), $BLOCK1
    vmovdqa and_mask(%rip), $BLOCK4
    vmovdqa one(%rip), $ONE
    vpshufd \$0x90, $BLOCK1, $BLOCK1
    vpand $BLOCK4, $BLOCK1, $BLOCK1
    vpaddd $ONE, $BLOCK1, $BLOCK2
    vpaddd $ONE, $BLOCK2, $BLOCK3
    vpaddd $ONE, $BLOCK3, $BLOCK4

    vpxor %xmm1, $BLOCK1, $BLOCK1
    vpxor %xmm1, $BLOCK2, $BLOCK2
    vpxor %xmm1, $BLOCK3, $BLOCK3
    vpxor %xmm1, $BLOCK4, $BLOCK4

    ${\$enc_roundx4->(1, "%xmm1")}
    ${\$enc_roundx4->(2, "%xmm2")}
    ${\$enc_roundx4->(3, "%xmm1")}
    ${\$enc_roundx4->(4, "%xmm2")}
    ${\$enc_roundx4->(5, "%xmm1")}
    ${\$enc_roundx4->(6, "%xmm2")}
    ${\$enc_roundx4->(7, "%xmm1")}
    ${\$enc_roundx4->(8, "%xmm2")}
    ${\$enc_roundx4->(9, "%xmm1")}
    ${\$enc_roundlastx4->(10, "%xmm2")}

    vmovdqa $BLOCK1, 0*16(%rsi)
    vmovdqa $BLOCK2, 1*16(%rsi)
    vmovdqa $BLOCK3, 2*16(%rsi)
    vmovdqa $BLOCK4, 3*16(%rsi)
    ret
.cfi_endproc
.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
___
}
aes128gcmsiv_kdf();
7308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloansub aes128gcmsiv_enc_msg_x4 {
7328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR1 = "%xmm0";
7338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR2 = "%xmm1";
7348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR3 = "%xmm2";
7358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR4 = "%xmm3";
7368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $ADDER = "%xmm4";
7378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE1 = "%xmm5";
7398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE2 = "%xmm6";
7408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE3 = "%xmm7";
7418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE4 = "%xmm8";
7428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP = "%xmm12";
7448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP2 = "%xmm13";
7458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP3 = "%xmm14";
7468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $IV = "%xmm15";
7478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $PT = "%rdi";
7498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CT = "%rsi";
7508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TAG = "%rdx";
7518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $KS = "%rcx";
7528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $LEN = "%r8";
7538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $aes_round = sub {
7558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    my ($i) = @_;
7568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    return <<___;
7578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu ${\eval($i*16)}($KS), $TMP
7588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $TMP, $STATE1, $STATE1
7598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $TMP, $STATE2, $STATE2
7608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $TMP, $STATE3, $STATE3
7618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $TMP, $STATE4, $STATE4
7628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
7638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  };
7648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $aes_lastround = sub {
7668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    my ($i) = @_;
7678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    return <<___;
7688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu ${\eval($i*16)}($KS), $TMP
7698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP, $STATE1, $STATE1
7708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP, $STATE2, $STATE2
7718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP, $STATE3, $STATE3
7728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP, $STATE4, $STATE4
7738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
7748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  };
7758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
7778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan#                              unsigned char* TAG, unsigned char* KS,
7788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan#                              size_t byte_len);
7798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# parameter 1: %rdi     #PT
7808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# parameter 2: %rsi     #CT
7818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
7828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# parameter 4: %rcx     #KS
7838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# parameter 5: %r8      #LEN MSG_length in bytes
7848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  $code.=<<___;
7858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.globl aes128gcmsiv_enc_msg_x4
7868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.type aes128gcmsiv_enc_msg_x4,\@function,5
7878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.align 16
7888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanaes128gcmsiv_enc_msg_x4:
7898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_startproc
7908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    test $LEN, $LEN
7918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jnz .L128_enc_msg_x4_start
7928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ret
7938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
7948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L128_enc_msg_x4_start:
7958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    pushq %r12
7968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_push %r12
7978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    pushq %r13
7988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_push %r13
7998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shrq \$4, $LEN      # LEN = num of blocks
8018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    movq $LEN, %r10
8028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shlq \$62, %r10
8038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shrq \$62, %r10
8048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # make IV from TAG
8068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa ($TAG), $IV
8078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpor OR_MASK(%rip), $IV, $IV  #IV = [1]TAG[126...32][00..00]
8088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu four(%rip), $ADDER     # Register to increment counters
8108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $IV, $CTR1             # CTR1 = TAG[1][127...32][00..00]
8118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $IV, $CTR2   # CTR2 = TAG[1][127...32][00..01]
8128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $IV, $CTR3   # CTR3 = TAG[1][127...32][00..02]
8138ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
8148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8158ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shrq \$2, $LEN
8168ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    je .L128_enc_msg_x4_check_remainder
8178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8188ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$64, $CT
8198ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$64, $PT
8208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8218ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L128_enc_msg_x4_loop1:
8228ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$64, $CT
8238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$64, $PT
8248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, $STATE1
8268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR2, $STATE2
8278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR3, $STATE3
8288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR4, $STATE4
8298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE1, $STATE1
8318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE2, $STATE2
8328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE3, $STATE3
8338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE4, $STATE4
8348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(1)}
8368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd $ADDER, $CTR1, $CTR1
8378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(2)}
8388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd $ADDER, $CTR2, $CTR2
8398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(3)}
8408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd $ADDER, $CTR3, $CTR3
8418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(4)}
8428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd $ADDER, $CTR4, $CTR4
8438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(5)}
8458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(6)}
8468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(7)}
8478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(8)}
8488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round->(9)}
8498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_lastround->(10)}
8508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # XOR with Plaintext
8528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 0*16($PT), $STATE1, $STATE1
8538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 1*16($PT), $STATE2, $STATE2
8548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 2*16($PT), $STATE3, $STATE3
8558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 3*16($PT), $STATE4, $STATE4
8568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$1, $LEN
8588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE1, 0*16($CT)
8608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE2, 1*16($CT)
8618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE3, 2*16($CT)
8628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE4, 3*16($CT)
8638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jne .L128_enc_msg_x4_loop1
8658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$64,$CT
8678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$64,$PT
8688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L128_enc_msg_x4_check_remainder:
8708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    cmpq \$0, %r10
8718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    je .L128_enc_msg_x4_out
8728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L128_enc_msg_x4_loop2:
8748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # enc each block separately
8758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # CTR1 is the highest counter (even if no LOOP done)
8768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, $STATE1
8778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR1, $CTR1  # inc counter
8788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE1, $STATE1
8808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 16($KS), $STATE1, $STATE1
8818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 32($KS), $STATE1, $STATE1
8828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 48($KS), $STATE1, $STATE1
8838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 64($KS), $STATE1, $STATE1
8848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 80($KS), $STATE1, $STATE1
8858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 96($KS), $STATE1, $STATE1
8868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 112($KS), $STATE1, $STATE1
8878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 128($KS), $STATE1, $STATE1
8888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 144($KS), $STATE1, $STATE1
8898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast 160($KS), $STATE1, $STATE1
8908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # XOR with plaintext
8928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($PT), $STATE1, $STATE1
8938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE1, ($CT)
8948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$16, $PT
8968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$16, $CT
8978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
8988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$1, %r10
8998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jne .L128_enc_msg_x4_loop2
9008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
9018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L128_enc_msg_x4_out:
9028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    popq %r13
9038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_pop %r13
9048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    popq %r12
9058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_pop %r12
9068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ret
9078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_endproc
9088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
9098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
9108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan}
9118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanaes128gcmsiv_enc_msg_x4();
9128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Appends the aes128gcmsiv_enc_msg_x8 routine to $code.
#
# The generated routine performs AES-128 counter-mode encryption: it encrypts
# successive counter blocks with the 11 round keys found at $KS (offsets 0 to
# 160) and XORs the result with the plaintext. The main loop handles eight
# 16-byte blocks per iteration; the remaining (blocks mod 8) blocks are handled
# one at a time. byte_len is shifted right by four up front, so a trailing
# partial block (byte_len mod 16) is not processed by this routine.
#
# All sixteen %xmm registers are assigned below (eight states, seven counters,
# one round-key scratch), which is why the eighth counter is kept in a stack
# slot at (%rsp) rather than in a register.
sub aes128gcmsiv_enc_msg_x8 {
  # Working state for the eight blocks of one main-loop iteration.
  my $STATE1 = "%xmm1";
  my $STATE2 = "%xmm2";
  my $STATE3 = "%xmm3";
  my $STATE4 = "%xmm4";
  my $STATE5 = "%xmm5";
  my $STATE6 = "%xmm6";
  my $STATE7 = "%xmm7";
  my $STATE8 = "%xmm8";

  # Counter blocks for lanes 1..7 (lane 8 lives at (%rsp)).
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm9";
  my $CTR3 = "%xmm10";
  my $CTR4 = "%xmm11";
  my $CTR5 = "%xmm12";
  my $CTR6 = "%xmm13";
  my $CTR7 = "%xmm14";
  # Scratch register that receives the round key for the current round.
  my $SCHED = "%xmm15";

  # Setup-only scratch holding the initial counter block. It deliberately
  # aliases $STATE1: its last read happens before $STATE1 is first written.
  my $TMP1 = "%xmm1";

  # Argument registers, in the order given by the prototype below.
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $KS = "%rcx";
  my $LEN = "%r8";

  # Returns the assembly text for one AES round over all eight states: load
  # round key $i (16 bytes each) from the key schedule into $SCHED, then apply
  # $insn ("vaesenc" or "vaesenclast") to every state register.
  my $round8 = sub {
    my ($insn, $i) = @_;
    my $offset = $i * 16;
    return <<___;
    vmovdqu ${offset}($KS), $SCHED
    $insn $SCHED, $STATE1, $STATE1
    $insn $SCHED, $STATE2, $STATE2
    $insn $SCHED, $STATE3, $STATE3
    $insn $SCHED, $STATE4, $STATE4
    $insn $SCHED, $STATE5, $STATE5
    $insn $SCHED, $STATE6, $STATE6
    $insn $SCHED, $STATE7, $STATE7
    $insn $SCHED, $STATE8, $STATE8
___
  };

  # Middle rounds (1..9) and the final round (10) of AES-128.
  my $aes_round8     = sub { return $round8->("vaesenc", @_); };
  my $aes_lastround8 = sub { return $round8->("vaesenclast", @_); };

# void aes128gcmsiv_enc_msg_x8(unsigned char* PT, unsigned char* CT,
#                              unsigned char* TAG, unsigned char* KS,
#                              size_t byte_len);
# parameter 1: %rdi     #PT
# parameter 2: %rsi     #CT
# parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
# parameter 4: %rcx     #KS
# parameter 5: %r8      #LEN MSG_length in bytes
#
# Notes on the generated code:
#  * %r10 receives (number of blocks) mod 8 via the shl/shr-by-61 pair and
#    drives the one-block-at-a-time tail loop.
#  * %rbp is used as a frame pointer so that %rsp can be lowered and rounded
#    down to a 64-byte boundary; only the first 16 bytes of that area are
#    used (the lane-8 counter).
#  * Counter updates are interleaved with the AES rounds. During rounds 1-2
#    $CTR7 doubles as scratch: it is loaded with the lane-8 counter, advanced
#    by `eight` and stored back, and then `one` is subtracted to leave the
#    next lane-7 counter in the register. This assumes the one/six/seven/eight
#    constants (defined outside this sub) hold those values - confirm there.
#  * %r12 and %r13 are saved and restored although nothing in this routine
#    references them.
#  * Every CFI directive names its register with a leading '%'.
  $code.=<<___;
.globl aes128gcmsiv_enc_msg_x8
.type aes128gcmsiv_enc_msg_x8,\@function,5
.align 16
aes128gcmsiv_enc_msg_x8:
.cfi_startproc
    test $LEN, $LEN
    jnz .L128_enc_msg_x8_start
    ret

.L128_enc_msg_x8_start:
    pushq %r12
.cfi_push %r12
    pushq %r13
.cfi_push %r13
    pushq %rbp
.cfi_push %rbp
    movq %rsp, %rbp
.cfi_def_cfa_register %rbp

    # Place in stack
    subq \$128, %rsp
    andq \$-64, %rsp

    shrq \$4, $LEN  # LEN = num of blocks
    movq $LEN, %r10
    shlq \$61, %r10
    shrq \$61, %r10

    # make IV from TAG
    vmovdqu ($TAG), $TMP1
    vpor OR_MASK(%rip), $TMP1, $TMP1  # TMP1= IV = [1]TAG[126...32][00..00]

    # store counter8 in the stack
    vpaddd seven(%rip), $TMP1, $CTR1
    vmovdqu $CTR1, (%rsp)             # CTR8 = TAG[127...32][00..07]
    vpaddd one(%rip), $TMP1, $CTR2    # CTR2 = TAG[127...32][00..01]
    vpaddd two(%rip), $TMP1, $CTR3    # CTR3 = TAG[127...32][00..02]
    vpaddd three(%rip), $TMP1, $CTR4  # CTR4 = TAG[127...32][00..03]
    vpaddd four(%rip), $TMP1, $CTR5   # CTR5 = TAG[127...32][00..04]
    vpaddd five(%rip), $TMP1, $CTR6   # CTR6 = TAG[127...32][00..05]
    vpaddd six(%rip), $TMP1, $CTR7    # CTR7 = TAG[127...32][00..06]
    vmovdqa $TMP1, $CTR1              # CTR1 = TAG[127...32][00..00]

    shrq \$3, $LEN
    je .L128_enc_msg_x8_check_remainder

    subq \$128, $CT
    subq \$128, $PT

.L128_enc_msg_x8_loop1:
    addq \$128, $CT
    addq \$128, $PT

    vmovdqa $CTR1, $STATE1
    vmovdqa $CTR2, $STATE2
    vmovdqa $CTR3, $STATE3
    vmovdqa $CTR4, $STATE4
    vmovdqa $CTR5, $STATE5
    vmovdqa $CTR6, $STATE6
    vmovdqa $CTR7, $STATE7
    # move from stack
    vmovdqu (%rsp), $STATE8

    vpxor ($KS), $STATE1, $STATE1
    vpxor ($KS), $STATE2, $STATE2
    vpxor ($KS), $STATE3, $STATE3
    vpxor ($KS), $STATE4, $STATE4
    vpxor ($KS), $STATE5, $STATE5
    vpxor ($KS), $STATE6, $STATE6
    vpxor ($KS), $STATE7, $STATE7
    vpxor ($KS), $STATE8, $STATE8

    ${\$aes_round8->(1)}
    vmovdqu (%rsp), $CTR7  # deal with CTR8
    vpaddd eight(%rip), $CTR7, $CTR7
    vmovdqu $CTR7, (%rsp)
    ${\$aes_round8->(2)}
    vpsubd one(%rip), $CTR7, $CTR7
    ${\$aes_round8->(3)}
    vpaddd eight(%rip), $CTR1, $CTR1
    ${\$aes_round8->(4)}
    vpaddd eight(%rip), $CTR2, $CTR2
    ${\$aes_round8->(5)}
    vpaddd eight(%rip), $CTR3, $CTR3
    ${\$aes_round8->(6)}
    vpaddd eight(%rip), $CTR4, $CTR4
    ${\$aes_round8->(7)}
    vpaddd eight(%rip), $CTR5, $CTR5
    ${\$aes_round8->(8)}
    vpaddd eight(%rip), $CTR6, $CTR6
    ${\$aes_round8->(9)}
    ${\$aes_lastround8->(10)}

    # XOR with Plaintext
    vpxor 0*16($PT), $STATE1, $STATE1
    vpxor 1*16($PT), $STATE2, $STATE2
    vpxor 2*16($PT), $STATE3, $STATE3
    vpxor 3*16($PT), $STATE4, $STATE4
    vpxor 4*16($PT), $STATE5, $STATE5
    vpxor 5*16($PT), $STATE6, $STATE6
    vpxor 6*16($PT), $STATE7, $STATE7
    vpxor 7*16($PT), $STATE8, $STATE8

    dec $LEN

    vmovdqu $STATE1, 0*16($CT)
    vmovdqu $STATE2, 1*16($CT)
    vmovdqu $STATE3, 2*16($CT)
    vmovdqu $STATE4, 3*16($CT)
    vmovdqu $STATE5, 4*16($CT)
    vmovdqu $STATE6, 5*16($CT)
    vmovdqu $STATE7, 6*16($CT)
    vmovdqu $STATE8, 7*16($CT)

    jne .L128_enc_msg_x8_loop1

    addq \$128, $CT
    addq \$128, $PT

.L128_enc_msg_x8_check_remainder:
    cmpq \$0, %r10
    je .L128_enc_msg_x8_out

.L128_enc_msg_x8_loop2:
    # enc each block separately
    # CTR1 is the highest counter (even if no LOOP done)
    vmovdqa $CTR1, $STATE1
    vpaddd one(%rip), $CTR1, $CTR1  # inc counter

    vpxor ($KS), $STATE1, $STATE1
    vaesenc 16($KS), $STATE1, $STATE1
    vaesenc 32($KS), $STATE1, $STATE1
    vaesenc 48($KS), $STATE1, $STATE1
    vaesenc 64($KS), $STATE1, $STATE1
    vaesenc 80($KS), $STATE1, $STATE1
    vaesenc 96($KS), $STATE1, $STATE1
    vaesenc 112($KS), $STATE1, $STATE1
    vaesenc 128($KS), $STATE1, $STATE1
    vaesenc 144($KS), $STATE1, $STATE1
    vaesenclast 160($KS), $STATE1, $STATE1

    # XOR with Plaintext
    vpxor ($PT), $STATE1, $STATE1

    vmovdqu $STATE1, ($CT)

    addq \$16, $PT
    addq \$16, $CT

    decq %r10
    jne .L128_enc_msg_x8_loop2

.L128_enc_msg_x8_out:
    movq %rbp, %rsp
.cfi_def_cfa_register %rsp
    popq %rbp
.cfi_pop %rbp
    popq %r13
.cfi_pop %r13
    popq %r12
.cfi_pop %r12
    ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
___
}
# Emit the routine into $code as soon as the generator is defined.
aes128gcmsiv_enc_msg_x8();
11498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
11508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloansub aesgcmsiv_dec {
11518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my ($aes256) = @_;
11528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
11538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $T = "%xmm0";
11548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP0 = "%xmm1";
11558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP1 = "%xmm2";
11568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP2 = "%xmm3";
11578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP3 = "%xmm4";
11588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP4 = "%xmm5";
11598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP5 = "%xmm6";
11608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR1 = "%xmm7";
11618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR2 = "%xmm8";
11628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR3 = "%xmm9";
11638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR4 = "%xmm10";
11648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR5 = "%xmm11";
11658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR6 = "%xmm12";
11668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR = "%xmm15";
11678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CT = "%rdi";
11688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $PT = "%rsi";
11698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $POL = "%rdx";
11708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $Htbl = "%rcx";
11718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $KS = "%r8";
11728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $LEN = "%r9";
11738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $secureBuffer = "%rax";
11748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $HTABLE_ROUNDS = "%xmm13";
11758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
11768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $labelPrefix = "128";
11778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  if ($aes256) {
11788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    $labelPrefix = "256";
11798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  }
11808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# $aes_round_dec->($i) returns the assembly text for one non-final AES round
# applied to all six counter blocks.
#
#   $i   round number; the round key is read from byte offset $i*16 of the
#        key schedule addressed by $KS.
#
# The text loads that round key into $TMP3 with an unaligned load (vmovdqu)
# and then issues one vaesenc per counter register ($CTR1..$CTR6), each
# updated in place.  Every emitted line has the four-space indent used by the
# surrounding heredocs and ends with a newline, so the result can be spliced
# directly into the text appended to $code.
my $aes_round_dec = sub {
  my ($i) = @_;

  # Plain arithmetic is enough for the displacement; no string eval needed.
  my $key_offset = $i * 16;

  # The key load must precede the rounds: all six vaesenc lines read $TMP3.
  my $asm = "    vmovdqu $key_offset($KS), $TMP3\n";
  for my $block ($CTR1, $CTR2, $CTR3, $CTR4, $CTR5, $CTR6) {
    $asm .= "    vaesenc $TMP3, $block, $block\n";
  }
  return $asm;
};
11938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# $aes_lastround_dec->($i) returns the assembly text for the final AES round
# applied to all six counter blocks.
#
#   $i   index of the last round key; the key is read from byte offset $i*16
#        of the key schedule addressed by $KS.  The visible callers pass 10
#        for the 128-bit variant and 14 for the 256-bit variant.
#
# The layout mirrors the non-final round helper: one unaligned load of the
# round key into $TMP3, followed by one vaesenclast per counter register
# ($CTR1..$CTR6), each updated in place.  Every line is indented by four
# spaces and newline-terminated so it can be spliced into a heredoc.
my $aes_lastround_dec = sub {
  my ($i) = @_;

  # Plain arithmetic is enough for the displacement; no string eval needed.
  my $key_offset = $i * 16;

  # The key load must precede the rounds: all six lines read $TMP3.
  my $asm = "    vmovdqu $key_offset($KS), $TMP3\n";
  for my $block ($CTR1, $CTR2, $CTR3, $CTR4, $CTR5, $CTR6) {
    $asm .= "    vaesenclast $TMP3, $block, $block\n";
  }
  return $asm;
};
12068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# $schoolbook->($i) returns the assembly text for one four-product
# ("schoolbook") carry-less multiplication step, accumulated into the running
# partial sums.
#
#   $i   slot index; both operands are read at displacement $i*16-32, one
#        from the buffer addressed by $secureBuffer (into $TMP5) and one from
#        the table addressed by $Htbl (into $HTABLE_ROUNDS).  The -32 bias
#        matches the generated prologue, which advances both pointers by 32
#        bytes.  The visible callers pass 1 through 4.
#
# Four vpclmulqdq products are formed in $TMP3 and XORed into accumulators:
#   selector 0x10 and 0x01 (mixed halves)  -> $TMP0
#   selector 0x11 (high halves)            -> $TMP1
#   selector 0x00 (low halves)             -> $TMP2
# The accumulators are only XORed into, never initialised, so the caller must
# have set $TMP0..$TMP2 beforehand.  Every line is indented by four spaces
# and newline-terminated; a blank line separates the loads from the products.
my $schoolbook = sub {
  my ($i) = @_;

  # Plain arithmetic is enough for the displacement; no string eval needed.
  my $disp = $i * 16 - 32;

  # (selector, accumulator) pairs in emission order.  Selectors are kept as
  # strings so they appear in the output exactly as written.
  my @products = (
    ["0x10", $TMP0],
    ["0x11", $TMP1],
    ["0x00", $TMP2],
    ["0x01", $TMP0],
  );

  my $asm = "    vmovdqu $disp($secureBuffer), $TMP5\n"
          . "    vmovdqu $disp($Htbl), $HTABLE_ROUNDS\n"
          . "\n";
  for my $product (@products) {
    my ($selector, $acc) = @$product;
    $asm .= "    vpclmulqdq \$$selector, $HTABLE_ROUNDS, $TMP5, $TMP3\n";
    $asm .= "    vpxor $TMP3, $acc, $acc\n";
  }
  return $asm;
};
12238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  if ($aes256) {
12258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    $code.=<<___;
12268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.globl aes256gcmsiv_dec
12278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.type aes256gcmsiv_dec,\@function,6
12288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.align 16
12298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanaes256gcmsiv_dec:
12308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
12318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  } else {
12328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    $code.=<<___;
12338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.globl aes128gcmsiv_dec
12348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.type aes128gcmsiv_dec,\@function,6
12358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.align 16
12368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanaes128gcmsiv_dec:
12378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
12388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  }
12398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  $code.=<<___;
12418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_startproc
12428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    test \$~15, $LEN
12438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jnz .L${labelPrefix}_dec_start
12448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ret
12458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L${labelPrefix}_dec_start:
12478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vzeroupper
12488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa ($POL), $T
12498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    movq $POL, $secureBuffer
12508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    leaq 32($secureBuffer), $secureBuffer
12528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    leaq 32($Htbl), $Htbl
12538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # make CTRBLKs from given tag.
12558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu ($CT,$LEN), $CTR
12568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpor OR_MASK(%rip), $CTR, $CTR      # CTR = [1]TAG[126...32][00..00]
12578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    andq \$~15, $LEN
12588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # If less then 6 blocks, make singles
12608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    cmp \$96, $LEN
12618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jb .L${labelPrefix}_dec_loop2
12628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # Decrypt the first six blocks
12648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    sub \$96, $LEN
12658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR, $CTR1
12668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR1, $CTR2
12678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $CTR1, $CTR3
12688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR3, $CTR4
12698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $CTR3, $CTR5
12708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR5, $CTR6
12718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $CTR5, $CTR
12728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $CTR1, $CTR1
12748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $CTR2, $CTR2
12758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $CTR3, $CTR3
12768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $CTR4, $CTR4
12778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $CTR5, $CTR5
12788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $CTR6, $CTR6
12798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(1)}
12818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(2)}
12828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(3)}
12838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(4)}
12848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(5)}
12858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(6)}
12868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(7)}
12878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(8)}
12888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(9)}
12898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
12908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
12918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanif ($aes256) {
12928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
12938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(10)}
12948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(11)}
12958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(12)}
12968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(13)}
12978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_lastround_dec->(14)}
12988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
12998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan} else {
13008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
13018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_lastround_dec->(10)}
13028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
13038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan}
13048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
13068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # XOR with CT
13078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 0*16($CT), $CTR1, $CTR1
13088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 1*16($CT), $CTR2, $CTR2
13098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 2*16($CT), $CTR3, $CTR3
13108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 3*16($CT), $CTR4, $CTR4
13118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 4*16($CT), $CTR5, $CTR5
13128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 5*16($CT), $CTR6, $CTR6
13138ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR1, 0*16($PT)
13158ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR2, 1*16($PT)
13168ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR3, 2*16($PT)
13178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR4, 3*16($PT)
13188ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR5, 4*16($PT)
13198ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR6, 5*16($PT)
13208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13218ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$96, $CT
13228ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$96, $PT
13238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jmp .L${labelPrefix}_dec_loop1
13248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan# Decrypt 6 blocks each time while hashing previous 6 blocks
13268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.align 64
13278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L${labelPrefix}_dec_loop1:
13288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    cmp \$96, $LEN
13298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jb .L${labelPrefix}_dec_finish_96
13308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    sub \$96, $LEN
13318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR6, $TMP5
13338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR5, 1*16-32($secureBuffer)
13348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR4, 2*16-32($secureBuffer)
13358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR3, 3*16-32($secureBuffer)
13368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR2, 4*16-32($secureBuffer)
13378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, 5*16-32($secureBuffer)
13388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR, $CTR1
13408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR1, $CTR2
13418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $CTR1, $CTR3
13428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR3, $CTR4
13438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $CTR3, $CTR5
13448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR5, $CTR6
13458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $CTR5, $CTR
13468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa ($KS), $TMP3
13488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $CTR1, $CTR1
13498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $CTR2, $CTR2
13508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $CTR3, $CTR3
13518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $CTR4, $CTR4
13528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $CTR5, $CTR5
13538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $CTR6, $CTR6
13548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 0*16-32($Htbl), $TMP3
13568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
13578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
13588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
13598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
13608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP0, $TMP0
13618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(1)}
13638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(1)}
13648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(2)}
13668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(2)}
13678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(3)}
13698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(3)}
13708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(4)}
13728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(4)}
13738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(5)}
13758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(6)}
13768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(7)}
13778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa 5*16-32($secureBuffer), $TMP5
13798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $T, $TMP5, $TMP5
13808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 5*16-32($Htbl), $TMP4
13818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
13838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP0, $TMP0
13848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
13858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP1, $TMP1
13868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
13878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP2, $TMP2
13888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
13898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP0, $TMP0
13908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(8)}
13928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpsrldq \$8, $TMP0, $TMP3
13948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP1, $TMP4
13958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpslldq \$8, $TMP0, $TMP3
13968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP2, $T
13978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
13988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa poly(%rip), $TMP2
13998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(9)}
14018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
14028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanif ($aes256) {
14048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
14058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(10)}
14068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(11)}
14078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(12)}
14088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round_dec->(13)}
14098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 14*16($KS), $TMP5
14108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
14118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan} else {
14128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
14138ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 10*16($KS), $TMP5
14148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
14158ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan}
14168ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
14188ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpalignr \$8, $T, $T, $TMP1
14198ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP2, $T, $T
14208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $T, $TMP1, $T
14218ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14228ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 0*16($CT), $TMP5, $TMP3
14238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP3, $CTR1, $CTR1
14248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 1*16($CT), $TMP5, $TMP3
14258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP3, $CTR2, $CTR2
14268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 2*16($CT), $TMP5, $TMP3
14278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP3, $CTR3, $CTR3
14288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 3*16($CT), $TMP5, $TMP3
14298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP3, $CTR4, $CTR4
14308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 4*16($CT), $TMP5, $TMP3
14318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP3, $CTR5, $CTR5
14328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 5*16($CT), $TMP5, $TMP3
14338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $TMP3, $CTR6, $CTR6
14348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpalignr \$8, $T, $T, $TMP1
14368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP2, $T, $T
14378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $T, $TMP1, $T
14388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR1, 0*16($PT)
14408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR2, 1*16($PT)
14418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR3, 2*16($PT)
14428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR4, 3*16($PT)
14438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR5, 4*16($PT)
14448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $CTR6, 5*16($PT)
14458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP4, $T, $T
14478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    lea 96($CT), $CT
14498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    lea 96($PT), $PT
14508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jmp .L${labelPrefix}_dec_loop1
14518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L${labelPrefix}_dec_finish_96:
14538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR6, $TMP5
14548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR5, 1*16-32($secureBuffer)
14558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR4, 2*16-32($secureBuffer)
14568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR3, 3*16-32($secureBuffer)
14578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR2, 4*16-32($secureBuffer)
14588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, 5*16-32($secureBuffer)
14598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 0*16-32($Htbl), $TMP3
14618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
14628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
14638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
14648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
14658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP0, $TMP0
14668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(1)}
14688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(2)}
14698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(3)}
14708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$schoolbook->(4)}
14718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 5*16-32($secureBuffer), $TMP5
14738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $T, $TMP5, $TMP5
14748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu 5*16-32($Htbl), $TMP4
14758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
14768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP1, $TMP1
14778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
14788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP2, $TMP2
14798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
14808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP0, $TMP0
14818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
14828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP0, $TMP0
14838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpsrldq \$8, $TMP0, $TMP3
14858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP1, $TMP4
14868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpslldq \$8, $TMP0, $TMP3
14878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP3, $TMP2, $T
14888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa poly(%rip), $TMP2
14908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpalignr \$8, $T, $T, $TMP1
14928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP2, $T, $T
14938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $T, $TMP1, $T
14948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpalignr \$8, $T, $T, $TMP1
14968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpclmulqdq \$0x10, $TMP2, $T, $T
14978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $T, $TMP1, $T
14988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
14998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP4, $T, $T
15008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L${labelPrefix}_dec_loop2:
15028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # Here we encrypt any remaining whole block
15038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # if there are no whole blocks
15058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    cmp \$16, $LEN
15068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jb .L${labelPrefix}_dec_out
15078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    sub \$16, $LEN
15088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR, $TMP1
15108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR, $CTR
15118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 0*16($KS), $TMP1, $TMP1
15138ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 1*16($KS), $TMP1, $TMP1
15148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 2*16($KS), $TMP1, $TMP1
15158ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 3*16($KS), $TMP1, $TMP1
15168ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 4*16($KS), $TMP1, $TMP1
15178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 5*16($KS), $TMP1, $TMP1
15188ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 6*16($KS), $TMP1, $TMP1
15198ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 7*16($KS), $TMP1, $TMP1
15208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 8*16($KS), $TMP1, $TMP1
15218ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 9*16($KS), $TMP1, $TMP1
15228ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
15238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanif ($aes256) {
15248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
15258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 10*16($KS), $TMP1, $TMP1
15268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 11*16($KS), $TMP1, $TMP1
15278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 12*16($KS), $TMP1, $TMP1
15288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 13*16($KS), $TMP1, $TMP1
15298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast 14*16($KS), $TMP1, $TMP1
15308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
15318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan} else {
15328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
15338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast 10*16($KS), $TMP1, $TMP1
15348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
15358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan}
15368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan$code.=<<___;
15388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($CT), $TMP1, $TMP1
15398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $TMP1, ($PT)
15408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$16, $CT
15418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$16, $PT
15428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor $TMP1, $T, $T
15448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa -32($Htbl), $TMP0
15458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    call GFMUL
15468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jmp .L${labelPrefix}_dec_loop2
15488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L${labelPrefix}_dec_out:
15508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $T, ($POL)
15518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ret
15528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_endproc
15538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
15548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
15558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  if ($aes256) {
15568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    $code.=<<___;
15578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
15588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
15598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  } else {
15608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    $code.=<<___;
15618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
15628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
15638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  }
15648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan}
15658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Run the decrypt-routine generator once with a false argument.  The
# generator's parameter unpacking is outside this view; presumably the value
# is bound to $aes256, in which case the "128" label prefix, the
# aes128gcmsiv_dec symbol and the 10-round tail are selected.
aesgcmsiv_dec(0);  # emit 128-bit version
15678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Appends to $code the x86-64 routine aes128gcmsiv_ecb_enc_block, which
# encrypts one 16-byte block using an expanded AES-128 key schedule.
#
# Arguments of the generated routine (declared as a 3-argument function):
#   parameter 1: PT            %rdi    (pointer to 128 bit, block to encrypt)
#   parameter 2: CT            %rsi    (pointer to 128 bit, receives result)
#   parameter 3: ks            %rdx    (pointer to ks; round keys 0..10 are
#                                       read at 16-byte strides)
#
# The block is loaded from PT and stored to CT with vmovdqa, the aligned form
# of the move, so both pointers are expected to be 16-byte aligned.  Only
# %xmm1 is used as working state.
#
# The Perl sub takes no arguments; its effect is the text added to $code.
sub aes128gcmsiv_ecb_enc_block {
  my $STATE_1 = "%xmm1";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $KSp = "%rdx";

  # Prologue: symbol boilerplate, load the input block, whitening with
  # round key 0.
  $code .= <<___;
.globl aes128gcmsiv_ecb_enc_block
.type aes128gcmsiv_ecb_enc_block,\@function,3
.align 16
aes128gcmsiv_ecb_enc_block:
.cfi_startproc
    vmovdqa ($PT), $STATE_1

    vpxor       ($KSp), $STATE_1, $STATE_1
___

  # Rounds 1..9 differ only in the round-key index.
  for my $round (1 .. 9) {
    $code .= "    vaesenc $round*16($KSp), $STATE_1, $STATE_1\n";
  }

  # Final round with key 10, store the result, return.
  $code .= <<___;
    vaesenclast 10*16($KSp), $STATE_1, $STATE_1    # STATE_1 == IV

    vmovdqa $STATE_1, ($CT)

    ret
.cfi_endproc
.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
___
}
aes128gcmsiv_ecb_enc_block();
16038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Appends the assembly for aes256gcmsiv_aes_ks_enc_x1 to $code.
#
# The generated routine derives the AES-256 round keys from the 32-byte key
# at KEYp, stores each of them at 16*round(KS), and at the same time runs the
# single block loaded from PT through every round as soon as its key is
# available. The encrypted block is written to CT.
#
# Arguments of the generated routine:
#   parameter 1: %rdi         Pointer to PT1
#   parameter 2: %rsi         Pointer to CT1
#   parameter 3: %rdx         Pointer to KS
#   parameter 4: %rcx         Pointer to initial key
#
# PT, CT and the initial key are accessed with vmovdqa and therefore must be
# 16-byte aligned; KS is written with vmovdqu. The constants con1 and mask
# are referenced %rip-relative and are defined outside this sub.
sub aes256gcmsiv_aes_ks_enc_x1 {
  my $KS = "%rdx";
  my $KEYp = "%rcx";
  my $CON_MASK = "%xmm0";
  my $MASK_256 = "%xmm15";
  my $KEY_1 = "%xmm1";
  my $KEY_2 = "%xmm3";
  my $BLOCK1 = "%xmm8";
  my $AUX_REG = "%xmm14";
  my $PT = "%rdi";
  my $CT = "%rsi";
  # Scratch registers used by the round helpers below.
  my $SUBWORD = "%xmm2";   # output of the vaesenclast-based word substitution
  my $SHIFTED = "%xmm4";   # previous round key shifted left by 4/8/12 bytes

  # Emits two consecutive rounds: updates $KEY_1 (using the round constant in
  # $CON_MASK, which is then doubled) and stores it as round key $i, then
  # updates $KEY_2 (no round constant: $AUX_REG is zero) and stores it as
  # round key $j. $BLOCK1 is advanced by one vaesenc per new key.
  my $round_double = sub {
    my ($i, $j) = @_;
    # Plain arithmetic; no string eval is needed to compute byte offsets.
    my $off_i = 16 * $i;
    my $off_j = 16 * $j;
    return <<___;
    vpshufb $MASK_256, $KEY_2, $SUBWORD
    vaesenclast $CON_MASK, $SUBWORD, $SUBWORD
    vpslld \$1, $CON_MASK, $CON_MASK
    vpslldq \$4, $KEY_1, $SHIFTED
    vpxor $SHIFTED, $KEY_1, $KEY_1
    vpslldq \$4, $SHIFTED, $SHIFTED
    vpxor $SHIFTED, $KEY_1, $KEY_1
    vpslldq \$4, $SHIFTED, $SHIFTED
    vpxor $SHIFTED, $KEY_1, $KEY_1
    vpxor $SUBWORD, $KEY_1, $KEY_1
    vaesenc $KEY_1, $BLOCK1, $BLOCK1
    vmovdqu $KEY_1, ${off_i}($KS)

    vpshufd \$0xff, $KEY_1, $SUBWORD
    vaesenclast $AUX_REG, $SUBWORD, $SUBWORD
    vpslldq \$4, $KEY_2, $SHIFTED
    vpxor $SHIFTED, $KEY_2, $KEY_2
    vpslldq \$4, $SHIFTED, $SHIFTED
    vpxor $SHIFTED, $KEY_2, $KEY_2
    vpslldq \$4, $SHIFTED, $SHIFTED
    vpxor $SHIFTED, $KEY_2, $KEY_2
    vpxor $SUBWORD, $KEY_2, $KEY_2
    vaesenc $KEY_2, $BLOCK1, $BLOCK1
    vmovdqu $KEY_2, ${off_j}($KS)
___
  };

  # Emits the final round: updates $KEY_1 once more, stores it as round key
  # $i and finishes $BLOCK1 with vaesenclast. The round constant is not
  # advanced because no further key is derived.
  my $round_last = sub {
    my ($i) = @_;
    my $off_i = 16 * $i;
    return <<___;
    vpshufb $MASK_256, $KEY_2, $SUBWORD
    vaesenclast $CON_MASK, $SUBWORD, $SUBWORD
    vpslldq \$4, $KEY_1, $SHIFTED
    vpxor $SHIFTED, $KEY_1, $KEY_1
    vpslldq \$4, $SHIFTED, $SHIFTED
    vpxor $SHIFTED, $KEY_1, $KEY_1
    vpslldq \$4, $SHIFTED, $SHIFTED
    vpxor $SHIFTED, $KEY_1, $KEY_1
    vpxor $SUBWORD, $KEY_1, $KEY_1
    vaesenclast $KEY_1, $BLOCK1, $BLOCK1
    vmovdqu $KEY_1, ${off_i}($KS)
___
  };

  $code.=<<___;
.globl aes256gcmsiv_aes_ks_enc_x1
.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes256gcmsiv_aes_ks_enc_x1:
.cfi_startproc
    vmovdqa con1(%rip), $CON_MASK    # CON_MASK  = 1,1,1,1
    vmovdqa mask(%rip), $MASK_256    # MASK_256
    vmovdqa ($PT), $BLOCK1
    vmovdqa ($KEYp), $KEY_1          # KEY_1 || KEY_2 [0..7] = user key
    vmovdqa 16($KEYp), $KEY_2
    vpxor $KEY_1, $BLOCK1, $BLOCK1
    vaesenc $KEY_2, $BLOCK1, $BLOCK1
    vmovdqu $KEY_1, ($KS)            # First round key
    vmovdqu $KEY_2, 16($KS)
    vpxor $AUX_REG, $AUX_REG, $AUX_REG

    ${\$round_double->(2, 3)}
    ${\$round_double->(4, 5)}
    ${\$round_double->(6, 7)}
    ${\$round_double->(8, 9)}
    ${\$round_double->(10, 11)}
    ${\$round_double->(12, 13)}
    ${\$round_last->(14)}
    vmovdqa $BLOCK1, ($CT)
    ret
.cfi_endproc
.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
___
}
aes256gcmsiv_aes_ks_enc_x1();
16988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Appends the assembly for aes256gcmsiv_ecb_enc_block to $code.
#
# The generated routine encrypts a single 16-byte block with an
# already-expanded AES-256 key schedule: it XORs the block with round key 0,
# applies thirteen vaesenc rounds (round keys 1..13) and finishes with
# vaesenclast (round key 14).
#
# Arguments of the generated routine:
#   parameter 1: PT            %rdi    (pointer to 128 bit)
#   parameter 2: CT            %rsi    (pointer to 128 bit)
#   parameter 3: ks            %rdx    (pointer to ks)
#
# PT and CT are accessed with vmovdqa, so both pointers must be 16-byte
# aligned. %xmm1 is used as the working state.
sub aes256gcmsiv_ecb_enc_block {
  my $STATE_1 = "%xmm1";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $KSp = "%rdx";

  # Note: "\$STATE_1" in the trailing assembly comment below is deliberately
  # left interpolated, so the emitted comment names the register.
  $code.=<<___;
.globl aes256gcmsiv_ecb_enc_block
.type aes256gcmsiv_ecb_enc_block,\@function,3
.align 16
aes256gcmsiv_ecb_enc_block:
.cfi_startproc
    vmovdqa ($PT), $STATE_1
    vpxor ($KSp), $STATE_1, $STATE_1
    vaesenc 1*16($KSp), $STATE_1, $STATE_1
    vaesenc 2*16($KSp), $STATE_1, $STATE_1
    vaesenc 3*16($KSp), $STATE_1, $STATE_1
    vaesenc 4*16($KSp), $STATE_1, $STATE_1
    vaesenc 5*16($KSp), $STATE_1, $STATE_1
    vaesenc 6*16($KSp), $STATE_1, $STATE_1
    vaesenc 7*16($KSp), $STATE_1, $STATE_1
    vaesenc 8*16($KSp), $STATE_1, $STATE_1
    vaesenc 9*16($KSp), $STATE_1, $STATE_1
    vaesenc 10*16($KSp), $STATE_1, $STATE_1
    vaesenc 11*16($KSp), $STATE_1, $STATE_1
    vaesenc 12*16($KSp), $STATE_1, $STATE_1
    vaesenc 13*16($KSp), $STATE_1, $STATE_1
    vaesenclast 14*16($KSp), $STATE_1, $STATE_1    # $STATE_1 == IV
    vmovdqa $STATE_1, ($CT)
    ret
.cfi_endproc
.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
___
}
aes256gcmsiv_ecb_enc_block();
17378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Appends the assembly for aes256gcmsiv_enc_msg_x4 to $code.
#
#   void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
#                                unsigned char* TAG, unsigned char* KS,
#                                size_t byte_len);
#   parameter 1: %rdi     #PT
#   parameter 2: %rsi     #CT
#   parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
#   parameter 4: %rcx     #KS
#   parameter 5: %r8      #LEN MSG_length in bytes
#
# What the generated routine does, as visible in the code below:
#   * returns immediately when LEN is zero;
#   * converts LEN to a count of 16-byte blocks, rounding up, so a trailing
#     partial block is read and written as a full 16 bytes;
#   * builds the initial counter from TAG ORed with OR_MASK, and keeps four
#     counters (IV, IV+1, IV+2, IV+3) that advance by four per iteration;
#   * encrypts four counters per iteration with the 15 AES-256 round keys at
#     KS and XORs the result with the plaintext;
#   * handles the remaining 0..3 blocks one at a time, bumping CTR1 by one.
#
# TAG is loaded with vmovdqa and therefore must be 16-byte aligned; PT, CT
# and KS are accessed with instructions that tolerate unaligned addresses.
# %r10 is used as scratch. OR_MASK, one, two, three and four are constants
# referenced %rip-relative and defined outside this sub.
sub aes256gcmsiv_enc_msg_x4 {
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm1";
  my $CTR3 = "%xmm2";
  my $CTR4 = "%xmm3";
  my $ADDER = "%xmm4";

  my $STATE1 = "%xmm5";
  my $STATE2 = "%xmm6";
  my $STATE3 = "%xmm7";
  my $STATE4 = "%xmm8";

  my $TMP = "%xmm12";
  my $IV = "%xmm15";

  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $KS = "%rcx";
  my $LEN = "%r8";

  # Emits one middle AES round over all four states, using round key $i.
  my $aes_round = sub {
    my ($i) = @_;
    # Plain arithmetic; no string eval is needed to compute the byte offset.
    my $offset = $i * 16;
    return <<___;
    vmovdqu ${offset}($KS), $TMP
    vaesenc $TMP, $STATE1, $STATE1
    vaesenc $TMP, $STATE2, $STATE2
    vaesenc $TMP, $STATE3, $STATE3
    vaesenc $TMP, $STATE4, $STATE4
___
  };

  # Emits the final AES round over all four states, using round key $i.
  my $aes_lastround = sub {
    my ($i) = @_;
    my $offset = $i * 16;
    return <<___;
    vmovdqu ${offset}($KS), $TMP
    vaesenclast $TMP, $STATE1, $STATE1
    vaesenclast $TMP, $STATE2, $STATE2
    vaesenclast $TMP, $STATE3, $STATE3
    vaesenclast $TMP, $STATE4, $STATE4
___
  };

  $code.=<<___;
.globl aes256gcmsiv_enc_msg_x4
.type aes256gcmsiv_enc_msg_x4,\@function,5
.align 16
aes256gcmsiv_enc_msg_x4:
.cfi_startproc
    test $LEN, $LEN
    jnz .L256_enc_msg_x4_start
    ret

.L256_enc_msg_x4_start:
    movq $LEN, %r10
    shrq \$4, $LEN                       # LEN = num of blocks
    shlq \$60, %r10
    jz .L256_enc_msg_x4_start2
    addq \$1, $LEN

.L256_enc_msg_x4_start2:
    movq $LEN, %r10
    shlq \$62, %r10
    shrq \$62, %r10

    # make IV from TAG
    vmovdqa ($TAG), $IV
    vpor OR_MASK(%rip), $IV, $IV        # IV = [1]TAG[126...32][00..00]

    vmovdqa four(%rip), $ADDER          # Register to increment counters
    vmovdqa $IV, $CTR1                  # CTR1 = TAG[1][127...32][00..00]
    vpaddd one(%rip), $IV, $CTR2        # CTR2 = TAG[1][127...32][00..01]
    vpaddd two(%rip), $IV, $CTR3        # CTR3 = TAG[1][127...32][00..02]
    vpaddd three(%rip), $IV, $CTR4      # CTR4 = TAG[1][127...32][00..03]

    shrq \$2, $LEN
    je .L256_enc_msg_x4_check_remainder

    subq \$64, $CT
    subq \$64, $PT

.L256_enc_msg_x4_loop1:
    addq \$64, $CT
    addq \$64, $PT

    vmovdqa $CTR1, $STATE1
    vmovdqa $CTR2, $STATE2
    vmovdqa $CTR3, $STATE3
    vmovdqa $CTR4, $STATE4

    vpxor ($KS), $STATE1, $STATE1
    vpxor ($KS), $STATE2, $STATE2
    vpxor ($KS), $STATE3, $STATE3
    vpxor ($KS), $STATE4, $STATE4

    ${\$aes_round->(1)}
    vpaddd $ADDER, $CTR1, $CTR1
    ${\$aes_round->(2)}
    vpaddd $ADDER, $CTR2, $CTR2
    ${\$aes_round->(3)}
    vpaddd $ADDER, $CTR3, $CTR3
    ${\$aes_round->(4)}
    vpaddd $ADDER, $CTR4, $CTR4

    ${\$aes_round->(5)}
    ${\$aes_round->(6)}
    ${\$aes_round->(7)}
    ${\$aes_round->(8)}
    ${\$aes_round->(9)}
    ${\$aes_round->(10)}
    ${\$aes_round->(11)}
    ${\$aes_round->(12)}
    ${\$aes_round->(13)}
    ${\$aes_lastround->(14)}

    # XOR with Plaintext
    vpxor 0*16($PT), $STATE1, $STATE1
    vpxor 1*16($PT), $STATE2, $STATE2
    vpxor 2*16($PT), $STATE3, $STATE3
    vpxor 3*16($PT), $STATE4, $STATE4

    subq \$1, $LEN

    vmovdqu $STATE1, 0*16($CT)
    vmovdqu $STATE2, 1*16($CT)
    vmovdqu $STATE3, 2*16($CT)
    vmovdqu $STATE4, 3*16($CT)

    jne .L256_enc_msg_x4_loop1

    addq \$64, $CT
    addq \$64, $PT

.L256_enc_msg_x4_check_remainder:
    cmpq \$0, %r10
    je .L256_enc_msg_x4_out

.L256_enc_msg_x4_loop2:
    # encrypt each block separately
    # CTR1 is the highest counter (even if no LOOP done)

    vmovdqa $CTR1, $STATE1
    vpaddd one(%rip), $CTR1, $CTR1      # inc counter
    vpxor ($KS), $STATE1, $STATE1
    vaesenc 16($KS), $STATE1, $STATE1
    vaesenc 32($KS), $STATE1, $STATE1
    vaesenc 48($KS), $STATE1, $STATE1
    vaesenc 64($KS), $STATE1, $STATE1
    vaesenc 80($KS), $STATE1, $STATE1
    vaesenc 96($KS), $STATE1, $STATE1
    vaesenc 112($KS), $STATE1, $STATE1
    vaesenc 128($KS), $STATE1, $STATE1
    vaesenc 144($KS), $STATE1, $STATE1
    vaesenc 160($KS), $STATE1, $STATE1
    vaesenc 176($KS), $STATE1, $STATE1
    vaesenc 192($KS), $STATE1, $STATE1
    vaesenc 208($KS), $STATE1, $STATE1
    vaesenclast 224($KS), $STATE1, $STATE1

    # XOR with Plaintext
    vpxor ($PT), $STATE1, $STATE1

    vmovdqu $STATE1, ($CT)

    addq \$16, $PT
    addq \$16, $CT

    subq \$1, %r10
    jne .L256_enc_msg_x4_loop2

.L256_enc_msg_x4_out:
    ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
___
}
aes256gcmsiv_enc_msg_x4();
19258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
19268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloansub aes256gcmsiv_enc_msg_x8() {
19278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE1 = "%xmm1";
19288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE2 = "%xmm2";
19298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE3 = "%xmm3";
19308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE4 = "%xmm4";
19318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE5 = "%xmm5";
19328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE6 = "%xmm6";
19338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE7 = "%xmm7";
19348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $STATE8 = "%xmm8";
19358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR1 = "%xmm0";
19368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR2 = "%xmm9";
19378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR3 = "%xmm10";
19388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR4 = "%xmm11";
19398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR5 = "%xmm12";
19408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR6 = "%xmm13";
19418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CTR7 = "%xmm14";
19428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP1 = "%xmm1";
19438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TMP2 = "%xmm2";
19448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $KS = "%rcx";
19458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $LEN = "%r8";
19468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $PT = "%rdi";
19478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $CT = "%rsi";
19488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $TAG = "%rdx";
19498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $SCHED = "%xmm15";
19508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
19518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $aes_round8 = sub {
19528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    my ($i) = @_;
19538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    return <<___;
19548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu ${\eval($i*16)}($KS), $SCHED
19558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE1, $STATE1
19568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE2, $STATE2
19578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE3, $STATE3
19588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE4, $STATE4
19598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE5, $STATE5
19608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE6, $STATE6
19618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE7, $STATE7
19628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc $SCHED, $STATE8, $STATE8
19638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
19648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  };
19658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
19668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  my $aes_lastround8 = sub {
19678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    my ($i) = @_;
19688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    return <<___;
19698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu ${\eval($i*16)}($KS), $SCHED
19708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE1, $STATE1
19718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE2, $STATE2
19728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE3, $STATE3
19738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE4, $STATE4
19748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE5, $STATE5
19758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE6, $STATE6
19768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE7, $STATE7
19778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast $SCHED, $STATE8, $STATE8
19788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
19798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  };
19808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
19818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  # void ENC_MSG_x8(unsigned char* PT,
19828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  #                 unsigned char* CT,
19838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  #                 unsigned char* TAG,
19848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  #                 unsigned char* KS,
19858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  #                 size_t byte_len);
19868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  # parameter 1: %rdi     #PT
19878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  # parameter 2: %rsi     #CT
19888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  # parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
19898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  # parameter 4: %rcx     #KS
19908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  # parameter 5: %r8      #LEN MSG_length in bytes
19918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan  $code.=<<___;
19928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.globl aes256gcmsiv_enc_msg_x8
19938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.type aes256gcmsiv_enc_msg_x8,\@function,5
19948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.align 16
19958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloanaes256gcmsiv_enc_msg_x8:
19968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_startproc
19978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    test $LEN, $LEN
19988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jnz .L256_enc_msg_x8_start
19998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ret
20008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L256_enc_msg_x8_start:
20028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # Place in stack
20038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    movq %rsp, %r11
20048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$16, %r11
20058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    andq \$-64, %r11
20068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    movq $LEN, %r10
20088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shrq \$4, $LEN                       # LEN = num of blocks
20098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shlq \$60, %r10
20108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jz .L256_enc_msg_x8_start2
20118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$1, $LEN
20128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20138ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L256_enc_msg_x8_start2:
20148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    movq $LEN, %r10
20158ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shlq \$61, %r10
20168ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shrq \$61, %r10
20178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20188ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # Make IV from TAG
20198ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa ($TAG), $TMP1
20208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpor OR_MASK(%rip), $TMP1, $TMP1    # TMP1= IV = [1]TAG[126...32][00..00]
20218ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20228ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # store counter8 on the stack
20238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd seven(%rip), $TMP1, $CTR1
20248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, (%r11)                # CTR8 = TAG[127...32][00..07]
20258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $TMP1, $CTR2       # CTR2 = TAG[127...32][00..01]
20268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd two(%rip), $TMP1, $CTR3       # CTR3 = TAG[127...32][00..02]
20278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd three(%rip), $TMP1, $CTR4     # CTR4 = TAG[127...32][00..03]
20288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd four(%rip), $TMP1, $CTR5      # CTR5 = TAG[127...32][00..04]
20298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd five(%rip), $TMP1, $CTR6      # CTR6 = TAG[127...32][00..05]
20308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd six(%rip), $TMP1, $CTR7       # CTR7 = TAG[127...32][00..06]
20318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $TMP1, $CTR1                 # CTR1 = TAG[127...32][00..00]
20328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    shrq \$3, $LEN
20348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jz .L256_enc_msg_x8_check_remainder
20358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$128, $CT
20378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$128, $PT
20388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L256_enc_msg_x8_loop1:
20408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$128, $CT
20418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$128, $PT
20428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, $STATE1
20448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR2, $STATE2
20458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR3, $STATE3
20468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR4, $STATE4
20478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR5, $STATE5
20488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR6, $STATE6
20498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR7, $STATE7
20508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # move from stack
20518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa (%r11), $STATE8
20528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE1, $STATE1
20548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE2, $STATE2
20558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE3, $STATE3
20568ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE4, $STATE4
20578ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE5, $STATE5
20588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE6, $STATE6
20598ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE7, $STATE7
20608ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE8, $STATE8
20618ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20628ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(1)}
20638ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa (%r11), $CTR7                # deal with CTR8
20648ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR7, $CTR7
20658ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR7, (%r11)
20668ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(2)}
20678ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpsubd one(%rip), $CTR7, $CTR7
20688ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(3)}
20698ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR1, $CTR1
20708ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(4)}
20718ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR2, $CTR2
20728ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(5)}
20738ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR3, $CTR3
20748ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(6)}
20758ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR4, $CTR4
20768ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(7)}
20778ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR5, $CTR5
20788ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(8)}
20798ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd eight(%rip), $CTR6, $CTR6
20808ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(9)}
20818ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(10)}
20828ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(11)}
20838ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(12)}
20848ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_round8->(13)}
20858ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ${\$aes_lastround8->(14)}
20868ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20878ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # XOR with Plaintext
20888ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 0*16($PT), $STATE1, $STATE1
20898ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 1*16($PT), $STATE2, $STATE2
20908ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 2*16($PT), $STATE3, $STATE3
20918ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 3*16($PT), $STATE4, $STATE4
20928ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 4*16($PT), $STATE5, $STATE5
20938ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 5*16($PT), $STATE6, $STATE6
20948ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 6*16($PT), $STATE7, $STATE7
20958ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor 7*16($PT), $STATE8, $STATE8
20968ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20978ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$1, $LEN
20988ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
20998ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE1, 0*16($CT)
21008ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE2, 1*16($CT)
21018ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE3, 2*16($CT)
21028ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE4, 3*16($CT)
21038ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE5, 4*16($CT)
21048ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE6, 5*16($CT)
21058ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE7, 6*16($CT)
21068ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE8, 7*16($CT)
21078ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21088ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jne .L256_enc_msg_x8_loop1
21098ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21108ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$128, $CT
21118ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$128, $PT
21128ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21138ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L256_enc_msg_x8_check_remainder:
21148ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan   cmpq \$0, %r10
21158ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan   je .L256_enc_msg_x8_out
21168ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21178ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L256_enc_msg_x8_loop2:
21188ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # encrypt each block separately
21198ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # CTR1 is the highest counter (even if no LOOP done)
21208ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqa $CTR1, $STATE1
21218ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpaddd one(%rip), $CTR1, $CTR1
21228ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21238ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($KS), $STATE1, $STATE1
21248ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 16($KS), $STATE1, $STATE1
21258ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 32($KS), $STATE1, $STATE1
21268ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 48($KS), $STATE1, $STATE1
21278ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 64($KS), $STATE1, $STATE1
21288ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 80($KS), $STATE1, $STATE1
21298ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 96($KS), $STATE1, $STATE1
21308ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 112($KS), $STATE1, $STATE1
21318ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 128($KS), $STATE1, $STATE1
21328ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 144($KS), $STATE1, $STATE1
21338ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 160($KS), $STATE1, $STATE1
21348ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 176($KS), $STATE1, $STATE1
21358ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 192($KS), $STATE1, $STATE1
21368ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenc 208($KS), $STATE1, $STATE1
21378ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vaesenclast 224($KS), $STATE1, $STATE1
21388ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21398ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    # XOR with Plaintext
21408ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vpxor ($PT), $STATE1, $STATE1
21418ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21428ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    vmovdqu $STATE1, ($CT)
21438ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21448ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$16, $PT
21458ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    addq \$16, $CT
21468ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    subq \$1, %r10
21478ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    jnz .L256_enc_msg_x8_loop2
21488ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21498ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.L256_enc_msg_x8_out:
21508ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan    ret
21518ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
21528ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.cfi_endproc
21538ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
21548ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan___
21558ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan}
# Append the 8-way AES-256-GCM-SIV encryption routine to $code, then run the
# decryption generator, which is defined elsewhere in this file.
# NOTE(review): the argument 1 presumably selects the AES-256 variant -- confirm
# against the definition of aesgcmsiv_dec.
aes256gcmsiv_enc_msg_x8();
aesgcmsiv_dec(1);
21588ff035535f7cf2903f02bbe94d2fa10b7ab855f1Robert Sloan
# Appends the assembly for aes256gcmsiv_kdf to $code.
#
# The emitted routine builds six consecutive counter blocks from the nonce,
# encrypts all six in parallel with the 15 round keys found at (%rdx) (initial
# XOR with key 0, vaesenc for rounds 1..13, vaesenclast for round 14) and
# stores the resulting 6 * 16 = 96 bytes at (%rsi).
#
# Arguments of the emitted routine, following the C prototype further down:
#   parameter 1: %rdi  nonce
#   parameter 2: %rsi  out_key_material
#   parameter 3: %rdx  key_schedule
# Every load and store uses vmovdqa, so all three pointers have to be 16-byte
# aligned.
sub aes256gcmsiv_kdf {
  my $ONE = "%xmm8";
  my $BLOCK1 = "%xmm4";
  my $BLOCK2 = "%xmm6";
  my $BLOCK3 = "%xmm7";
  my $BLOCK4 = "%xmm11";
  my $BLOCK5 = "%xmm12";
  my $BLOCK6 = "%xmm13";

  # Assembly text for one middle AES round over the six blocks.
  #   $round   - round number; the key is loaded from 16 * $round(%rdx)
  #   $key_reg - xmm register that receives the round key
  my $enc_roundx6 = sub {
    my ($round, $key_reg) = @_;
    # Plain arithmetic; no string eval is needed to interpolate the offset.
    my $offset = 16 * $round;
    return <<___;
    vmovdqa ${offset}(%rdx), $key_reg
    vaesenc $key_reg, $BLOCK1, $BLOCK1
    vaesenc $key_reg, $BLOCK2, $BLOCK2
    vaesenc $key_reg, $BLOCK3, $BLOCK3
    vaesenc $key_reg, $BLOCK4, $BLOCK4
    vaesenc $key_reg, $BLOCK5, $BLOCK5
    vaesenc $key_reg, $BLOCK6, $BLOCK6
___
  };

  # Assembly text for the final AES round over the six blocks; same arguments
  # as $enc_roundx6, but every block gets vaesenclast.
  my $enc_roundlastx6 = sub {
    my ($round, $key_reg) = @_;
    # Plain arithmetic; no string eval is needed to interpolate the offset.
    my $offset = 16 * $round;
    return <<___;
    vmovdqa ${offset}(%rdx), $key_reg
    vaesenclast $key_reg, $BLOCK1, $BLOCK1
    vaesenclast $key_reg, $BLOCK2, $BLOCK2
    vaesenclast $key_reg, $BLOCK3, $BLOCK3
    vaesenclast $key_reg, $BLOCK4, $BLOCK4
    vaesenclast $key_reg, $BLOCK5, $BLOCK5
    vaesenclast $key_reg, $BLOCK6, $BLOCK6
___
  };

  # void aes256gcmsiv_kdf(const uint8_t nonce[16],
  #                       uint8_t *out_key_material,
  #                       const uint8_t *key_schedule);
  #
  # Block setup in the code below: vpshufd with selector 0x90 copies source
  # dwords 0,0,1,2 into destination dwords 0..3, and the result is ANDed with
  # and_mask (defined elsewhere; presumably it clears dword 0 so that lane can
  # act as the counter -- confirm).  Blocks 2..6 are each the previous block
  # plus the constant one (also defined elsewhere).  $BLOCK4 doubles as the
  # temporary that holds and_mask before it is overwritten with the fourth
  # block.  The round keys are loaded alternately into %xmm1 and %xmm2.
  $code.=<<___;
.globl aes256gcmsiv_kdf
.type aes256gcmsiv_kdf,\@function,3
.align 16
aes256gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi                         Pointer to NONCE
# parameter 2: %rsi                         Pointer to CT
# parameter 4: %rdx                         Pointer to keys

    vmovdqa (%rdx), %xmm1                  # xmm1 = first 16 bytes of random key
    vmovdqa 0*16(%rdi), $BLOCK1
    vmovdqa and_mask(%rip), $BLOCK4
    vmovdqa one(%rip), $ONE
    vpshufd \$0x90, $BLOCK1, $BLOCK1
    vpand $BLOCK4, $BLOCK1, $BLOCK1
    vpaddd $ONE, $BLOCK1, $BLOCK2
    vpaddd $ONE, $BLOCK2, $BLOCK3
    vpaddd $ONE, $BLOCK3, $BLOCK4
    vpaddd $ONE, $BLOCK4, $BLOCK5
    vpaddd $ONE, $BLOCK5, $BLOCK6

    vpxor %xmm1, $BLOCK1, $BLOCK1
    vpxor %xmm1, $BLOCK2, $BLOCK2
    vpxor %xmm1, $BLOCK3, $BLOCK3
    vpxor %xmm1, $BLOCK4, $BLOCK4
    vpxor %xmm1, $BLOCK5, $BLOCK5
    vpxor %xmm1, $BLOCK6, $BLOCK6

    ${\$enc_roundx6->(1, "%xmm1")}
    ${\$enc_roundx6->(2, "%xmm2")}
    ${\$enc_roundx6->(3, "%xmm1")}
    ${\$enc_roundx6->(4, "%xmm2")}
    ${\$enc_roundx6->(5, "%xmm1")}
    ${\$enc_roundx6->(6, "%xmm2")}
    ${\$enc_roundx6->(7, "%xmm1")}
    ${\$enc_roundx6->(8, "%xmm2")}
    ${\$enc_roundx6->(9, "%xmm1")}
    ${\$enc_roundx6->(10, "%xmm2")}
    ${\$enc_roundx6->(11, "%xmm1")}
    ${\$enc_roundx6->(12, "%xmm2")}
    ${\$enc_roundx6->(13, "%xmm1")}
    ${\$enc_roundlastx6->(14, "%xmm2")}

    vmovdqa $BLOCK1, 0*16(%rsi)
    vmovdqa $BLOCK2, 1*16(%rsi)
    vmovdqa $BLOCK3, 2*16(%rsi)
    vmovdqa $BLOCK4, 3*16(%rsi)
    vmovdqa $BLOCK5, 4*16(%rsi)
    vmovdqa $BLOCK6, 5*16(%rsi)
    ret
.cfi_endproc
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
___
}
# Append the key-derivation routine, then write out everything accumulated in
# $code.
aes256gcmsiv_kdf();

print $code;

# Buffered write errors only surface when the handle is closed, so a failed
# close has to abort the script instead of silently leaving truncated output.
close STDOUT or die "error closing STDOUT: $!";
2257