From aaca1424fdce281e954cac07d07176a4dc0af75a Mon Sep 17 00:00:00 2001
From: Logan oos Even <46396513+Logan007@users.noreply.github.com>
Date: Mon, 8 Feb 2021 15:27:29 +0545
Subject: [PATCH] added AVX-512 support to SPECK cipher (#630)

---
 doc/Building.md |   4 +-
 doc/Crypto.md   |   2 +-
 include/speck.h |  19 +++-
 src/speck.c     | 224 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 241 insertions(+), 8 deletions(-)

diff --git a/doc/Building.md b/doc/Building.md
index 2d14a70..f837b88 100644
--- a/doc/Building.md
+++ b/doc/Building.md
@@ -42,7 +42,7 @@ In order to run n2n, you will need the following:
 
 - The TAP drivers should be installed into the system. They can be installed from
   http://build.openvpn.net/downloads/releases, search for "tap-windows".
-  
+
 - If OpenSSL has been linked dynamically, the corresponding `.dll` file should be available
   onto the target computer.
 
@@ -113,7 +113,7 @@ So far, the following portions of n2n's code benefit from hardware features:
 ```
 AES:               AES-NI
 ChaCha20:          SSE2, SSSE3
-SPECK:             SSE2, SSSE3, AVX2, (NEON)
+SPECK:             SSE2, SSSE3, AVX2, AVX512, (NEON)
 Pearson Hashing:   AES-NI
 Random Numbers:    RDSEED, RDRND (not faster but more random seed)
 ```
diff --git a/doc/Crypto.md b/doc/Crypto.md
index 430301b..798cb05 100644
--- a/doc/Crypto.md
+++ b/doc/Crypto.md
@@ -60,7 +60,7 @@ ChaCha20 usually performs faster than AES-CTS.
 
 SPECK is recommended by the NSA for offical use in case AES implementation is not feasible due to system constraints (performance, size, …). The block cipher is used in CTR mode making it a stream cipher. The random full 128-bit IV is transmitted in plain.
 
-On modern Intel CPUs, SPECK performs even faster than openSSL's ChaCha20 as it takes advantage of SSE4 or AVX2 if available. On Raspberry's ARM CPU, it is second place behind ChaCha20 and before Twofish.
+On modern Intel CPUs, SPECK performs even faster than openSSL's ChaCha20 as it takes advantage of SSE4, AVX2, or AVX512 if available. On Raspberry's ARM CPU, it is second place behind ChaCha20 and before Twofish.
 
 ### Random Numbers
 
diff --git a/include/speck.h b/include/speck.h
index ad1fbe3..1c19f90 100644
--- a/include/speck.h
+++ b/include/speck.h
@@ -39,7 +39,24 @@
 #define SPECK_KEY_BYTES       (256/8)
 
 
-#if defined (__AVX2__) // AVX support -----------------------------------------------------------------------------
+#if defined (__AVX512F__) // AVX512 support -----------------------------------------------------------------------
+
+
+#include <immintrin.h>
+#include <string.h>    /* memcpy() */
+
+#define u512 __m512i
+
+#define SPECK_ALIGNED_CTX	64
+
+typedef struct {
+    u512 rk[34];
+    u64 key[34];
+    u32 keysize;
+} speck_context_t;
+
+
+#elif defined (__AVX2__) // AVX2 support --------------------------------------------------------------------------
 
 
 #include <immintrin.h>
diff --git a/src/speck.c b/src/speck.c
index f92b5e0..9666607 100644
--- a/src/speck.c
+++ b/src/speck.c
@@ -25,7 +25,224 @@
 #include "speck.h"
 
 
-#if defined (__AVX2__)  // AVX support ----------------------------------------------------------------------------
+#if defined (__AVX512F__)  // AVX512 support ----------------------------------------------------------------------
+
+
+#define LCS(x,r) (((x)<<r)|((x)>>(64-r)))
+#define RCS(x,r) (((x)>>r)|((x)<<(64-r)))
+
+#define SET _mm512_set_epi64
+#define XOR _mm512_xor_si512
+#define ADD _mm512_add_epi64
+#define AND _mm512_and_si512
+#define ROL(X,r) (_mm512_rol_epi64(X,r))
+#define ROR(X,r) (_mm512_ror_epi64(X,r))
+
+#define _q8 SET(0x7LL,0x3LL,0x6LL,0x2LL,0x5LL,0x1LL,0x4LL,0x0LL)
+#define _eight SET(0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL)
+
+#define SET1(X,c) (X=SET(c,c,c,c,c,c,c,c))
+#define SET8(X,c) (X=SET(c,c,c,c,c,c,c,c), X=ADD(X,_q8))
+
+#define LOW  _mm512_unpacklo_epi64
+#define HIGH _mm512_unpackhi_epi64
+#define LD(ip) (_mm512_load_epi64(((void *)(ip))))
+#define ST(ip,X) _mm512_storeu_si512((void *)(ip),X)
+#define STORE(out,X,Y) (ST(out,LOW(Y,X)), ST(out+64,HIGH(Y,X)))
+#define XOR_STORE(in,out,X,Y) (ST(out,XOR(LD(in),LOW(Y,X))), ST(out+64,XOR(LD(in+64),HIGH(Y,X))))
+
+#define Rx8(X,Y,k)  (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), \
+                     Y[0]=XOR(ROL(Y[0],3),X[0]))
+#define Rx16(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), \
+                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]))
+#define Rx24(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), X[2]=XOR(ADD(ROR(X[2],8),Y[2]),k), \
+                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]), Y[2]=XOR(ROL(Y[2],3),X[2]))
+#define Rx32(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), \
+                     X[2]=XOR(ADD(ROR(X[2],8),Y[2]),k), X[3]=XOR(ADD(ROR(X[3],8),Y[3]),k), \
+                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]),               \
+                     Y[2]=XOR(ROL(Y[2],3),X[2]), Y[3]=XOR(ROL(Y[3],3),X[3]))
+
+#define Rx1(x,y,k)  (x[0]=RCS(x[0],8), x[0]+=y[0], x[0]^=k, y[0]=LCS(y[0],3), y[0]^=x[0])
+#define Rx1b(x,y,k) (x=RCS(x,8), x+=y, x^=k, y=LCS(y,3), y^=x)
+#define Rx2(x,y,k)  (x[0]=RCS(x[0],8), x[1]=RCS(x[1],8), x[0]+=y[0], x[1]+=y[1],                   \
+                     x[0]^=k, x[1]^=k, y[0]=LCS(y[0],3), y[1]=LCS(y[1],3), y[0]^=x[0], y[1]^=x[1])
+
+#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]),  Rx##n(X,Y,k[1]),  Rx##n(X,Y,k[2]),  Rx##n(X,Y,k[3]),  Rx##n(X,Y,k[4]),  Rx##n(X,Y,k[5]),  Rx##n(X,Y,k[6]),  Rx##n(X,Y,k[7]),  \
+                              Rx##n(X,Y,k[8]),  Rx##n(X,Y,k[9]),  Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \
+                              Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \
+                              Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]))
+
+#define Encrypt_256(X,Y,k,n) (Encrypt_128(X,Y,k,n),               \
+                              Rx##n(X,Y,k[32]), Rx##n(X,Y,k[33]))
+
+#define RK(X,Y,k,key,i)   (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X)
+
+#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0),  RK(C,A,k,key,1),  RK(D,A,k,key,2),  RK(B,A,k,key,3),  RK(C,A,k,key,4),  RK(D,A,k,key,5),  RK(B,A,k,key,6),  \
+                           RK(C,A,k,key,7),  RK(D,A,k,key,8),  RK(B,A,k,key,9),  RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \
+                           RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \
+                           RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \
+                           RK(C,A,k,key,28), RK(D,A,k,key,29), RK(B,A,k,key,30), RK(C,A,k,key,31), RK(D,A,k,key,32), RK(B,A,k,key,33))
+
+#define Encrypt_Dispatcher(keysize)                                                       \
+    u64  x[2], y[2];                                                                      \
+    u512 X[4], Y[4];                                                                      \
+    unsigned char block1024[128];                                                         \
+                                                                                          \
+    if(numbytes == 16) {                                                                  \
+        x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++;                                     \
+        Encrypt_##keysize(x, y, ctx->key, 1);                                             \
+        ((u64 *)out)[1] = x[0]; ((u64 *)out)[0] = y[0];                                   \
+        return 0;                                                                         \
+    }                                                                                     \
+                                                                                          \
+    if(numbytes == 32) {                                                                  \
+        x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++;                                     \
+        x[1] = nonce[1]; y[1] = nonce[0]; nonce[0]++;                                     \
+        Encrypt_##keysize(x, y, ctx->key, 2);                                             \
+        ((u64 *)out)[1] = x[0] ^ ((u64 *)in)[1]; ((u64 *)out)[0] = y[0] ^ ((u64 *)in)[0]; \
+        ((u64 *)out)[3] = x[1] ^ ((u64 *)in)[3]; ((u64 *)out)[2] = y[1] ^ ((u64 *)in)[2]; \
+        return 0;                                                                         \
+    }                                                                                     \
+                                                                                          \
+    if(numbytes == 64) {                                                                  \
+        SET1(X[0], nonce[1]);                                                             \
+        SET8(Y[0], nonce[0]);                                                             \
+        Encrypt_##keysize(X, Y, ctx->rk, 8);                                              \
+        nonce[0] += (numbytes >> 4);                                                      \
+        memcpy(block1024, in, 64);                                                        \
+        XOR_STORE(block1024, block1024, X[0], Y[0]);                                      \
+        memcpy(out, block1024, 64);                                                       \
+        return 0;                                                                         \
+    }                                                                                     \
+                                                                                          \
+    SET1(X[0], nonce[1]); SET8(Y[0], nonce[0]);                                           \
+                                                                                          \
+    if(numbytes == 128)                                                                   \
+        Encrypt_##keysize(X, Y, ctx->rk, 8);                                              \
+    else {                                                                                \
+        X[1] = X[0];                                                                      \
+        Y[1] = ADD(Y[0], _eight);                                                         \
+        if(numbytes == 256)                                                               \
+            Encrypt_##keysize(X, Y, ctx->rk, 16);                                         \
+        else {                                                                            \
+            X[2] = X[0];                                                                  \
+            Y[2] = ADD(Y[1], _eight);                                                     \
+            if(numbytes == 384)                                                           \
+                Encrypt_##keysize(X, Y, ctx->rk, 24);                                     \
+            else {                                                                        \
+                X[3] = X[0];                                                              \
+                Y[3] = ADD(Y[2], _eight);                                                 \
+                Encrypt_##keysize(X, Y, ctx->rk, 32);                                     \
+            }                                                                             \
+        }                                                                                 \
+    }                                                                                     \
+                                                                                          \
+    nonce[0] += (numbytes >> 4);                                                          \
+                                                                                          \
+    XOR_STORE(in, out, X[0], Y[0]);                                                       \
+    if (numbytes >= 256)                                                                  \
+        XOR_STORE(in + 128, out + 128, X[1], Y[1]);                                       \
+    if(numbytes >= 384)                                                                   \
+        XOR_STORE(in + 256, out + 256, X[2], Y[2]);                                       \
+    if(numbytes >= 512)                                                                   \
+        XOR_STORE(in + 384, out + 384, X[3], Y[3]);                                       \
+                                                                                          \
+    return 0
+
+
+static int speck_encrypt_xor(unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t *ctx, int numbytes) {
+
+    if(ctx->keysize == 256) {
+        Encrypt_Dispatcher(256);
+    } else {
+        Encrypt_Dispatcher(128);
+    }
+}
+
+
+static int internal_speck_ctr(unsigned char *out, const unsigned char *in, unsigned long long inlen,
+                              const unsigned char *n, speck_context_t *ctx) {
+
+    int i;
+    u64 nonce[2];
+    unsigned char block[16];
+    u64 * const block64 = (u64 *)block;
+
+    if (!inlen)
+        return 0;
+
+    nonce[0] = ((u64 *)n)[0];
+    nonce[1] = ((u64 *)n)[1];
+
+    while(inlen >= 512) {
+        speck_encrypt_xor(out, in, nonce, ctx, 512);
+        in += 512; inlen -= 512; out += 512;
+    }
+
+    if(inlen >= 384) {
+        speck_encrypt_xor(out, in, nonce, ctx, 384);
+        in += 384; inlen -= 384; out += 384;
+    }
+
+    if(inlen >= 256) {
+        speck_encrypt_xor(out, in, nonce, ctx, 256);
+        in += 256; inlen -= 256; out += 256;
+    }
+
+    if(inlen >= 128) {
+        speck_encrypt_xor(out, in, nonce, ctx, 128);
+        in += 128; inlen -= 128; out += 128;
+    }
+
+    if(inlen >= 64) {
+        speck_encrypt_xor(out, in, nonce, ctx, 64);
+        in += 64; inlen -= 64; out += 64;
+    }
+
+    if(inlen >= 32) {
+        speck_encrypt_xor(out, in, nonce, ctx, 32);
+        in += 32; inlen -= 32; out += 32;
+    }
+
+    if(inlen >= 16) {
+        speck_encrypt_xor(block, in, nonce, ctx, 16);
+        ((u64 *)out)[0] = block64[0] ^ ((u64 *)in)[0];
+        ((u64 *)out)[1] = block64[1] ^ ((u64 *)in)[1];
+        in += 16; inlen -= 16; out += 16;
+    }
+
+    if(inlen > 0) {
+        speck_encrypt_xor(block, in, nonce, ctx, 16);
+        for(i = 0; i < inlen; i++)
+            out[i] = block[i] ^ in[i];
+    }
+
+    return 0;
+}
+
+
+static int speck_expand_key (speck_context_t *ctx, const unsigned char *k, int keysize) {
+
+    u64 K[4];
+    size_t i;
+
+    for(i = 0; i < (keysize >> 6); i++)
+        K[i] = ((u64 *)k)[i];
+
+    // 128 bit has only two keys A and B thus replacing both C and D with B then
+    if(keysize == 128) {
+        EK(K[0], K[1], K[1], K[1], ctx->rk, ctx->key);
+    } else {
+        EK(K[0], K[1], K[2], K[3], ctx->rk, ctx->key);
+    }
+
+    ctx->keysize = keysize;
+
+    return 0;
+}
+
+
+#elif defined (__AVX2__)  // AVX2 support -------------------------------------------------------------------------
 
 
 #define LCS(x,r) (((x)<<r)|((x)>>(64-r)))
@@ -80,8 +297,7 @@
 #define Rx2(x,y,k)  (x[0]=RCS(x[0],8), x[1]=RCS(x[1],8), x[0]+=y[0],       x[1]+=y[1], \
                      x[0]^=k,          x[1]^=k,          y[0]=LCS(y[0],3), y[1]=LCS(y[1],3), y[0]^=x[0], y[1]^=x[1])
 
-
-#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]),  Rx##n(X,Y,k[1]),  Rx##n(X,Y,k[2]),  Rx##n(X,Y,k[3]),  Rx##n(X,Y,k[4]),  Rx##n(X,Y,k[5]),  Rx##n(X,Y,k[6]),  Rx##n(X,Y,k[7]), \
+#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]),  Rx##n(X,Y,k[1]),  Rx##n(X,Y,k[2]),  Rx##n(X,Y,k[3]),  Rx##n(X,Y,k[4]),  Rx##n(X,Y,k[5]),  Rx##n(X,Y,k[6]),  Rx##n(X,Y,k[7]),  \
                               Rx##n(X,Y,k[8]),  Rx##n(X,Y,k[9]),  Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \
                               Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \
                               Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]))
@@ -91,7 +307,7 @@
 
 #define RK(X,Y,k,key,i)   (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X)
 
-#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0),  RK(C,A,k,key,1),  RK(D,A,k,key,2),  RK(B,A,k,key,3),  RK(C,A,k,key,4),  RK(D,A,k,key,5),  RK(B,A,k,key,6), \
+#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0),  RK(C,A,k,key,1),  RK(D,A,k,key,2),  RK(B,A,k,key,3),  RK(C,A,k,key,4),  RK(D,A,k,key,5),  RK(B,A,k,key,6),  \
                            RK(C,A,k,key,7),  RK(D,A,k,key,8),  RK(B,A,k,key,9),  RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \
                            RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \
                            RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \