added AVX-512 support to SPECK cipher (#630)

This commit is contained in:
Logan oos Even 2021-02-08 15:27:29 +05:45 committed by GitHub
parent e31fc5f56a
commit aaca1424fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 241 additions and 8 deletions

View File

@ -42,7 +42,7 @@ In order to run n2n, you will need the following:
- The TAP drivers should be installed into the system. They can be installed from
http://build.openvpn.net/downloads/releases, search for "tap-windows".
- If OpenSSL has been linked dynamically, the corresponding `.dll` file should be available
on the target computer.
@ -113,7 +113,7 @@ So far, the following portions of n2n's code benefit from hardware features:
```
AES: AES-NI
ChaCha20: SSE2, SSSE3
SPECK: SSE2, SSSE3, AVX2, (NEON)
SPECK: SSE2, SSSE3, AVX2, AVX512, (NEON)
Pearson Hashing: AES-NI
Random Numbers: RDSEED, RDRND (not faster but more random seed)
```

View File

@ -60,7 +60,7 @@ ChaCha20 usually performs faster than AES-CTS.
SPECK is recommended by the NSA for official use in case an AES implementation is not feasible due to system constraints (performance, size, …). The block cipher is used in CTR mode, making it a stream cipher. The random full 128-bit IV is transmitted in plaintext.
On modern Intel CPUs, SPECK performs even faster than openSSL's ChaCha20 as it takes advantage of SSE4 or AVX2 if available. On Raspberry's ARM CPU, it is second place behind ChaCha20 and before Twofish.
On modern Intel CPUs, SPECK performs even faster than OpenSSL's ChaCha20 as it takes advantage of SSE4, AVX2, or AVX512 if available. On Raspberry Pi's ARM CPU, it is in second place behind ChaCha20 and ahead of Twofish.
### Random Numbers

View File

@ -39,7 +39,24 @@
#define SPECK_KEY_BYTES (256/8)
#if defined (__AVX2__) // AVX support -----------------------------------------------------------------------------
#if defined (__AVX512F__) // AVX512 support -----------------------------------------------------------------------
#include <immintrin.h>
#include <string.h> /* memcpy() */
/* 512-bit vector register: each holds eight 64-bit words, i.e. one half of
   eight SPECK-128 blocks processed in parallel */
#define u512 __m512i
/* required allocation alignment for speck_context_t (vector member below);
   NOTE(review): loads of ctx->rk rely on this — confirm callers honor it */
#define SPECK_ALIGNED_CTX 64
/* cipher state: one key schedule kept in two layouts */
typedef struct {
/* round keys broadcast into all eight 64-bit lanes, for the vectorized path */
u512 rk[34];
/* the same round keys as scalar words, for the single-block (16-byte) path;
   sized for the 256-bit schedule (34 rounds); 128-bit keys use only 32 */
u64 key[34];
/* key length in bits: 128 or 256 */
u32 keysize;
} speck_context_t;
#elif defined (__AVX2__) // AVX2 support --------------------------------------------------------------------------
#include <immintrin.h>

View File

@ -25,7 +25,224 @@
#include "speck.h"
#if defined (__AVX2__) // AVX support ----------------------------------------------------------------------------
#if defined (__AVX512F__) // AVX512 support ----------------------------------------------------------------------
/* ---- scalar 64-bit rotations (single-block path) ---- */
#define LCS(x,r) (((x)<<r)|((x)>>(64-r)))
#define RCS(x,r) (((x)>>r)|((x)<<(64-r)))

/* ---- AVX-512 primitives: one __m512i pair (X,Y) carries eight blocks ---- */
#define SET _mm512_set_epi64
#define XOR _mm512_xor_si512
#define ADD _mm512_add_epi64
#define AND _mm512_and_si512
#define ROL(X,r) (_mm512_rol_epi64(X,r))
#define ROR(X,r) (_mm512_ror_epi64(X,r))

/* per-lane counter offsets 0..7 (in unpack order) and the stride added when
   advancing all eight lanes to the next batch of blocks */
#define _q8 SET(0x7LL,0x3LL,0x6LL,0x2LL,0x5LL,0x1LL,0x4LL,0x0LL)
#define _eight SET(0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL)

#define SET1(X,c) (X=SET(c,c,c,c,c,c,c,c))
#define SET8(X,c) (X=SET(c,c,c,c,c,c,c,c), X=ADD(X,_q8))

#define LOW _mm512_unpacklo_epi64
#define HIGH _mm512_unpackhi_epi64

/* unaligned load: 'in'/'out' are caller buffers and the scratch block1024[]
   lives on the stack — none carry a 64-byte alignment guarantee, so the
   aligned _mm512_load_epi64 used previously could fault; the store side
   already uses the unaligned storeu variant */
#define LD(ip) (_mm512_loadu_si512(((void *)(ip))))
#define ST(ip,X) _mm512_storeu_si512((void *)(ip),X)

/* interleave the X/Y halves back into block order and store 128 bytes,
   optionally XORing with the corresponding plaintext (CTR keystream apply) */
#define STORE(out,X,Y) (ST(out,LOW(Y,X)), ST(out+64,HIGH(Y,X)))
#define XOR_STORE(in,out,X,Y) (ST(out,XOR(LD(in),LOW(Y,X))), ST(out+64,XOR(LD(in+64),HIGH(Y,X))))

/* one SPECK round over 8/16/24/32 parallel blocks (1/2/3/4 vector pairs) */
#define Rx8(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), \
                    Y[0]=XOR(ROL(Y[0],3),X[0]))
#define Rx16(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), \
                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]))
#define Rx24(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), X[2]=XOR(ADD(ROR(X[2],8),Y[2]),k), \
                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]), Y[2]=XOR(ROL(Y[2],3),X[2]))
#define Rx32(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), \
                     X[2]=XOR(ADD(ROR(X[2],8),Y[2]),k), X[3]=XOR(ADD(ROR(X[3],8),Y[3]),k), \
                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]), \
                     Y[2]=XOR(ROL(Y[2],3),X[2]), Y[3]=XOR(ROL(Y[3],3),X[3]))

/* scalar rounds over one or two blocks */
#define Rx1(x,y,k) (x[0]=RCS(x[0],8), x[0]+=y[0], x[0]^=k, y[0]=LCS(y[0],3), y[0]^=x[0])
#define Rx1b(x,y,k) (x=RCS(x,8), x+=y, x^=k, y=LCS(y,3), y^=x)
#define Rx2(x,y,k) (x[0]=RCS(x[0],8), x[1]=RCS(x[1],8), x[0]+=y[0], x[1]+=y[1], \
                    x[0]^=k, x[1]^=k, y[0]=LCS(y[0],3), y[1]=LCS(y[1],3), y[0]^=x[0], y[1]^=x[1])

/* full encryption: 32 rounds for 128-bit keys, 34 for 256-bit keys;
   n selects the round width (1/2 scalar blocks or 8/16/24/32 vector blocks) */
#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \
                              Rx##n(X,Y,k[8]), Rx##n(X,Y,k[9]), Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \
                              Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \
                              Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]))
#define Encrypt_256(X,Y,k,n) (Encrypt_128(X,Y,k,n), \
                              Rx##n(X,Y,k[32]), Rx##n(X,Y,k[33]))

/* key-schedule round: emit round key i into both the broadcast (k) and scalar
   (key) tables, then advance the key state one SPECK round with counter i */
#define RK(X,Y,k,key,i) (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X)
#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \
                           RK(C,A,k,key,7), RK(D,A,k,key,8), RK(B,A,k,key,9), RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \
                           RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \
                           RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \
                           RK(C,A,k,key,28), RK(D,A,k,key,29), RK(B,A,k,key,30), RK(C,A,k,key,31), RK(D,A,k,key,32), RK(B,A,k,key,33))

/* CTR worker body, expanded inside speck_encrypt_xor; expects out, in, nonce,
   ctx and numbytes in scope.  numbytes == 16 writes RAW keystream to out
   (no XOR with in) — the caller applies the XOR itself. */
#define Encrypt_Dispatcher(keysize) \
    u64 x[2], y[2]; \
    u512 X[4], Y[4]; \
    unsigned char block1024[128]; \
    \
    if(numbytes == 16) { \
        x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++; \
        Encrypt_##keysize(x, y, ctx->key, 1); \
        ((u64 *)out)[1] = x[0]; ((u64 *)out)[0] = y[0]; \
        return 0; \
    } \
    \
    if(numbytes == 32) { \
        x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++; \
        x[1] = nonce[1]; y[1] = nonce[0]; nonce[0]++; \
        Encrypt_##keysize(x, y, ctx->key, 2); \
        ((u64 *)out)[1] = x[0] ^ ((u64 *)in)[1]; ((u64 *)out)[0] = y[0] ^ ((u64 *)in)[0]; \
        ((u64 *)out)[3] = x[1] ^ ((u64 *)in)[3]; ((u64 *)out)[2] = y[1] ^ ((u64 *)in)[2]; \
        return 0; \
    } \
    \
    if(numbytes == 64) { \
        SET1(X[0], nonce[1]); \
        SET8(Y[0], nonce[0]); \
        Encrypt_##keysize(X, Y, ctx->rk, 8); \
        nonce[0] += (numbytes >> 4); \
        memcpy(block1024, in, 64); \
        XOR_STORE(block1024, block1024, X[0], Y[0]); \
        memcpy(out, block1024, 64); \
        return 0; \
    } \
    \
    SET1(X[0], nonce[1]); SET8(Y[0], nonce[0]); \
    \
    if(numbytes == 128) \
        Encrypt_##keysize(X, Y, ctx->rk, 8); \
    else { \
        X[1] = X[0]; \
        Y[1] = ADD(Y[0], _eight); \
        if(numbytes == 256) \
            Encrypt_##keysize(X, Y, ctx->rk, 16); \
        else { \
            X[2] = X[0]; \
            Y[2] = ADD(Y[1], _eight); \
            if(numbytes == 384) \
                Encrypt_##keysize(X, Y, ctx->rk, 24); \
            else { \
                X[3] = X[0]; \
                Y[3] = ADD(Y[2], _eight); \
                Encrypt_##keysize(X, Y, ctx->rk, 32); \
            } \
        } \
    } \
    \
    nonce[0] += (numbytes >> 4); \
    \
    XOR_STORE(in, out, X[0], Y[0]); \
    if (numbytes >= 256) \
        XOR_STORE(in + 128, out + 128, X[1], Y[1]); \
    if(numbytes >= 384) \
        XOR_STORE(in + 256, out + 256, X[2], Y[2]); \
    if(numbytes >= 512) \
        XOR_STORE(in + 384, out + 384, X[3], Y[3]); \
    \
    return 0
/* Generates keystream for 'numbytes' (16/32/64/128/256/384/512) bytes and,
 * except for numbytes == 16 (raw keystream, caller XORs), XORs it with 'in'
 * into 'out'. Advances nonce[0] by one per 16-byte block. Dispatches on the
 * scheduled key size; both macro branches end in 'return 0', which is why
 * there is no trailing return here. */
static int speck_encrypt_xor(unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t *ctx, int numbytes) {
/* any keysize other than 256 is treated as 128 */
if(ctx->keysize == 256) {
Encrypt_Dispatcher(256);
} else {
Encrypt_Dispatcher(128);
}
}
/* SPECK-CTR en-/decryption of 'inlen' bytes from 'in' to 'out' (may alias).
 * 'n' is the 16-byte IV/initial counter, 'ctx' the expanded key schedule.
 * Works down through the dispatcher's supported chunk sizes, then handles a
 * final partial block byte-wise. Returns 0.
 *
 * Fix vs. previous revision: the (u64 *) casts on n/in/out/block were
 * strict-aliasing violations and unaligned accesses (UB); all word-level
 * loads/stores now go through memcpy, which compilers fold to plain moves. */
static int internal_speck_ctr(unsigned char *out, const unsigned char *in, unsigned long long inlen,
                              const unsigned char *n, speck_context_t *ctx) {

    unsigned long long i;
    u64 nonce[2];
    u64 ks[2];                 /* keystream words for the 16-byte chunk */
    u64 w[2];                  /* plaintext words for the 16-byte chunk */
    unsigned char block[16];   /* raw keystream for the <=16-byte tail */

    if(!inlen)
        return 0;

    memcpy(nonce, n, 16);      /* no alignment guarantee on n */

    while(inlen >= 512) {
        speck_encrypt_xor(out, in, nonce, ctx, 512);
        in += 512; inlen -= 512; out += 512;
    }

    if(inlen >= 384) {
        speck_encrypt_xor(out, in, nonce, ctx, 384);
        in += 384; inlen -= 384; out += 384;
    }

    if(inlen >= 256) {
        speck_encrypt_xor(out, in, nonce, ctx, 256);
        in += 256; inlen -= 256; out += 256;
    }

    if(inlen >= 128) {
        speck_encrypt_xor(out, in, nonce, ctx, 128);
        in += 128; inlen -= 128; out += 128;
    }

    if(inlen >= 64) {
        speck_encrypt_xor(out, in, nonce, ctx, 64);
        in += 64; inlen -= 64; out += 64;
    }

    if(inlen >= 32) {
        speck_encrypt_xor(out, in, nonce, ctx, 32);
        in += 32; inlen -= 32; out += 32;
    }

    if(inlen >= 16) {
        /* numbytes == 16 yields raw keystream; apply the XOR here */
        speck_encrypt_xor(block, in, nonce, ctx, 16);
        memcpy(ks, block, 16);
        memcpy(w, in, 16);
        w[0] ^= ks[0];
        w[1] ^= ks[1];
        memcpy(out, w, 16);
        in += 16; inlen -= 16; out += 16;
    }

    if(inlen > 0) {
        /* final partial block: XOR byte-wise against fresh keystream */
        speck_encrypt_xor(block, in, nonce, ctx, 16);
        for(i = 0; i < inlen; i++)
            out[i] = block[i] ^ in[i];
    }

    return 0;
}
/* Expands the raw key bytes 'k' (keysize bits, 128 or 256) into ctx's vector
 * and scalar round-key tables via the EK key-schedule macro, and records the
 * key size for later dispatch. Returns 0.
 *
 * Fix vs. previous revision: ((u64 *)k)[i] was a strict-aliasing violation
 * and a potentially misaligned read of the caller's key buffer; the key
 * words are now loaded with memcpy. */
static int speck_expand_key (speck_context_t *ctx, const unsigned char *k, int keysize) {

    u64 K[4] = { 0 };
    size_t i;

    /* keysize >> 6 == number of 64-bit key words (2 or 4) */
    for(i = 0; i < ((size_t)keysize >> 6); i++)
        memcpy(&K[i], k + (i * 8), 8);

    // 128 bit has only two keys A and B thus replacing both C and D with B then
    if(keysize == 128) {
        EK(K[0], K[1], K[1], K[1], ctx->rk, ctx->key);
    } else {
        EK(K[0], K[1], K[2], K[3], ctx->rk, ctx->key);
    }

    ctx->keysize = keysize;

    return 0;
}
#elif defined (__AVX2__) // AVX2 support -------------------------------------------------------------------------
#define LCS(x,r) (((x)<<r)|((x)>>(64-r)))
@ -80,8 +297,7 @@
#define Rx2(x,y,k) (x[0]=RCS(x[0],8), x[1]=RCS(x[1],8), x[0]+=y[0], x[1]+=y[1], \
x[0]^=k, x[1]^=k, y[0]=LCS(y[0],3), y[1]=LCS(y[1],3), y[0]^=x[0], y[1]^=x[1])
#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \
#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \
Rx##n(X,Y,k[8]), Rx##n(X,Y,k[9]), Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \
Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \
Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]))
@ -91,7 +307,7 @@
#define RK(X,Y,k,key,i) (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X)
#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \
#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \
RK(C,A,k,key,7), RK(D,A,k,key,8), RK(B,A,k,key,9), RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \
RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \
RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \