added AVX-512 support to SPECK cipher (#630)

This commit is contained in:
Logan oos Even 2021-02-08 15:27:29 +05:45 committed by GitHub
parent e31fc5f56a
commit aaca1424fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 241 additions and 8 deletions

View File

@ -42,7 +42,7 @@ In order to run n2n, you will need the following:
- The TAP drivers should be installed into the system. They can be installed from
http://build.openvpn.net/downloads/releases, search for "tap-windows".
- If OpenSSL has been linked dynamically, the corresponding `.dll` file should be available
on the target computer.
@ -113,7 +113,7 @@ So far, the following portions of n2n's code benefit from hardware features:
```
AES: AES-NI
ChaCha20: SSE2, SSSE3
SPECK: SSE2, SSSE3, AVX2, (NEON)
SPECK: SSE2, SSSE3, AVX2, AVX512, (NEON)
Pearson Hashing: AES-NI
Random Numbers: RDSEED, RDRND (not faster but more random seed)
```

View File

@ -60,7 +60,7 @@ ChaCha20 usually performs faster than AES-CTS.
SPECK is recommended by the NSA for official use in case an AES implementation is not feasible due to system constraints (performance, size, …). The block cipher is used in CTR mode, making it a stream cipher. The random full 128-bit IV is transmitted in plaintext.
On modern Intel CPUs, SPECK performs even faster than openSSL's ChaCha20 as it takes advantage of SSE4 or AVX2 if available. On Raspberry's ARM CPU, it is second place behind ChaCha20 and before Twofish.
On modern Intel CPUs, SPECK performs even faster than OpenSSL's ChaCha20 as it takes advantage of SSE4, AVX2, or AVX512 if available. On Raspberry Pi's ARM CPU, it is in second place behind ChaCha20 and ahead of Twofish.
### Random Numbers

View File

@ -39,7 +39,24 @@
#define SPECK_KEY_BYTES (256/8)
#if defined (__AVX2__) // AVX support -----------------------------------------------------------------------------
#if defined (__AVX512F__) // AVX512 support -----------------------------------------------------------------------
#include <immintrin.h>
#include <string.h> /* memcpy() */
/* 512-bit vector register: each holds eight 64-bit words, i.e. one half of
   eight SPECK-128 blocks processed in parallel */
#define u512 __m512i
/* required allocation alignment for speck_context_t (vector member below);
   NOTE(review): loads of ctx->rk rely on this — confirm callers honor it */
#define SPECK_ALIGNED_CTX 64
/* cipher state: one key schedule kept in two layouts */
typedef struct {
/* round keys broadcast into all eight 64-bit lanes, for the vectorized path */
u512 rk[34];
/* the same round keys as scalar words, for the single-block (16-byte) path;
   sized for the 256-bit schedule (34 rounds); 128-bit keys use only 32 */
u64 key[34];
/* key length in bits: 128 or 256 */
u32 keysize;
} speck_context_t;
#elif defined (__AVX2__) // AVX2 support --------------------------------------------------------------------------
#include <immintrin.h>

View File

@ -25,7 +25,224 @@
#include "speck.h"
#if defined (__AVX2__) // AVX support ----------------------------------------------------------------------------
#if defined (__AVX512F__) // AVX512 support ----------------------------------------------------------------------
/* ---- scalar 64-bit rotations (single-block path) ---- */
#define LCS(x,r) (((x)<<r)|((x)>>(64-r)))
#define RCS(x,r) (((x)>>r)|((x)<<(64-r)))

/* ---- AVX-512 primitives: one __m512i pair (X,Y) carries eight blocks ---- */
#define SET _mm512_set_epi64
#define XOR _mm512_xor_si512
#define ADD _mm512_add_epi64
#define AND _mm512_and_si512
#define ROL(X,r) (_mm512_rol_epi64(X,r))
#define ROR(X,r) (_mm512_ror_epi64(X,r))

/* per-lane counter offsets 0..7 (in unpack order) and the stride added when
   advancing all eight lanes to the next batch of blocks */
#define _q8 SET(0x7LL,0x3LL,0x6LL,0x2LL,0x5LL,0x1LL,0x4LL,0x0LL)
#define _eight SET(0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL,0x8LL)

#define SET1(X,c) (X=SET(c,c,c,c,c,c,c,c))
#define SET8(X,c) (X=SET(c,c,c,c,c,c,c,c), X=ADD(X,_q8))

#define LOW _mm512_unpacklo_epi64
#define HIGH _mm512_unpackhi_epi64

/* unaligned load: 'in'/'out' are caller buffers and the scratch block1024[]
   lives on the stack — none carry a 64-byte alignment guarantee, so the
   aligned _mm512_load_epi64 used previously could fault; the store side
   already uses the unaligned storeu variant */
#define LD(ip) (_mm512_loadu_si512(((void *)(ip))))
#define ST(ip,X) _mm512_storeu_si512((void *)(ip),X)

/* interleave the X/Y halves back into block order and store 128 bytes,
   optionally XORing with the corresponding plaintext (CTR keystream apply) */
#define STORE(out,X,Y) (ST(out,LOW(Y,X)), ST(out+64,HIGH(Y,X)))
#define XOR_STORE(in,out,X,Y) (ST(out,XOR(LD(in),LOW(Y,X))), ST(out+64,XOR(LD(in+64),HIGH(Y,X))))

/* one SPECK round over 8/16/24/32 parallel blocks (1/2/3/4 vector pairs) */
#define Rx8(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), \
                    Y[0]=XOR(ROL(Y[0],3),X[0]))
#define Rx16(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), \
                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]))
#define Rx24(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), X[2]=XOR(ADD(ROR(X[2],8),Y[2]),k), \
                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]), Y[2]=XOR(ROL(Y[2],3),X[2]))
#define Rx32(X,Y,k) (X[0]=XOR(ADD(ROR(X[0],8),Y[0]),k), X[1]=XOR(ADD(ROR(X[1],8),Y[1]),k), \
                     X[2]=XOR(ADD(ROR(X[2],8),Y[2]),k), X[3]=XOR(ADD(ROR(X[3],8),Y[3]),k), \
                     Y[0]=XOR(ROL(Y[0],3),X[0]), Y[1]=XOR(ROL(Y[1],3),X[1]), \
                     Y[2]=XOR(ROL(Y[2],3),X[2]), Y[3]=XOR(ROL(Y[3],3),X[3]))

/* scalar rounds over one or two blocks */
#define Rx1(x,y,k) (x[0]=RCS(x[0],8), x[0]+=y[0], x[0]^=k, y[0]=LCS(y[0],3), y[0]^=x[0])
#define Rx1b(x,y,k) (x=RCS(x,8), x+=y, x^=k, y=LCS(y,3), y^=x)
#define Rx2(x,y,k) (x[0]=RCS(x[0],8), x[1]=RCS(x[1],8), x[0]+=y[0], x[1]+=y[1], \
                    x[0]^=k, x[1]^=k, y[0]=LCS(y[0],3), y[1]=LCS(y[1],3), y[0]^=x[0], y[1]^=x[1])

/* full encryption: 32 rounds for 128-bit keys, 34 for 256-bit keys;
   n selects the round width (1/2 scalar blocks or 8/16/24/32 vector blocks) */
#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \
                              Rx##n(X,Y,k[8]), Rx##n(X,Y,k[9]), Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \
                              Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \
                              Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]))
#define Encrypt_256(X,Y,k,n) (Encrypt_128(X,Y,k,n), \
                              Rx##n(X,Y,k[32]), Rx##n(X,Y,k[33]))

/* key-schedule round: emit round key i into both the broadcast (k) and scalar
   (key) tables, then advance the key state one SPECK round with counter i */
#define RK(X,Y,k,key,i) (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X)
#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \
                           RK(C,A,k,key,7), RK(D,A,k,key,8), RK(B,A,k,key,9), RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \
                           RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \
                           RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \
                           RK(C,A,k,key,28), RK(D,A,k,key,29), RK(B,A,k,key,30), RK(C,A,k,key,31), RK(D,A,k,key,32), RK(B,A,k,key,33))

/* CTR worker body, expanded inside speck_encrypt_xor; expects out, in, nonce,
   ctx and numbytes in scope.  numbytes == 16 writes RAW keystream to out
   (no XOR with in) — the caller applies the XOR itself. */
#define Encrypt_Dispatcher(keysize) \
    u64 x[2], y[2]; \
    u512 X[4], Y[4]; \
    unsigned char block1024[128]; \
    \
    if(numbytes == 16) { \
        x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++; \
        Encrypt_##keysize(x, y, ctx->key, 1); \
        ((u64 *)out)[1] = x[0]; ((u64 *)out)[0] = y[0]; \
        return 0; \
    } \
    \
    if(numbytes == 32) { \
        x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++; \
        x[1] = nonce[1]; y[1] = nonce[0]; nonce[0]++; \
        Encrypt_##keysize(x, y, ctx->key, 2); \
        ((u64 *)out)[1] = x[0] ^ ((u64 *)in)[1]; ((u64 *)out)[0] = y[0] ^ ((u64 *)in)[0]; \
        ((u64 *)out)[3] = x[1] ^ ((u64 *)in)[3]; ((u64 *)out)[2] = y[1] ^ ((u64 *)in)[2]; \
        return 0; \
    } \
    \
    if(numbytes == 64) { \
        SET1(X[0], nonce[1]); \
        SET8(Y[0], nonce[0]); \
        Encrypt_##keysize(X, Y, ctx->rk, 8); \
        nonce[0] += (numbytes >> 4); \
        memcpy(block1024, in, 64); \
        XOR_STORE(block1024, block1024, X[0], Y[0]); \
        memcpy(out, block1024, 64); \
        return 0; \
    } \
    \
    SET1(X[0], nonce[1]); SET8(Y[0], nonce[0]); \
    \
    if(numbytes == 128) \
        Encrypt_##keysize(X, Y, ctx->rk, 8); \
    else { \
        X[1] = X[0]; \
        Y[1] = ADD(Y[0], _eight); \
        if(numbytes == 256) \
            Encrypt_##keysize(X, Y, ctx->rk, 16); \
        else { \
            X[2] = X[0]; \
            Y[2] = ADD(Y[1], _eight); \
            if(numbytes == 384) \
                Encrypt_##keysize(X, Y, ctx->rk, 24); \
            else { \
                X[3] = X[0]; \
                Y[3] = ADD(Y[2], _eight); \
                Encrypt_##keysize(X, Y, ctx->rk, 32); \
            } \
        } \
    } \
    \
    nonce[0] += (numbytes >> 4); \
    \
    XOR_STORE(in, out, X[0], Y[0]); \
    if (numbytes >= 256) \
        XOR_STORE(in + 128, out + 128, X[1], Y[1]); \
    if(numbytes >= 384) \
        XOR_STORE(in + 256, out + 256, X[2], Y[2]); \
    if(numbytes >= 512) \
        XOR_STORE(in + 384, out + 384, X[3], Y[3]); \
    \
    return 0
/* Generates keystream for 'numbytes' (16/32/64/128/256/384/512) bytes and,
 * except for numbytes == 16 (raw keystream, caller XORs), XORs it with 'in'
 * into 'out'. Advances nonce[0] by one per 16-byte block. Dispatches on the
 * scheduled key size; both macro branches end in 'return 0', which is why
 * there is no trailing return here. */
static int speck_encrypt_xor(unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t *ctx, int numbytes) {
/* any keysize other than 256 is treated as 128 */
if(ctx->keysize == 256) {
Encrypt_Dispatcher(256);
} else {
Encrypt_Dispatcher(128);
}
}
/* SPECK-CTR en-/decryption of 'inlen' bytes from 'in' to 'out' (may alias).
 * 'n' is the 16-byte IV/initial counter, 'ctx' the expanded key schedule.
 * Works down through the dispatcher's supported chunk sizes, then handles a
 * final partial block byte-wise. Returns 0.
 *
 * Fix vs. previous revision: the (u64 *) casts on n/in/out/block were
 * strict-aliasing violations and unaligned accesses (UB); all word-level
 * loads/stores now go through memcpy, which compilers fold to plain moves. */
static int internal_speck_ctr(unsigned char *out, const unsigned char *in, unsigned long long inlen,
                              const unsigned char *n, speck_context_t *ctx) {

    unsigned long long i;
    u64 nonce[2];
    u64 ks[2];                 /* keystream words for the 16-byte chunk */
    u64 w[2];                  /* plaintext words for the 16-byte chunk */
    unsigned char block[16];   /* raw keystream for the <=16-byte tail */

    if(!inlen)
        return 0;

    memcpy(nonce, n, 16);      /* no alignment guarantee on n */

    while(inlen >= 512) {
        speck_encrypt_xor(out, in, nonce, ctx, 512);
        in += 512; inlen -= 512; out += 512;
    }

    if(inlen >= 384) {
        speck_encrypt_xor(out, in, nonce, ctx, 384);
        in += 384; inlen -= 384; out += 384;
    }

    if(inlen >= 256) {
        speck_encrypt_xor(out, in, nonce, ctx, 256);
        in += 256; inlen -= 256; out += 256;
    }

    if(inlen >= 128) {
        speck_encrypt_xor(out, in, nonce, ctx, 128);
        in += 128; inlen -= 128; out += 128;
    }

    if(inlen >= 64) {
        speck_encrypt_xor(out, in, nonce, ctx, 64);
        in += 64; inlen -= 64; out += 64;
    }

    if(inlen >= 32) {
        speck_encrypt_xor(out, in, nonce, ctx, 32);
        in += 32; inlen -= 32; out += 32;
    }

    if(inlen >= 16) {
        /* numbytes == 16 yields raw keystream; apply the XOR here */
        speck_encrypt_xor(block, in, nonce, ctx, 16);
        memcpy(ks, block, 16);
        memcpy(w, in, 16);
        w[0] ^= ks[0];
        w[1] ^= ks[1];
        memcpy(out, w, 16);
        in += 16; inlen -= 16; out += 16;
    }

    if(inlen > 0) {
        /* final partial block: XOR byte-wise against fresh keystream */
        speck_encrypt_xor(block, in, nonce, ctx, 16);
        for(i = 0; i < inlen; i++)
            out[i] = block[i] ^ in[i];
    }

    return 0;
}
/* Expands the raw key bytes 'k' (keysize bits, 128 or 256) into ctx's vector
 * and scalar round-key tables via the EK key-schedule macro, and records the
 * key size for later dispatch. Returns 0.
 *
 * Fix vs. previous revision: ((u64 *)k)[i] was a strict-aliasing violation
 * and a potentially misaligned read of the caller's key buffer; the key
 * words are now loaded with memcpy. */
static int speck_expand_key (speck_context_t *ctx, const unsigned char *k, int keysize) {

    u64 K[4] = { 0 };
    size_t i;

    /* keysize >> 6 == number of 64-bit key words (2 or 4) */
    for(i = 0; i < ((size_t)keysize >> 6); i++)
        memcpy(&K[i], k + (i * 8), 8);

    // 128 bit has only two keys A and B thus replacing both C and D with B then
    if(keysize == 128) {
        EK(K[0], K[1], K[1], K[1], ctx->rk, ctx->key);
    } else {
        EK(K[0], K[1], K[2], K[3], ctx->rk, ctx->key);
    }

    ctx->keysize = keysize;

    return 0;
}
#elif defined (__AVX2__) // AVX2 support -------------------------------------------------------------------------
#define LCS(x,r) (((x)<<r)|((x)>>(64-r)))
@ -80,8 +297,7 @@
#define Rx2(x,y,k) (x[0]=RCS(x[0],8), x[1]=RCS(x[1],8), x[0]+=y[0], x[1]+=y[1], \
x[0]^=k, x[1]^=k, y[0]=LCS(y[0],3), y[1]=LCS(y[1],3), y[0]^=x[0], y[1]^=x[1])
#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \
#define Encrypt_128(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \
Rx##n(X,Y,k[8]), Rx##n(X,Y,k[9]), Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \
Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \
Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]))
@ -91,7 +307,7 @@
#define RK(X,Y,k,key,i) (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X)
#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \
#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \
RK(C,A,k,key,7), RK(D,A,k,key,8), RK(B,A,k,key,9), RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \
RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \
RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \