small cc20 sse speed-up

This commit is contained in:
Logan007 2020-09-09 14:09:19 +05:45
parent e492e18c27
commit 6c982b9373
2 changed files with 69 additions and 91 deletions

View File

@ -43,7 +43,6 @@ typedef struct cc20_context_t {
// Per-cipher context for the cc20 (ChaCha20) code in this file.
typedef struct cc20_context {
// One 64-byte keystream block as 16 words; the SSE code stores k0..k3 here
// and the trailing partial block is XORed from it byte by byte.
uint32_t keystream32[16];
// 16-word block state: cc20_init_block() copies the constant to word 0,
// the key to word 4 and the nonce to word 12; cc20_block_next() uses
// word 12 as the block counter.
uint32_t state[16];
// Raw cipher key; the SSE path loads 32 bytes from it.
uint8_t key[CC20_KEY_BYTES];
} cc20_context_t;

View File

@ -78,20 +78,10 @@ int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len,
#elif defined (__SSE2__) // SSE ----------------------------------------------------------
// taken (and modified and enhanced) from
// taken (and heavily modified and enhanced) from
// https://github.com/Ginurx/chacha20-c (public domain)
// Fills ctx->state for a new message.
//   ctx   - context to initialise; ctx->key is read, ctx->state is overwritten
//   nonce - CC20_IV_SIZE bytes are copied from it
static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
// 16-byte constant that occupies state words 0..3
const uint8_t *magic_constant = (uint8_t*)"expand 32-byte k";
memcpy(&(ctx->state[ 0]), magic_constant, 16);
// key starts at word 4
memcpy(&(ctx->state[ 4]), ctx->key, CC20_KEY_BYTES);
// nonce starts at word 12, which cc20_block_next() also treats as the counter
memcpy(&(ctx->state[12]), nonce, CC20_IV_SIZE);
}
#define SL _mm_slli_epi32
#define SR _mm_srli_epi32
#define XOR _mm_xor_si128
@ -99,15 +89,17 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
#define ADD _mm_add_epi32
#define ROL(X,r) (XOR(SL(X,r),SR(X,(32-r))))
#define ONE _mm_setr_epi32(1, 0, 0, 0)
#if defined (__SSSE3__) // --- SSSE3
#define L8 _mm_set_epi32(0x0e0d0c0fL, 0x0a09080bL, 0x06050407L, 0x02010003L)
#define L16 _mm_set_epi32(0x0d0c0f0eL, 0x09080b0aL, 0x05040706L, 0x01000302L)
#define ROL8(X) ( _mm_shuffle_epi8(X, L8)) /* SSSE 3 */
#define ROL16(X) ( _mm_shuffle_epi8(X, L16)) /* SSSE 3 */
#else // --- regular SSE2 ---------
#else // --- regular SSE2 ----------
#define ROL8(X) ROL(X,8)
#define ROL16(X) ROL(X,16)
#endif // -------------------------
#endif // --------------------------
#define CC20_PERMUTE_ROWS(A,B,C,D) \
B = _mm_shuffle_epi32(B, _MM_SHUFFLE(0, 3, 2, 1)); \
@ -135,21 +127,23 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
CC20_ODD_ROUND (A, B, C, D); \
CC20_EVEN_ROUND(A, B, C, D)
static void cc20_block_next(cc20_context_t *ctx) {
uint32_t *counter = ctx->state + 12;
int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len,
const unsigned char *iv, cc20_context_t *ctx) {
__m128i a, b, c, d, k0, k1, k2, k3;
a = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 0]));
b = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 4]));
c = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 8]));
d = _mm_loadu_si128 ((__m128i*)&(ctx->state[12]));
uint8_t *keystream8 = (uint8_t*)ctx->keystream32;
k0 = a;
k1 = b;
k2 = c;
k3 = d;
const uint8_t *magic_constant = (uint8_t*)"expand 32-byte k";
a = _mm_loadu_si128 ((__m128i*)magic_constant);
b = _mm_loadu_si128 ((__m128i*)(ctx->key));
c = _mm_loadu_si128 ( (__m128i*)((ctx->key)+16));
d = _mm_loadu_si128 ((__m128i*)iv);
while (in_len >= 64) {
k0 = a; k1 = b; k2 = c; k3 = d;
// 10 double rounds
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
@ -163,71 +157,56 @@ static void cc20_block_next(cc20_context_t *ctx) {
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
k0 = ADD(k0, a);
k1 = ADD(k1, b);
k2 = ADD(k2, c);
k3 = ADD(k3, d);
k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d);
_mm_storeu_si128 ((__m128i*)out,
_mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k0));
in += 16; out += 16;
_mm_storeu_si128 ((__m128i*)out,
_mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k1));
in += 16; out += 16;
_mm_storeu_si128 ((__m128i*)out,
_mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k2));
in += 16; out += 16;
_mm_storeu_si128 ((__m128i*)out,
_mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k3));
in += 16; out += 16;
// increment the 32-bit block counter in the lowest lane of d (ONE adds 1 to lane 0 only)
d = _mm_add_epi32(d, ONE);
in_len -= 64;
}
if (in_len) {
k0 = a; k1 = b; k2 = c; k3 = d;
// 10 double rounds
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
CC20_DOUBLE_ROUND(k0, k1, k2, k3);
k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d);
_mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 0]), k0);
_mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 4]), k1);
_mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 8]), k2);
_mm_storeu_si128 ((__m128i*)&(ctx->keystream32[12]), k3);
// increment counter, make sure it is and stays little endian in memory
*counter = htole32(le32toh(*counter)+1);
}
static void cc20_init_context(cc20_context_t *ctx, const uint8_t *nonce) {
cc20_init_block(ctx, nonce);
}
int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len,
const unsigned char *iv, cc20_context_t *ctx) {
uint8_t *keystream8 = (uint8_t*)ctx->keystream32;
uint32_t * in_p = (uint32_t*)in;
uint32_t * out_p = (uint32_t*)out;
size_t tmp_len = in_len;
cc20_init_context(ctx, iv);
while(in_len >= 64) {
cc20_block_next(ctx);
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 0]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 1]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 2]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 3]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 4]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 5]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 6]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 7]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 8]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 9]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[10]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[11]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[12]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[13]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[14]; in_p++; out_p++;
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[15]; in_p++; out_p++;
in_len -= 64;
}
if(in_len > 0) {
cc20_block_next(ctx);
tmp_len -= in_len;
// keep in mind that out and in got increased inside the last loop
// and point to current position now
while(in_len > 0) {
out[tmp_len] = in[tmp_len] ^ keystream8[tmp_len%64];
tmp_len++;
in_len--;
out[in_len] = in[in_len] ^ keystream8[in_len];
}
}
}