readability code clean-up (#530)

2024-09-19 16:41:11 +02:00 · 2020-12-08 20:32:08 +05:45 · 2020-12-08 20:32:08 +05:45 · 9fbe941511
commit 9fbe941511
parent 3252231ecb
2 changed files with 397 additions and 378 deletions
--- a/include/tf.h
+++ b/include/tf.h
@ -22,42 +22,44 @@
 // published on github/drewcsillag/twofish


-/*
-The MIT License (MIT)
-
-Copyright (c) 2015 Andrew T. Csillag
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
+/**
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015 Andrew T. Csillag
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */


 #ifndef TF_H
 #define TF_H

+
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+
 #include "portable_endian.h"


-#define TF_BLOCK_SIZE           16
-#define TF_IV_SIZE             (TF_BLOCK_SIZE)
+#define TF_BLOCK_SIZE     16
+#define TF_IV_SIZE       (TF_BLOCK_SIZE)


 typedef struct tf_context_t {
@ -82,4 +84,4 @@ int tf_init (const unsigned char *key, size_t key_size, tf_context_t **ctx);
 int tf_deinit (tf_context_t *ctx);


-#endif    // TF_H
+#endif // TF_H
--- a/src/tf.c
+++ b/src/tf.c
@ -22,29 +22,29 @@
 // published on github/drewcsillag/twofish


-/*
-The MIT License (MIT)
-
-Copyright (c) 2015 Andrew T. Csillag
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
+/**
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015 Andrew T. Csillag
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */


 #include "tf.h"
@ -123,6 +123,7 @@ const uint8_t multEF[] = { 0x00, 0xEF, 0xB7, 0x58, 0x07, 0xE8, 0xB0, 0x5F, 0x0E,
                           0xA8, 0x47, 0x1F, 0xF0, 0xAF, 0x40, 0x18, 0xF7, 0xA6, 0x49, 0x11, 0xFE, 0xA1, 0x4E, 0x16, 0xF9,
                           0xB4, 0x5B, 0x03, 0xEC, 0xB3, 0x5C, 0x04, 0xEB, 0xBA, 0x55, 0x0D, 0xE2, 0xBD, 0x52, 0x0A, 0xE5 };

+
 #define RS_MOD 0x14D
 #define RHO 0x01010101L

@ -140,463 +141,479 @@ const uint8_t multEF[] = { 0x00, 0xEF, 0xB7, 0x58, 0x07, 0xE8, 0xB0, 0x5F, 0x0E,
 #define U8S_TO_U32(r0, r1, r2, r3) ((r0 << 24) ^ (r1 << 16) ^ (r2 << 8) ^ r3)


-/* multiply two polynomials represented as u32's, actually called with bytes */
+// multiply two polynomials represented as u32's, actually called with bytes
 uint32_t polyMult(uint32_t a, uint32_t b) {

-  uint32_t t=0;
+    uint32_t t=0;

-  while(a) {
-    if(a&1) t^=b;
-    b <<= 1;
-    a >>= 1;
-  }
+    while(a) {
+        if(a & 1)
+            t^=b;
+        b <<= 1;
+        a >>= 1;
+    }

-  return t;
+    return t;
 }


-/* take the polynomial t and return the t % modulus in GF(256) */
+// take the polynomial t and return t % modulus in GF(256)
 uint32_t gfMod(uint32_t t, uint32_t modulus) {

-  int i;
-  uint32_t tt;
+    int i;
+    uint32_t tt;

-  modulus <<= 7;
-  for(i = 0; i < 8; i++) {
-    tt = t ^ modulus;
-    if(tt < t) t = tt;
-    modulus >>= 1;
-  }
+    modulus <<= 7;
+    for(i = 0; i < 8; i++) {
+        tt = t ^ modulus;
+        if(tt < t)
+             t = tt;
+        modulus >>= 1;
+    }

-  return t;
+    return t;
 }


-/*multiply a and b and return the modulus */
+// multiply a and b and return the product reduced modulo modulus
 #define gfMult(a, b, modulus) gfMod(polyMult(a, b), modulus)


-/* return a u32 containing the result of multiplying the RS Code matrix by the sd matrix */
+// return a u32 containing the result of multiplying the RS Code matrix by the sd matrix
 uint32_t RSMatrixMultiply(uint8_t sd[8]) {

-  int j, k;
-  uint8_t t;
-  uint8_t result[4];
+    int j, k;
+    uint8_t t;
+    uint8_t result[4];

-  for(j = 0; j < 4; j++) {
-    t = 0;
-    for(k = 0; k < 8; k++) {
-      t ^= gfMult(RS[j][k], sd[k], RS_MOD);
+    for(j = 0; j < 4; j++) {
+        t = 0;
+        for(k = 0; k < 8; k++) {
+            t ^= gfMult(RS[j][k], sd[k], RS_MOD);
+        }
+        result[3-j] = t;
    }
-    result[3-j] = t;
-  }

-  return U8ARRAY_TO_U32(result);
+    return U8ARRAY_TO_U32(result);
 }


-/* the Zero-keyed h function (used by the key setup routine) */
+// the Zero-keyed h function (used by the key setup routine)
 uint32_t h(uint32_t X, uint32_t L[4], int k) {

-  uint8_t y0, y1, y2, y3;
-  uint8_t z0, z1, z2, z3;
+    uint8_t y0, y1, y2, y3;
+    uint8_t z0, z1, z2, z3;

-  y0 = b0(X);
-  y1 = b1(X);
-  y2 = b2(X);
-  y3 = b3(X);
+    y0 = b0(X);
+    y1 = b1(X);
+    y2 = b2(X);
+    y3 = b3(X);

-  switch(k) {
-    case 4:
-      y0 = Q1[y0] ^ b0(L[3]);
-      y1 = Q0[y1] ^ b1(L[3]);
-      y2 = Q0[y2] ^ b2(L[3]);
-      y3 = Q1[y3] ^ b3(L[3]);
-    case 3:
-      y0 = Q1[y0] ^ b0(L[2]);
-      y1 = Q1[y1] ^ b1(L[2]);
-      y2 = Q0[y2] ^ b2(L[2]);
-      y3 = Q0[y3] ^ b3(L[2]);
-    case 2:
-      y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
-      y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
-      y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
-      y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
-  }
-
-  /* inline the MDS matrix multiply */
-  z0 = multEF[y0] ^ y1 ^         multEF[y2] ^ mult5B[y3];
-  z1 = multEF[y0] ^ mult5B[y1] ^ y2 ^         multEF[y3];
-  z2 = mult5B[y0] ^ multEF[y1] ^ multEF[y2] ^ y3;
-  z3 = y0 ^         multEF[y1] ^ mult5B[y2] ^ mult5B[y3];
-
-  return U8S_TO_U32(z0, z1, z2, z3);
-}
-
-
-/* given the Sbox keys, create the fully keyed QF */
-void fullKey(uint32_t L[4], int k, uint32_t QF[4][256]) {
-
-  uint8_t y0, y1, y2, y3;
-  int i;
-
-  /* for all input values to the Q permutations */
-  for(i=0; i<256; i++) {
-    /* run the Q permutations */
-    y0 = i; y1=i; y2=i; y3=i;
    switch(k) {
-      case 4:
-        y0 = Q1[y0] ^ b0(L[3]);
-        y1 = Q0[y1] ^ b1(L[3]);
-        y2 = Q0[y2] ^ b2(L[3]);
-        y3 = Q1[y3] ^ b3(L[3]);
-      case 3:
-        y0 = Q1[y0] ^ b0(L[2]);
-        y1 = Q1[y1] ^ b1(L[2]);
-        y2 = Q0[y2] ^ b2(L[2]);
-        y3 = Q0[y3] ^ b3(L[2]);
-      case 2:
-        y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
-        y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
-        y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
-        y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
+        case 4:
+            y0 = Q1[y0] ^ b0(L[3]);
+            y1 = Q0[y1] ^ b1(L[3]);
+            y2 = Q0[y2] ^ b2(L[3]);
+            y3 = Q1[y3] ^ b3(L[3]);
+        case 3:
+            y0 = Q1[y0] ^ b0(L[2]);
+            y1 = Q1[y1] ^ b1(L[2]);
+            y2 = Q0[y2] ^ b2(L[2]);
+            y3 = Q0[y3] ^ b3(L[2]);
+        case 2:
+            y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
+            y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
+            y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
+            y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
    }

-    /* now do the partial MDS matrix multiplies */
-    QF[0][i] = ((multEF[y0] << 24)
-             | (multEF[y0] << 16)
-             | (mult5B[y0] << 8)
-             | y0);
-    QF[1][i] = ((y1 << 24)
-             | (mult5B[y1] << 16)
-             | (multEF[y1] << 8)
-             | multEF[y1]);
-    QF[2][i] = ((multEF[y2] << 24)
-             | (y2 << 16)
-             | (multEF[y2] << 8)
-             | mult5B[y2]);
-    QF[3][i] = ((mult5B[y3] << 24)
-             | (multEF[y3] << 16)
-             | (y3 << 8)
-             | mult5B[y3]);
-  }
+    // inline the MDS matrix multiply
+    z0 = multEF[y0] ^ y1 ^         multEF[y2] ^ mult5B[y3];
+    z1 = multEF[y0] ^ mult5B[y1] ^ y2 ^         multEF[y3];
+    z2 = mult5B[y0] ^ multEF[y1] ^ multEF[y2] ^ y3;
+    z3 = y0 ^         multEF[y1] ^ mult5B[y2] ^ mult5B[y3];
+
+    return U8S_TO_U32(z0, z1, z2, z3);
 }

-// -------------------------------------------------------------------------------------

-/* fully keyed h (aka g) function */
+// given the Sbox keys L and k (key length in 64-bit words, as computed by keySched), fill QF[0..3][0..255] with the fully keyed lookup tables read by fkh()
+void fullKey(uint32_t L[4], int k, uint32_t QF[4][256]) {
+
+    uint8_t y0, y1, y2, y3;
+    int i;
+
+    // for all 256 possible input byte values to the Q permutations
+    for(i = 0; i < 256; i++) {
+        // run the Q permutations, all four lanes start from the same input byte i
+        y0 = i; y1 = i; y2 = i; y3 = i;
+        switch(k) { // no default: any k other than 2, 3 or 4 leaves y0..y3 == i
+            case 4: // k == 4 only: extra stage keyed with L[3]; no break, falls through to case 3
+                y0 = Q1[y0] ^ b0(L[3]);
+                y1 = Q0[y1] ^ b1(L[3]);
+                y2 = Q0[y2] ^ b2(L[3]);
+                y3 = Q1[y3] ^ b3(L[3]);
+            case 3: // k >= 3: stage keyed with L[2]; no break, falls through to case 2
+                y0 = Q1[y0] ^ b0(L[2]);
+                y1 = Q1[y1] ^ b1(L[2]);
+                y2 = Q0[y2] ^ b2(L[2]);
+                y3 = Q0[y3] ^ b3(L[2]);
+            case 2: // k >= 2: final stages keyed with L[1], then L[0]
+                y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
+                y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
+                y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
+                y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
+        }
+
+        // now do the partial MDS matrix multiplies: QF[j][i] packs the contribution of byte yj to all four output bytes, so fkh() only needs to XOR four lookups
+        QF[0][i] = ((multEF[y0] << 24) // NOTE(review): uint8_t operand promotes to int, so << 24 of a value >= 0x80 shifts into the sign bit; consider a uint32_t cast
+                 | (multEF[y0] << 16)
+                 | (mult5B[y0] << 8)
+                 | y0);
+        QF[1][i] = ((y1 << 24)
+                 | (mult5B[y1] << 16)
+                 | (multEF[y1] << 8)
+                 | multEF[y1]);
+        QF[2][i] = ((multEF[y2] << 24)
+                 | (y2 << 16)
+                 | (multEF[y2] << 8)
+                 | mult5B[y2]);
+        QF[3][i] = ((mult5B[y3] << 24)
+                 | (multEF[y3] << 16)
+                 | (y3 << 8)
+                 | mult5B[y3]);
+    }
+}
+
+// ----------------------------------------------------------------------------------------------------------------
+
+
+// fully keyed h (aka g) function
 #define fkh(X) (ctx->QF[0][b0(X)]^ctx->QF[1][b1(X)]^ctx->QF[2][b2(X)]^ctx->QF[3][b3(X)])

-// -------------------------------------------------------------------------------------

-/* one encryption round */
+// ----------------------------------------------------------------------------------------------------------------
+
+
+// one encryption round
 #define ENC_ROUND(R0, R1, R2, R3, round) \
-  T0 = fkh(R0); \
-  T1 = fkh(ROL(R1, 8)); \
-  R2 = ROR(R2 ^ (T1 + T0 + ctx->K[2*round+8]), 1); \
-  R3 = ROL(R3, 1) ^ (2*T1 + T0 + ctx->K[2*round+9]);
+    T0 = fkh(R0); \
+    T1 = fkh(ROL(R1, 8)); \
+    R2 = ROR(R2 ^ (T1 + T0 + ctx->K[2*round+8]), 1); \
+    R3 = ROL(R3, 1) ^ (2*T1 + T0 + ctx->K[2*round+9]);


 void twofish_internal_encrypt(uint8_t PT[16], tf_context_t *ctx) {

-  uint32_t R0, R1, R2, R3;
-  uint32_t T0, T1;
+    uint32_t R0, R1, R2, R3;
+    uint32_t T0, T1;

-  /* load/byteswap/whiten input */
-  R3 = ctx->K[3] ^ le32toh(((uint32_t*)PT)[3]);
-  R2 = ctx->K[2] ^ le32toh(((uint32_t*)PT)[2]);
-  R1 = ctx->K[1] ^ le32toh(((uint32_t*)PT)[1]);
-  R0 = ctx->K[0] ^ le32toh(((uint32_t*)PT)[0]);
+    // load/byteswap/whiten input
+    R3 = ctx->K[3] ^ le32toh(((uint32_t*)PT)[3]);
+    R2 = ctx->K[2] ^ le32toh(((uint32_t*)PT)[2]);
+    R1 = ctx->K[1] ^ le32toh(((uint32_t*)PT)[1]);
+    R0 = ctx->K[0] ^ le32toh(((uint32_t*)PT)[0]);

-  ENC_ROUND(R0, R1, R2, R3, 0);
-  ENC_ROUND(R2, R3, R0, R1, 1);
-  ENC_ROUND(R0, R1, R2, R3, 2);
-  ENC_ROUND(R2, R3, R0, R1, 3);
-  ENC_ROUND(R0, R1, R2, R3, 4);
-  ENC_ROUND(R2, R3, R0, R1, 5);
-  ENC_ROUND(R0, R1, R2, R3, 6);
-  ENC_ROUND(R2, R3, R0, R1, 7);
-  ENC_ROUND(R0, R1, R2, R3, 8);
-  ENC_ROUND(R2, R3, R0, R1, 9);
-  ENC_ROUND(R0, R1, R2, R3, 10);
-  ENC_ROUND(R2, R3, R0, R1, 11);
-  ENC_ROUND(R0, R1, R2, R3, 12);
-  ENC_ROUND(R2, R3, R0, R1, 13);
-  ENC_ROUND(R0, R1, R2, R3, 14);
-  ENC_ROUND(R2, R3, R0, R1, 15);
+    ENC_ROUND(R0, R1, R2, R3,  0);
+    ENC_ROUND(R2, R3, R0, R1,  1);
+    ENC_ROUND(R0, R1, R2, R3,  2);
+    ENC_ROUND(R2, R3, R0, R1,  3);
+    ENC_ROUND(R0, R1, R2, R3,  4);
+    ENC_ROUND(R2, R3, R0, R1,  5);
+    ENC_ROUND(R0, R1, R2, R3,  6);
+    ENC_ROUND(R2, R3, R0, R1,  7);
+    ENC_ROUND(R0, R1, R2, R3,  8);
+    ENC_ROUND(R2, R3, R0, R1,  9);
+    ENC_ROUND(R0, R1, R2, R3, 10);
+    ENC_ROUND(R2, R3, R0, R1, 11);
+    ENC_ROUND(R0, R1, R2, R3, 12);
+    ENC_ROUND(R2, R3, R0, R1, 13);
+    ENC_ROUND(R0, R1, R2, R3, 14);
+    ENC_ROUND(R2, R3, R0, R1, 15);

-  /* load/byteswap/whiten output */
-  ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[7]);
-  ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[6]);
-  ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[5]);
-  ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[4]);
+    // whiten/byteswap/store output
+    ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[7]);
+    ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[6]);
+    ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[5]);
+    ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[4]);
 }

-// -------------------------------------------------------------------------------------

-/* one decryption round */
+// ----------------------------------------------------------------------------------------------------------------
+
+
+// one decryption round
 #define DEC_ROUND(R0, R1, R2, R3, round) \
-  T0 = fkh(R0); \
-  T1 = fkh(ROL(R1, 8)); \
-  R2 = ROL(R2, 1) ^ (T0 + T1 + ctx->K[2*round+8]); \
-  R3 = ROR(R3 ^ (T0 + 2*T1 + ctx->K[2*round+9]), 1);
+    T0 = fkh(R0); \
+    T1 = fkh(ROL(R1, 8)); \
+    R2 = ROL(R2, 1) ^ (T0 + T1 + ctx->K[2*round+8]); \
+    R3 = ROR(R3 ^ (T0 + 2*T1 + ctx->K[2*round+9]), 1);


 void twofish_internal_decrypt(uint8_t PT[16], const uint8_t CT[16], tf_context_t *ctx) {

-  uint32_t T0, T1;
-  uint32_t R0, R1, R2, R3;
+    uint32_t T0, T1;
+    uint32_t R0, R1, R2, R3;

-  /* load/byteswap/whiten input */
-  R3 = ctx->K[7] ^ le32toh(((uint32_t*)CT)[3]);
-  R2 = ctx->K[6] ^ le32toh(((uint32_t*)CT)[2]);
-  R1 = ctx->K[5] ^ le32toh(((uint32_t*)CT)[1]);
-  R0 = ctx->K[4] ^ le32toh(((uint32_t*)CT)[0]);
+    // load/byteswap/whiten input
+    R3 = ctx->K[7] ^ le32toh(((uint32_t*)CT)[3]);
+    R2 = ctx->K[6] ^ le32toh(((uint32_t*)CT)[2]);
+    R1 = ctx->K[5] ^ le32toh(((uint32_t*)CT)[1]);
+    R0 = ctx->K[4] ^ le32toh(((uint32_t*)CT)[0]);

-  DEC_ROUND(R0, R1, R2, R3, 15);
-  DEC_ROUND(R2, R3, R0, R1, 14);
-  DEC_ROUND(R0, R1, R2, R3, 13);
-  DEC_ROUND(R2, R3, R0, R1, 12);
-  DEC_ROUND(R0, R1, R2, R3, 11);
-  DEC_ROUND(R2, R3, R0, R1, 10);
-  DEC_ROUND(R0, R1, R2, R3, 9);
-  DEC_ROUND(R2, R3, R0, R1, 8);
-  DEC_ROUND(R0, R1, R2, R3, 7);
-  DEC_ROUND(R2, R3, R0, R1, 6);
-  DEC_ROUND(R0, R1, R2, R3, 5);
-  DEC_ROUND(R2, R3, R0, R1, 4);
-  DEC_ROUND(R0, R1, R2, R3, 3);
-  DEC_ROUND(R2, R3, R0, R1, 2);
-  DEC_ROUND(R0, R1, R2, R3, 1);
-  DEC_ROUND(R2, R3, R0, R1, 0);
+    DEC_ROUND(R0, R1, R2, R3, 15);
+    DEC_ROUND(R2, R3, R0, R1, 14);
+    DEC_ROUND(R0, R1, R2, R3, 13);
+    DEC_ROUND(R2, R3, R0, R1, 12);
+    DEC_ROUND(R0, R1, R2, R3, 11);
+    DEC_ROUND(R2, R3, R0, R1, 10);
+    DEC_ROUND(R0, R1, R2, R3,  9);
+    DEC_ROUND(R2, R3, R0, R1,  8);
+    DEC_ROUND(R0, R1, R2, R3,  7);
+    DEC_ROUND(R2, R3, R0, R1,  6);
+    DEC_ROUND(R0, R1, R2, R3,  5);
+    DEC_ROUND(R2, R3, R0, R1,  4);
+    DEC_ROUND(R0, R1, R2, R3,  3);
+    DEC_ROUND(R2, R3, R0, R1,  2);
+    DEC_ROUND(R0, R1, R2, R3,  1);
+    DEC_ROUND(R2, R3, R0, R1,  0);

-  /* load/byteswap/whiten output */
-  ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[3]);
-  ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[2]);
-  ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[1]);
-  ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[0]);
+    // whiten/byteswap/store output
+    ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[3]);
+    ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[2]);
+    ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[1]);
+    ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[0]);
 }

+
 // -------------------------------------------------------------------------------------

-/* the key schedule routine */
+
+// the key schedule routine
 void keySched(const uint8_t M[], int N, uint32_t **S, uint32_t K[40], int *k) {

-  uint32_t Mo[4], Me[4];
-  int i, j;
-  uint8_t vector[8];
-  uint32_t A, B;
+    uint32_t Mo[4], Me[4];
+    int i, j;
+    uint8_t vector[8];
+    uint32_t A, B;

-  *k = (N + 63) / 64;
-  *S = (uint32_t*)malloc(sizeof(uint32_t) * (*k));
+    *k = (N + 63) / 64;
+    *S = (uint32_t*)malloc(sizeof(uint32_t) * (*k));

-  for(i = 0; i < *k; i++) {
-    Me[i] = le32toh(((uint32_t*)M)[2*i]);
-    Mo[i] = le32toh(((uint32_t*)M)[2*i+1]);
-  }
+    for(i = 0; i < *k; i++) {
+        Me[i] = le32toh(((uint32_t*)M)[2*i]);
+        Mo[i] = le32toh(((uint32_t*)M)[2*i+1]);
+    }

-  for(i = 0; i < *k; i++) {
-    for(j = 0; j < 4; j++)
-      vector[j] = _b(Me[i], j);
-    for(j = 0; j < 4; j++)
-      vector[j+4] = _b(Mo[i], j);
-    (*S)[(*k)-i-1] = RSMatrixMultiply(vector);
-  }
-  for(i = 0; i < 20; i++) {
-    A = h(2*i*RHO, Me, *k);
-    B = ROL(h(2*i*RHO + RHO, Mo, *k), 8);
-    K[2*i] = A+B;
-    K[2*i+1] = ROL(A + 2*B, 9);
-  }
+    for(i = 0; i < *k; i++) {
+        for(j = 0; j < 4; j++)
+            vector[j] = _b(Me[i], j);
+        for(j = 0; j < 4; j++)
+            vector[j+4] = _b(Mo[i], j);
+        (*S)[(*k)-i-1] = RSMatrixMultiply(vector);
+    }
+
+    for(i = 0; i < 20; i++) {
+        A = h(2*i*RHO, Me, *k);
+        B = ROL(h(2*i*RHO + RHO, Mo, *k), 8);
+        K[2*i] = A+B;
+        K[2*i+1] = ROL(A + 2*B, 9);
+    }
 }

-// -------------------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------------------------------------------
+

 #define fix_xor(target, source) *(uint32_t*)&(target)[0] = *(uint32_t*)&(target)[0] ^ *(uint32_t*)&(source)[0]; *(uint32_t*)&(target)[4] = *(uint32_t*)&(target)[4] ^ *(uint32_t*)&(source)[4]; \
                                *(uint32_t*)&(target)[8] = *(uint32_t*)&(target)[8] ^ *(uint32_t*)&(source)[8]; *(uint32_t*)&(target)[12] = *(uint32_t*)&(target)[12] ^ *(uint32_t*)&(source)[12];

-// -------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------------------------------


-/** public API **/
+// public API


 int tf_ecb_decrypt (unsigned char *out, const unsigned char *in, tf_context_t *ctx) {

-  twofish_internal_decrypt(out, in, ctx);
-  return TF_BLOCK_SIZE;
+    twofish_internal_decrypt(out, in, ctx);
+
+    return TF_BLOCK_SIZE;
 }

+
 // not used
 int tf_ecb_encrypt (unsigned char *out, const unsigned char *in, tf_context_t *ctx) {

-  memcpy (out, in, TF_BLOCK_SIZE);
-  twofish_internal_encrypt(out, ctx);
-  return TF_BLOCK_SIZE;
+    memcpy(out, in, TF_BLOCK_SIZE);
+    twofish_internal_encrypt(out, ctx);
+
+    return TF_BLOCK_SIZE;
 }


 int tf_cbc_encrypt (unsigned char *out, const unsigned char *in, size_t in_len,
                    const unsigned char *iv, tf_context_t *ctx) {

-  uint8_t tmp[TF_BLOCK_SIZE];
-  size_t i;
-  size_t n;
+    uint8_t tmp[TF_BLOCK_SIZE];
+    size_t i;
+    size_t n;

-  memcpy(tmp, iv, TF_BLOCK_SIZE);
+    memcpy(tmp, iv, TF_BLOCK_SIZE);

-  n = in_len / TF_BLOCK_SIZE;
-  for(i=0; i < n; i++) {
-    fix_xor(tmp, &in[i * TF_BLOCK_SIZE]);
-    twofish_internal_encrypt(tmp, ctx);
-    memcpy(&out[i * TF_BLOCK_SIZE], tmp, TF_BLOCK_SIZE);
-  }
-  return n * TF_BLOCK_SIZE;
+    n = in_len / TF_BLOCK_SIZE;
+    for(i = 0; i < n; i++) {
+        fix_xor(tmp, &in[i * TF_BLOCK_SIZE]);
+        twofish_internal_encrypt(tmp, ctx);
+        memcpy(&out[i * TF_BLOCK_SIZE], tmp, TF_BLOCK_SIZE);
+    }
+
+    return n * TF_BLOCK_SIZE;
 }


 int tf_cbc_decrypt (unsigned char *out, const unsigned char *in, size_t in_len,
                    const unsigned char *iv, tf_context_t *ctx) {

-  int n;                       // number of blocks
-  int ret = (int)in_len & 15;  // remainder
+    int n;                       /* number of blocks */
+    int ret = (int)in_len & 15;  /* remainder        */

-  uint8_t ivec[TF_BLOCK_SIZE]; // the ivec/old handling might be optimized if we
-  uint8_t old[TF_BLOCK_SIZE];  // can be sure that in != out
+    uint8_t ivec[TF_BLOCK_SIZE]; /* the ivec/old handling might be optimized if we */
+    uint8_t old[TF_BLOCK_SIZE];  /* can be sure that in != out                     */

-  memcpy(ivec, iv, TF_BLOCK_SIZE);
+    memcpy(ivec, iv, TF_BLOCK_SIZE);

-  for(n = in_len / TF_BLOCK_SIZE; n > 2; n -=3) {
+    // 3 parallel rails of twofish decryption
+    for(n = in_len / TF_BLOCK_SIZE; n > 2; n -=3) {
+        memcpy(old, in + 2 * TF_BLOCK_SIZE, TF_BLOCK_SIZE);

-    memcpy(old, in + 2 * TF_BLOCK_SIZE, TF_BLOCK_SIZE);
+        uint32_t T0, T1;
+        uint32_t Q0, Q1, Q2, Q3, R0, R1, R2, R3, S0, S1, S2, S3;

-    uint32_t T0, T1;
-    uint32_t Q0, Q1, Q2, Q3, R0, R1, R2, R3, S0, S1, S2, S3;
+        // load/byteswap/whiten input/iv
+        Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
+        Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
+        Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
+        Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);

-    /* load/byteswap/whiten input/iv */
-    Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
-    Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
-    Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
-    Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);
+        R3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[7]);
+        R2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[6]);
+        R1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[5]);
+        R0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[4]);

-    R3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[7]);
-    R2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[6]);
-    R1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[5]);
-    R0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[4]);
+        S3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[11]);
+        S2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[10]);
+        S1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[9]);
+        S0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[8]);

-    S3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[11]);
-    S2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[10]);
-    S1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[9]);
-    S0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[8]);
+        DEC_ROUND(Q0, Q1, Q2, Q3, 15); DEC_ROUND(R0, R1, R2, R3, 15); DEC_ROUND(S0, S1, S2, S3, 15);
+        DEC_ROUND(Q2, Q3, Q0, Q1, 14); DEC_ROUND(R2, R3, R0, R1, 14); DEC_ROUND(S2, S3, S0, S1, 14);
+        DEC_ROUND(Q0, Q1, Q2, Q3, 13); DEC_ROUND(R0, R1, R2, R3, 13); DEC_ROUND(S0, S1, S2, S3, 13);
+        DEC_ROUND(Q2, Q3, Q0, Q1, 12); DEC_ROUND(R2, R3, R0, R1, 12); DEC_ROUND(S2, S3, S0, S1, 12);
+        DEC_ROUND(Q0, Q1, Q2, Q3, 11); DEC_ROUND(R0, R1, R2, R3, 11); DEC_ROUND(S0, S1, S2, S3, 11);
+        DEC_ROUND(Q2, Q3, Q0, Q1, 10); DEC_ROUND(R2, R3, R0, R1, 10); DEC_ROUND(S2, S3, S0, S1, 10);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  9); DEC_ROUND(R0, R1, R2, R3,  9); DEC_ROUND(S0, S1, S2, S3,  9);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  8); DEC_ROUND(R2, R3, R0, R1,  8); DEC_ROUND(S2, S3, S0, S1,  8);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  7); DEC_ROUND(R0, R1, R2, R3,  7); DEC_ROUND(S0, S1, S2, S3,  7);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  6); DEC_ROUND(R2, R3, R0, R1,  6); DEC_ROUND(S2, S3, S0, S1,  6);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  5); DEC_ROUND(R0, R1, R2, R3,  5); DEC_ROUND(S0, S1, S2, S3,  5);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  4); DEC_ROUND(R2, R3, R0, R1,  4); DEC_ROUND(S2, S3, S0, S1,  4);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  3); DEC_ROUND(R0, R1, R2, R3,  3); DEC_ROUND(S0, S1, S2, S3,  3);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  2); DEC_ROUND(R2, R3, R0, R1,  2); DEC_ROUND(S2, S3, S0, S1,  2);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  1); DEC_ROUND(R0, R1, R2, R3,  1); DEC_ROUND(S0, S1, S2, S3,  1);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  0); DEC_ROUND(R2, R3, R0, R1,  0); DEC_ROUND(S2, S3, S0, S1,  0);

-    DEC_ROUND(Q0, Q1, Q2, Q3, 15); DEC_ROUND(R0, R1, R2, R3, 15); DEC_ROUND(S0, S1, S2, S3, 15);
-    DEC_ROUND(Q2, Q3, Q0, Q1, 14); DEC_ROUND(R2, R3, R0, R1, 14); DEC_ROUND(S2, S3, S0, S1, 14);
-    DEC_ROUND(Q0, Q1, Q2, Q3, 13); DEC_ROUND(R0, R1, R2, R3, 13); DEC_ROUND(S0, S1, S2, S3, 13);
-    DEC_ROUND(Q2, Q3, Q0, Q1, 12); DEC_ROUND(R2, R3, R0, R1, 12); DEC_ROUND(S2, S3, S0, S1, 12);
-    DEC_ROUND(Q0, Q1, Q2, Q3, 11); DEC_ROUND(R0, R1, R2, R3, 11); DEC_ROUND(S0, S1, S2, S3, 11);
-    DEC_ROUND(Q2, Q3, Q0, Q1, 10); DEC_ROUND(R2, R3, R0, R1, 10); DEC_ROUND(S2, S3, S0, S1, 10);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  9); DEC_ROUND(R0, R1, R2, R3,  9); DEC_ROUND(S0, S1, S2, S3,  9);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  8); DEC_ROUND(R2, R3, R0, R1,  8); DEC_ROUND(S2, S3, S0, S1,  8);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  7); DEC_ROUND(R0, R1, R2, R3,  7); DEC_ROUND(S0, S1, S2, S3,  7);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  6); DEC_ROUND(R2, R3, R0, R1,  6); DEC_ROUND(S2, S3, S0, S1,  6);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  5); DEC_ROUND(R0, R1, R2, R3,  5); DEC_ROUND(S0, S1, S2, S3,  5);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  4); DEC_ROUND(R2, R3, R0, R1,  4); DEC_ROUND(S2, S3, S0, S1,  4);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  3); DEC_ROUND(R0, R1, R2, R3,  3); DEC_ROUND(S0, S1, S2, S3,  3);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  2); DEC_ROUND(R2, R3, R0, R1,  2); DEC_ROUND(S2, S3, S0, S1,  2);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  1); DEC_ROUND(R0, R1, R2, R3,  1); DEC_ROUND(S0, S1, S2, S3,  1);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  0); DEC_ROUND(R2, R3, R0, R1,  0); DEC_ROUND(S2, S3, S0, S1,  0);
+        // whiten/byteswap/store output/iv
+        ((uint32_t*)out)[11] = htole32(S1 ^ ctx->K[3] ^ ((uint32_t*)in)[7]);
+        ((uint32_t*)out)[10] = htole32(S0 ^ ctx->K[2] ^ ((uint32_t*)in)[6]);
+        ((uint32_t*)out)[9]  = htole32(S3 ^ ctx->K[1] ^ ((uint32_t*)in)[5]);
+        ((uint32_t*)out)[8]  = htole32(S2 ^ ctx->K[0] ^ ((uint32_t*)in)[4]);

-    /* load/byteswap/whiten output/iv */
+        ((uint32_t*)out)[7]  = htole32(R1 ^ ctx->K[3] ^ ((uint32_t*)in)[3]);
+        ((uint32_t*)out)[6]  = htole32(R0 ^ ctx->K[2] ^ ((uint32_t*)in)[2]);
+        ((uint32_t*)out)[5]  = htole32(R3 ^ ctx->K[1] ^ ((uint32_t*)in)[1]);
+        ((uint32_t*)out)[4]  = htole32(R2 ^ ctx->K[0] ^ ((uint32_t*)in)[0]);

-    ((uint32_t*)out)[11] = htole32(S1 ^ ctx->K[3] ^ ((uint32_t*)in)[7]);
-    ((uint32_t*)out)[10] = htole32(S0 ^ ctx->K[2] ^ ((uint32_t*)in)[6]);
-    ((uint32_t*)out)[9]  = htole32(S3 ^ ctx->K[1] ^ ((uint32_t*)in)[5]);
-    ((uint32_t*)out)[8]  = htole32(S2 ^ ctx->K[0] ^ ((uint32_t*)in)[4]);
+        ((uint32_t*)out)[3]  = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
+        ((uint32_t*)out)[2]  = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
+        ((uint32_t*)out)[1]  = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
+        ((uint32_t*)out)[0]  = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);

-    ((uint32_t*)out)[7]  = htole32(R1 ^ ctx->K[3] ^ ((uint32_t*)in)[3]);
-    ((uint32_t*)out)[6]  = htole32(R0 ^ ctx->K[2] ^ ((uint32_t*)in)[2]);
-    ((uint32_t*)out)[5]  = htole32(R3 ^ ctx->K[1] ^ ((uint32_t*)in)[1]);
-    ((uint32_t*)out)[4]  = htole32(R2 ^ ctx->K[0] ^ ((uint32_t*)in)[0]);
+        in += 3 * TF_BLOCK_SIZE; out += 3 * TF_BLOCK_SIZE;

-    ((uint32_t*)out)[3]  = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
-    ((uint32_t*)out)[2]  = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
-    ((uint32_t*)out)[1]  = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
-    ((uint32_t*)out)[0]  = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);
+        memcpy(ivec, old, TF_BLOCK_SIZE);
+    }

-    in += 3 * TF_BLOCK_SIZE; out += 3 * TF_BLOCK_SIZE;
+    // handle the remaining two or fewer blocks on a single rail
+    for(; n != 0; n--) {
+        uint32_t T0, T1;
+        uint32_t Q0, Q1, Q2, Q3;

-    memcpy(ivec, old, TF_BLOCK_SIZE);
-  }
+        memcpy(old, in, TF_BLOCK_SIZE);

-  for(; n != 0; n--) {
-    uint32_t T0, T1;
-    uint32_t Q0, Q1, Q2, Q3;
+        // load/byteswap/whiten input
+        Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
+        Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
+        Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
+        Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);

-    memcpy (old, in, TF_BLOCK_SIZE);
+        DEC_ROUND(Q0, Q1, Q2, Q3, 15);
+        DEC_ROUND(Q2, Q3, Q0, Q1, 14);
+        DEC_ROUND(Q0, Q1, Q2, Q3, 13);
+        DEC_ROUND(Q2, Q3, Q0, Q1, 12);
+        DEC_ROUND(Q0, Q1, Q2, Q3, 11);
+        DEC_ROUND(Q2, Q3, Q0, Q1, 10);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  9);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  8);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  7);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  6);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  5);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  4);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  3);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  2);
+        DEC_ROUND(Q0, Q1, Q2, Q3,  1);
+        DEC_ROUND(Q2, Q3, Q0, Q1,  0);

-    /* load/byteswap/whiten input */
-    Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
-    Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
-    Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
-    Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);
+        // whiten/byteswap/store output/iv
+        ((uint32_t*)out)[3] = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
+        ((uint32_t*)out)[2] = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
+        ((uint32_t*)out)[1] = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
+        ((uint32_t*)out)[0] = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);

-    DEC_ROUND(Q0, Q1, Q2, Q3, 15);
-    DEC_ROUND(Q2, Q3, Q0, Q1, 14);
-    DEC_ROUND(Q0, Q1, Q2, Q3, 13);
-    DEC_ROUND(Q2, Q3, Q0, Q1, 12);
-    DEC_ROUND(Q0, Q1, Q2, Q3, 11);
-    DEC_ROUND(Q2, Q3, Q0, Q1, 10);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  9);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  8);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  7);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  6);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  5);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  4);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  3);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  2);
-    DEC_ROUND(Q0, Q1, Q2, Q3,  1);
-    DEC_ROUND(Q2, Q3, Q0, Q1,  0);
+        in += TF_BLOCK_SIZE; out+= TF_BLOCK_SIZE;

-    /* load/byteswap/whiten output/iv */
-    ((uint32_t*)out)[3] = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
-    ((uint32_t*)out)[2] = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
-    ((uint32_t*)out)[1] = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
-    ((uint32_t*)out)[0] = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);
+        memcpy(ivec, old, TF_BLOCK_SIZE);
+    }

-    in += TF_BLOCK_SIZE; out+= TF_BLOCK_SIZE;
-
-    memcpy (ivec, old, TF_BLOCK_SIZE);
-  }
-
-  return n * TF_BLOCK_SIZE;
+    return n * TF_BLOCK_SIZE;
 }

-/**
- * By definition twofish can only accept key up to 256 bit
- * we wont do any checking here and will assume user already
- * know about it. Twofish is undefined for key larger than 256 bit
- */
+
+// by definition, twofish only accepts keys of up to 256 bits;
+// we won't do any checking here and will assume the user already
+// knows about it. twofish is undefined for keys larger than 256 bits
 int tf_init (const unsigned char *key, size_t key_size, tf_context_t **ctx) {

-  int k;
-  uint32_t *S;
+    int k;
+    uint32_t *S;

-  *ctx = calloc(1, sizeof(tf_context_t));
-  if(!(*ctx)) {
-    return -1;
-  }
-  (*ctx)->N = key_size;
-  keySched(key, key_size, &S, (*ctx)->K, &k);
-  fullKey(S, k, (*ctx)->QF);
-  free(S);    // allocated in keySched(...)
+    *ctx = calloc(1, sizeof(tf_context_t));
+    if(!(*ctx)) {
+        return -1;
+    }

-  return 0;
+    (*ctx)->N = key_size;
+    keySched(key, key_size, &S, (*ctx)->K, &k);
+    fullKey(S, k, (*ctx)->QF);
+    free(S); /* allocated in keySched(...) */
+
+    return 0;
 }


 int tf_deinit (tf_context_t *ctx) {

-  if (ctx) free (ctx);
+    if(ctx) free(ctx);

-  return 0;
+    return 0;
 }