diff --git a/Makefile b/Makefile index d2f9e61dd2..60e9f4a8df 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ CC = gcc -CFLAGS = -Wall -Wextra -Os -OBJS = bignum.o ecdsa.o secp256k1.o sha2.o rand.o hmac.o bip32.o ripemd160.o bip39.o pbkdf2.o base58.o -OBJS += aescrypt.o aeskey.o aestab.o +CFLAGS = -Wall -Wextra -Os -Wno-sequence-point +OBJS = bignum.o ecdsa.o secp256k1.o rand.o hmac.o bip32.o bip39.o pbkdf2.o base58.o +OBJS += ripemd160.o +OBJS += sha2.o +OBJS += aescrypt.o aeskey.o aestab.o aes_modes.o all: tests test-openssl diff --git a/aes.h b/aes.h index 687d7ad3aa..f56c1b88a5 100644 --- a/aes.h +++ b/aes.h @@ -1,100 +1,200 @@ /* - ------------------------------------------------------------------------- - Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. - All rights reserved. +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. - LICENSE TERMS +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and fitness for purpose. - ------------------------------------------------------------------------- - Issue Date: 29/07/2002 - - This file contains the definitions required to use AES (Rijndael) in C. + This file contains the definitions required to use AES in C. See aesopt.h + for optimisation details. */ #ifndef _AES_H #define _AES_H -/* This include is used only to find 8 and 32 bit unsigned integer types */ +#include <stdlib.h> -#include "limits.h" - -#if UCHAR_MAX == 0xff /* an unsigned 8 bit type for internal AES use */ - typedef unsigned char aes_08t; -#else -#error Please define an unsigned 8 bit type in aes.h -#endif - -#if UINT_MAX == 0xffffffff /* an unsigned 32 bit type for internal AES use */ - typedef unsigned int aes_32t; -#elif ULONG_MAX == 0xffffffff - typedef unsigned long aes_32t; -#else -#error Please define an unsigned 32 bit type in aes.h -#endif - -/* BLOCK_SIZE is in BYTES: 16, 24, 32 or undefined for aes.c and 16, 20, - 24, 28, 32 or undefined for aespp.c. When left undefined a slower - version that provides variable block length is compiled.
-*/ - -#define BLOCK_SIZE 16 - -/* key schedule length (in 32-bit words) */ - -#if !defined(BLOCK_SIZE) -#define KS_LENGTH 128 -#else -#define KS_LENGTH 4 * BLOCK_SIZE -#endif +/* This include is used to find 8 & 32 bit unsigned integer types */ +#include "brg_types.h" #if defined(__cplusplus) extern "C" { #endif -typedef unsigned int aes_fret; /* type for function return value */ -#define aes_bad 0 /* bad function return value */ -#define aes_good 1 /* good function return value */ -#ifndef AES_DLL /* implement normal or DLL functions */ -#define aes_rval aes_fret +// #define AES_128 /* if a fast 128 bit key scheduler is needed */ +// #define AES_192 /* if a fast 192 bit key scheduler is needed */ +#define AES_256 /* if a fast 256 bit key scheduler is needed */ +// #define AES_VAR /* if variable key size scheduler is needed */ +#define AES_MODES /* if support is needed for modes */ + +/* The following must also be set in assembler files if being used */ + +#define AES_ENCRYPT /* if support for encryption is needed */ +#define AES_DECRYPT /* if support for decryption is needed */ + +#define AES_BLOCK_SIZE 16 /* the AES block size in bytes */ +#define N_COLS 4 /* the number of columns in the state */ + +/* The key schedule length is 11, 13 or 15 16-byte blocks for 128, */ +/* 192 or 256-bit keys respectively. That is 176, 208 or 240 bytes */ +/* or 44, 52 or 60 32-bit words. */ + +#if defined( AES_VAR ) || defined( AES_256 ) +#define KS_LENGTH 60 +#elif defined( AES_192 ) +#define KS_LENGTH 52 #else -#define aes_rval aes_fret __declspec(dllexport) _stdcall +#define KS_LENGTH 44 #endif +#define AES_RETURN INT_RETURN -typedef struct /* the AES context for encryption */ -{ aes_32t k_sch[KS_LENGTH]; /* the encryption key schedule */ - aes_32t n_rnd; /* the number of cipher rounds */ - aes_32t n_blk; /* the number of bytes in the state */ -} aes_ctx; +/* the character array 'inf' in the following structures is used */ +/* to hold AES context information. This AES code uses cx->inf.b[0] */ +/* to hold the number of rounds multiplied by 16. 
The other three */ +/* elements can be used by code that implements additional modes */ -#if !defined(BLOCK_SIZE) -aes_rval aes_blk_len(unsigned int blen, aes_ctx cx[1]); +typedef union +{ uint32_t l; + uint8_t b[4]; +} aes_inf; + +#ifdef _WIN64 +__declspec(align(16)) +#endif +typedef struct +{ uint32_t ks[KS_LENGTH]; + aes_inf inf; +} aes_encrypt_ctx; + +#ifdef _WIN64 +__declspec(align(16)) +#endif +typedef struct +{ uint32_t ks[KS_LENGTH]; + aes_inf inf; +} aes_decrypt_ctx; + +/* This routine must be called before first use if non-static */ +/* tables are being used */ + +AES_RETURN aes_init(void); + +/* Key lengths in the range 16 <= key_len <= 32 are given in bytes, */ +/* those in the range 128 <= key_len <= 256 are given in bits */ + +#if defined( AES_ENCRYPT ) + +#if defined( AES_128 ) || defined( AES_VAR) +AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); #endif -aes_rval aes_enc_key(const unsigned char in_key[], unsigned int klen, aes_ctx cx[1]); -aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]); +#if defined( AES_192 ) || defined( AES_VAR) +AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); +#endif -aes_rval aes_dec_key(const unsigned char in_key[], unsigned int klen, aes_ctx cx[1]); -aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]); +#if defined( AES_256 ) || defined( AES_VAR) +AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); +#endif + +#if defined( AES_VAR ) +AES_RETURN aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]); +#endif + +AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]); + +#endif + +#if defined( AES_DECRYPT ) + +#if defined( AES_128 ) || defined( AES_VAR) +AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); +#endif + +#if defined( AES_192 ) || defined( AES_VAR) +AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); +#endif + +#if defined( AES_256 ) || defined( AES_VAR) +AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); +#endif + +#if defined( AES_VAR ) +AES_RETURN aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]); +#endif + +AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]); + +#endif + +#if defined( AES_MODES ) + +/* Multiple calls to the following subroutines for multiple block */ +/* ECB, CBC, CFB, OFB and CTR mode encryption can be used to handle */ +/* long messages incrementally provided that the context AND the iv */ +/* are preserved between all such calls. For the ECB and CBC modes */ +/* each individual call within a series of incremental calls must */ +/* process only full blocks (i.e. len must be a multiple of 16) but */ +/* the CFB, OFB and CTR mode calls can handle multiple incremental */ +/* calls of any length. Each mode is reset when a new AES key is */ +/* set but ECB and CBC operations can be reset without setting a */ +/* new key by setting a new IV value. To reset CFB, OFB and CTR */ +/* without setting the key, aes_mode_reset() must be called and the */ +/* IV must be set. NOTE: All these calls update the IV on exit so */ +/* this has to be reset if a new operation with the same IV as the */ +/* previous one is required (or decryption follows encryption with */ +/* the same IV array). 
*/ + +AES_RETURN aes_test_alignment_detection(unsigned int n); + +AES_RETURN aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_encrypt_ctx cx[1]); + +AES_RETURN aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_decrypt_ctx cx[1]); + +AES_RETURN aes_cbc_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, const aes_encrypt_ctx cx[1]); + +AES_RETURN aes_cbc_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, const aes_decrypt_ctx cx[1]); + +AES_RETURN aes_mode_reset(aes_encrypt_ctx cx[1]); + +AES_RETURN aes_cfb_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, aes_encrypt_ctx cx[1]); + +AES_RETURN aes_cfb_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, aes_encrypt_ctx cx[1]); + +#define aes_ofb_encrypt aes_ofb_crypt +#define aes_ofb_decrypt aes_ofb_crypt + +AES_RETURN aes_ofb_crypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, aes_encrypt_ctx cx[1]); + +typedef void cbuf_inc(unsigned char *cbuf); + +#define aes_ctr_encrypt aes_ctr_crypt +#define aes_ctr_decrypt aes_ctr_crypt + +AES_RETURN aes_ctr_crypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *cbuf, cbuf_inc ctr_inc, aes_encrypt_ctx cx[1]); + +#endif #if defined(__cplusplus) }
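For orientation, a minimal caller-side sketch of the incremental-mode rules in the header comment above (illustrative only, not part of this diff; the helper names cbc_two_calls and ctr_inc_be are hypothetical, and the AES_256/AES_MODES configuration selected earlier in aes.h is assumed):

#include <stdlib.h>
#include "aes.h"

/* Incremental CBC: the context and the caller-held iv carry the chaining
   state between calls, so two 32-byte calls produce the same output as a
   single 64-byte call. ECB/CBC lengths must be multiples of 16 bytes. */
static int cbc_two_calls(const unsigned char key[32], unsigned char iv[16],
                         unsigned char buf[64])
{
    aes_encrypt_ctx ctx[1];

    if(aes_init() != EXIT_SUCCESS ||              /* builds non-static tables */
       aes_encrypt_key256(key, ctx) != EXIT_SUCCESS)
        return EXIT_FAILURE;

    if(aes_cbc_encrypt(buf,      buf,      32, iv, ctx) != EXIT_SUCCESS ||
       aes_cbc_encrypt(buf + 32, buf + 32, 32, iv, ctx) != EXIT_SUCCESS)
        return EXIT_FAILURE;
    return EXIT_SUCCESS;                  /* iv now holds the last ciphertext block */
}

/* One possible cbuf_inc callback for aes_ctr_crypt: a big-endian increment
   of the 16-byte counter block (the counter layout is the caller's choice). */
static void ctr_inc_be(unsigned char *cbuf)
{
    int i = AES_BLOCK_SIZE;
    while(i-- && ++cbuf[i] == 0)
        ;                                 /* propagate the carry */
}

To start a fresh CFB, OFB or CTR operation under the same key, aes_mode_reset() plus a new IV is required, exactly as the note above states.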
diff --git a/aes_modes.c b/aes_modes.c new file mode 100644 index 0000000000..a6e37ed801 --- /dev/null +++ b/aes_modes.c @@ -0,0 +1,947 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption. The code provides support for the VIA Advanced + Cryptography Engine (ACE). + + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + +#include <string.h> +#include <stdlib.h> +#include <assert.h> + +#include "aesopt.h" + +#if defined( AES_MODES ) +#if defined(__cplusplus) +extern "C" +{ +#endif + +#if defined( _MSC_VER ) && ( _MSC_VER > 800 ) +#pragma intrinsic(memcpy) +#endif + +#define BFR_BLOCKS 8 + +/* These values are used to detect long word alignment in order to */ +/* speed up some buffer operations. This facility may not work on */ +/* some machines so this define can be commented out if necessary */ + +#define FAST_BUFFER_OPERATIONS + +#define lp32(x) ((uint32_t*)(x)) + +#if defined( USE_VIA_ACE_IF_PRESENT ) + +#include "aes_via_ace.h" + +#pragma pack(16) + +aligned_array(unsigned long, enc_gen_table, 12, 16) = NEH_ENC_GEN_DATA; +aligned_array(unsigned long, enc_load_table, 12, 16) = NEH_ENC_LOAD_DATA; +aligned_array(unsigned long, enc_hybrid_table, 12, 16) = NEH_ENC_HYBRID_DATA; +aligned_array(unsigned long, dec_gen_table, 12, 16) = NEH_DEC_GEN_DATA; +aligned_array(unsigned long, dec_load_table, 12, 16) = NEH_DEC_LOAD_DATA; +aligned_array(unsigned long, dec_hybrid_table, 12, 16) = NEH_DEC_HYBRID_DATA; + +/* NOTE: These control word macros must only be used after */ +/* a key has been set up because they depend on key size */ +/* See the VIA ACE documentation for key type information */ +/* and aes_via_ace.h for non-default NEH_KEY_TYPE values */ + +#ifndef NEH_KEY_TYPE +# define NEH_KEY_TYPE NEH_HYBRID +#endif + +#if NEH_KEY_TYPE == NEH_LOAD +#define kd_adr(c) ((uint8_t*)(c)->ks) +#elif NEH_KEY_TYPE == NEH_GENERATE +#define kd_adr(c) ((uint8_t*)(c)->ks + (c)->inf.b[0]) +#elif NEH_KEY_TYPE == NEH_HYBRID +#define kd_adr(c) ((uint8_t*)(c)->ks + ((c)->inf.b[0] == 160 ? 160 : 0)) +#else +#error no key type defined for VIA ACE +#endif + +#else + +#define aligned_array(type, name, no, stride) type name[no] +#define aligned_auto(type, name, no, stride) type name[no] + +#endif + +#if defined( _MSC_VER ) && _MSC_VER > 1200 + +#define via_cwd(cwd, ty, dir, len) \ + unsigned long* cwd = (dir##_##ty##_table + ((len - 128) >> 4)) + +#else + +#define via_cwd(cwd, ty, dir, len) \ + aligned_auto(unsigned long, cwd, 4, 16); \ + cwd[1] = cwd[2] = cwd[3] = 0; \ + cwd[0] = neh_##dir##_##ty##_key(len) + +#endif + +/* test the code for detecting and setting pointer alignment */ + +AES_RETURN aes_test_alignment_detection(unsigned int n) /* 4 <= n <= 16 */ +{ uint8_t p[16]; + uint32_t i, count_eq = 0, count_neq = 0; + + if(n < 4 || n > 16) + return EXIT_FAILURE; + + for(i = 0; i < n; ++i) + { + uint8_t *qf = ALIGN_FLOOR(p + i, n), + *qh = ALIGN_CEIL(p + i, n); + + if(qh == qf) + ++count_eq; + else if(qh == qf + n) + ++count_neq; + else + return EXIT_FAILURE; + } + return (count_eq != 1 || count_neq != n - 1 ? EXIT_FAILURE : EXIT_SUCCESS); +} + +AES_RETURN aes_mode_reset(aes_encrypt_ctx ctx[1]) +{ + ctx->inf.b[2] = 0; + return EXIT_SUCCESS; +} + +AES_RETURN aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_encrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) + return EXIT_FAILURE; + +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { uint8_t *ksp = (uint8_t*)(ctx->ks); + via_cwd(cwd, hybrid, enc, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 )) + { + via_ecb_op5(ksp, cwd, ibuf, obuf, nb); + } + else + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ?
buf : obuf); + + if(ip != ibuf) + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_ecb_op5(ksp, cwd, ip, op, m); + + if(op != obuf) + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + nb -= m; + } + } + + return EXIT_SUCCESS; + } + +#endif + +#if !defined( ASSUME_VIA_ACE_PRESENT ) + while(nb--) + { + if(aes_encrypt(ibuf, obuf, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } +#endif + return EXIT_SUCCESS; +} + +AES_RETURN aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_decrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) + return EXIT_FAILURE; + +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { uint8_t *ksp = kd_adr(ctx); + via_cwd(cwd, hybrid, dec, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 )) + { + via_ecb_op5(ksp, cwd, ibuf, obuf, nb); + } + else + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ? buf : obuf); + + if(ip != ibuf) + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_ecb_op5(ksp, cwd, ip, op, m); + + if(op != obuf) + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + nb -= m; + } + } + + return EXIT_SUCCESS; + } + +#endif + +#if !defined( ASSUME_VIA_ACE_PRESENT ) + while(nb--) + { + if(aes_decrypt(ibuf, obuf, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } +#endif + return EXIT_SUCCESS; +} + +AES_RETURN aes_cbc_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, const aes_encrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) + return EXIT_FAILURE; + +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { uint8_t *ksp = (uint8_t*)(ctx->ks), *ivp = iv; + aligned_auto(uint8_t, liv, AES_BLOCK_SIZE, 16); + via_cwd(cwd, hybrid, enc, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(ALIGN_OFFSET( iv, 16 )) /* ensure an aligned iv */ + { + ivp = liv; + memcpy(liv, iv, AES_BLOCK_SIZE); + } + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 ) && !ALIGN_OFFSET( iv, 16 )) + { + via_cbc_op7(ksp, cwd, ibuf, obuf, nb, ivp, ivp); + } + else + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ? 
buf : obuf); + + if(ip != ibuf) + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_cbc_op7(ksp, cwd, ip, op, m, ivp, ivp); + + if(op != obuf) + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + nb -= m; + } + } + + if(iv != ivp) + memcpy(iv, ivp, AES_BLOCK_SIZE); + + return EXIT_SUCCESS; + } + +#endif + +#if !defined( ASSUME_VIA_ACE_PRESENT ) +# ifdef FAST_BUFFER_OPERATIONS + if(!ALIGN_OFFSET( ibuf, 4 ) && !ALIGN_OFFSET( iv, 4 )) + while(nb--) + { + lp32(iv)[0] ^= lp32(ibuf)[0]; + lp32(iv)[1] ^= lp32(ibuf)[1]; + lp32(iv)[2] ^= lp32(ibuf)[2]; + lp32(iv)[3] ^= lp32(ibuf)[3]; + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + memcpy(obuf, iv, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + else +# endif + while(nb--) + { + iv[ 0] ^= ibuf[ 0]; iv[ 1] ^= ibuf[ 1]; + iv[ 2] ^= ibuf[ 2]; iv[ 3] ^= ibuf[ 3]; + iv[ 4] ^= ibuf[ 4]; iv[ 5] ^= ibuf[ 5]; + iv[ 6] ^= ibuf[ 6]; iv[ 7] ^= ibuf[ 7]; + iv[ 8] ^= ibuf[ 8]; iv[ 9] ^= ibuf[ 9]; + iv[10] ^= ibuf[10]; iv[11] ^= ibuf[11]; + iv[12] ^= ibuf[12]; iv[13] ^= ibuf[13]; + iv[14] ^= ibuf[14]; iv[15] ^= ibuf[15]; + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + memcpy(obuf, iv, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } +#endif + return EXIT_SUCCESS; +} + +AES_RETURN aes_cbc_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, const aes_decrypt_ctx ctx[1]) +{ unsigned char tmp[AES_BLOCK_SIZE]; + int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) + return EXIT_FAILURE; + +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { uint8_t *ksp = kd_adr(ctx), *ivp = iv; + aligned_auto(uint8_t, liv, AES_BLOCK_SIZE, 16); + via_cwd(cwd, hybrid, dec, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(ALIGN_OFFSET( iv, 16 )) /* ensure an aligned iv */ + { + ivp = liv; + memcpy(liv, iv, AES_BLOCK_SIZE); + } + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 ) && !ALIGN_OFFSET( iv, 16 )) + { + via_cbc_op6(ksp, cwd, ibuf, obuf, nb, ivp); + } + else + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ? 
buf : obuf); + + if(ip != ibuf) + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_cbc_op6(ksp, cwd, ip, op, m, ivp); + + if(op != obuf) + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + nb -= m; + } + } + + if(iv != ivp) + memcpy(iv, ivp, AES_BLOCK_SIZE); + + return EXIT_SUCCESS; + } +#endif + +#if !defined( ASSUME_VIA_ACE_PRESENT ) +# ifdef FAST_BUFFER_OPERATIONS + if(!ALIGN_OFFSET( obuf, 4 ) && !ALIGN_OFFSET( iv, 4 )) + while(nb--) + { + memcpy(tmp, ibuf, AES_BLOCK_SIZE); + if(aes_decrypt(ibuf, obuf, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + lp32(obuf)[0] ^= lp32(iv)[0]; + lp32(obuf)[1] ^= lp32(iv)[1]; + lp32(obuf)[2] ^= lp32(iv)[2]; + lp32(obuf)[3] ^= lp32(iv)[3]; + memcpy(iv, tmp, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + else +# endif + while(nb--) + { + memcpy(tmp, ibuf, AES_BLOCK_SIZE); + if(aes_decrypt(ibuf, obuf, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + obuf[ 0] ^= iv[ 0]; obuf[ 1] ^= iv[ 1]; + obuf[ 2] ^= iv[ 2]; obuf[ 3] ^= iv[ 3]; + obuf[ 4] ^= iv[ 4]; obuf[ 5] ^= iv[ 5]; + obuf[ 6] ^= iv[ 6]; obuf[ 7] ^= iv[ 7]; + obuf[ 8] ^= iv[ 8]; obuf[ 9] ^= iv[ 9]; + obuf[10] ^= iv[10]; obuf[11] ^= iv[11]; + obuf[12] ^= iv[12]; obuf[13] ^= iv[13]; + obuf[14] ^= iv[14]; obuf[15] ^= iv[15]; + memcpy(iv, tmp, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } +#endif + return EXIT_SUCCESS; +} + +AES_RETURN aes_cfb_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, aes_encrypt_ctx ctx[1]) +{ int cnt = 0, b_pos = (int)ctx->inf.b[2], nb; + + if(b_pos) /* complete any partial block */ + { + while(b_pos < AES_BLOCK_SIZE && cnt < len) + { + *obuf++ = (iv[b_pos++] ^= *ibuf++); + cnt++; + } + + b_pos = (b_pos == AES_BLOCK_SIZE ? 0 : b_pos); + } + + if((nb = (len - cnt) >> 4) != 0) /* process whole blocks */ + { +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { int m; + uint8_t *ksp = (uint8_t*)(ctx->ks), *ivp = iv; + aligned_auto(uint8_t, liv, AES_BLOCK_SIZE, 16); + via_cwd(cwd, hybrid, enc, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(ALIGN_OFFSET( iv, 16 )) /* ensure an aligned iv */ + { + ivp = liv; + memcpy(liv, iv, AES_BLOCK_SIZE); + } + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 )) + { + via_cfb_op7(ksp, cwd, ibuf, obuf, nb, ivp, ivp); + ibuf += nb * AES_BLOCK_SIZE; + obuf += nb * AES_BLOCK_SIZE; + cnt += nb * AES_BLOCK_SIZE; + } + else /* input, output or both are unaligned */ + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb), nb -= m; + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ? 
buf : obuf); + + if(ip != ibuf) + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_cfb_op7(ksp, cwd, ip, op, m, ivp, ivp); + + if(op != obuf) + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + cnt += m * AES_BLOCK_SIZE; + } + } + + if(ivp != iv) + memcpy(iv, ivp, AES_BLOCK_SIZE); + } +#else +# ifdef FAST_BUFFER_OPERATIONS + if(!ALIGN_OFFSET( ibuf, 4 ) && !ALIGN_OFFSET( obuf, 4 ) && !ALIGN_OFFSET( iv, 4 )) + while(cnt + AES_BLOCK_SIZE <= len) + { + assert(b_pos == 0); + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + lp32(obuf)[0] = lp32(iv)[0] ^= lp32(ibuf)[0]; + lp32(obuf)[1] = lp32(iv)[1] ^= lp32(ibuf)[1]; + lp32(obuf)[2] = lp32(iv)[2] ^= lp32(ibuf)[2]; + lp32(obuf)[3] = lp32(iv)[3] ^= lp32(ibuf)[3]; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + cnt += AES_BLOCK_SIZE; + } + else +# endif + while(cnt + AES_BLOCK_SIZE <= len) + { + assert(b_pos == 0); + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + obuf[ 0] = iv[ 0] ^= ibuf[ 0]; obuf[ 1] = iv[ 1] ^= ibuf[ 1]; + obuf[ 2] = iv[ 2] ^= ibuf[ 2]; obuf[ 3] = iv[ 3] ^= ibuf[ 3]; + obuf[ 4] = iv[ 4] ^= ibuf[ 4]; obuf[ 5] = iv[ 5] ^= ibuf[ 5]; + obuf[ 6] = iv[ 6] ^= ibuf[ 6]; obuf[ 7] = iv[ 7] ^= ibuf[ 7]; + obuf[ 8] = iv[ 8] ^= ibuf[ 8]; obuf[ 9] = iv[ 9] ^= ibuf[ 9]; + obuf[10] = iv[10] ^= ibuf[10]; obuf[11] = iv[11] ^= ibuf[11]; + obuf[12] = iv[12] ^= ibuf[12]; obuf[13] = iv[13] ^= ibuf[13]; + obuf[14] = iv[14] ^= ibuf[14]; obuf[15] = iv[15] ^= ibuf[15]; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + cnt += AES_BLOCK_SIZE; + } +#endif + } + + while(cnt < len) + { + if(!b_pos && aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + + while(cnt < len && b_pos < AES_BLOCK_SIZE) + { + *obuf++ = (iv[b_pos++] ^= *ibuf++); + cnt++; + } + + b_pos = (b_pos == AES_BLOCK_SIZE ? 0 : b_pos); + } + + ctx->inf.b[2] = (uint8_t)b_pos; + return EXIT_SUCCESS; +} + +AES_RETURN aes_cfb_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, aes_encrypt_ctx ctx[1]) +{ int cnt = 0, b_pos = (int)ctx->inf.b[2], nb; + + if(b_pos) /* complete any partial block */ + { uint8_t t; + + while(b_pos < AES_BLOCK_SIZE && cnt < len) + { + t = *ibuf++; + *obuf++ = t ^ iv[b_pos]; + iv[b_pos++] = t; + cnt++; + } + + b_pos = (b_pos == AES_BLOCK_SIZE ? 0 : b_pos); + } + + if((nb = (len - cnt) >> 4) != 0) /* process whole blocks */ + { +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { int m; + uint8_t *ksp = (uint8_t*)(ctx->ks), *ivp = iv; + aligned_auto(uint8_t, liv, AES_BLOCK_SIZE, 16); + via_cwd(cwd, hybrid, dec, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(ALIGN_OFFSET( iv, 16 )) /* ensure an aligned iv */ + { + ivp = liv; + memcpy(liv, iv, AES_BLOCK_SIZE); + } + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 )) + { + via_cfb_op6(ksp, cwd, ibuf, obuf, nb, ivp); + ibuf += nb * AES_BLOCK_SIZE; + obuf += nb * AES_BLOCK_SIZE; + cnt += nb * AES_BLOCK_SIZE; + } + else /* input, output or both are unaligned */ + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb), nb -= m; + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ? 
buf : obuf); + + if(ip != ibuf) /* input buffer is not aligned */ + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_cfb_op6(ksp, cwd, ip, op, m, ivp); + + if(op != obuf) /* output buffer is not aligned */ + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + cnt += m * AES_BLOCK_SIZE; + } + } + + if(ivp != iv) + memcpy(iv, ivp, AES_BLOCK_SIZE); + } +#else +# ifdef FAST_BUFFER_OPERATIONS + if(!ALIGN_OFFSET( ibuf, 4 ) && !ALIGN_OFFSET( obuf, 4 ) &&!ALIGN_OFFSET( iv, 4 )) + while(cnt + AES_BLOCK_SIZE <= len) + { uint32_t t; + + assert(b_pos == 0); + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + t = lp32(ibuf)[0], lp32(obuf)[0] = t ^ lp32(iv)[0], lp32(iv)[0] = t; + t = lp32(ibuf)[1], lp32(obuf)[1] = t ^ lp32(iv)[1], lp32(iv)[1] = t; + t = lp32(ibuf)[2], lp32(obuf)[2] = t ^ lp32(iv)[2], lp32(iv)[2] = t; + t = lp32(ibuf)[3], lp32(obuf)[3] = t ^ lp32(iv)[3], lp32(iv)[3] = t; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + cnt += AES_BLOCK_SIZE; + } + else +# endif + while(cnt + AES_BLOCK_SIZE <= len) + { uint8_t t; + + assert(b_pos == 0); + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + t = ibuf[ 0], obuf[ 0] = t ^ iv[ 0], iv[ 0] = t; + t = ibuf[ 1], obuf[ 1] = t ^ iv[ 1], iv[ 1] = t; + t = ibuf[ 2], obuf[ 2] = t ^ iv[ 2], iv[ 2] = t; + t = ibuf[ 3], obuf[ 3] = t ^ iv[ 3], iv[ 3] = t; + t = ibuf[ 4], obuf[ 4] = t ^ iv[ 4], iv[ 4] = t; + t = ibuf[ 5], obuf[ 5] = t ^ iv[ 5], iv[ 5] = t; + t = ibuf[ 6], obuf[ 6] = t ^ iv[ 6], iv[ 6] = t; + t = ibuf[ 7], obuf[ 7] = t ^ iv[ 7], iv[ 7] = t; + t = ibuf[ 8], obuf[ 8] = t ^ iv[ 8], iv[ 8] = t; + t = ibuf[ 9], obuf[ 9] = t ^ iv[ 9], iv[ 9] = t; + t = ibuf[10], obuf[10] = t ^ iv[10], iv[10] = t; + t = ibuf[11], obuf[11] = t ^ iv[11], iv[11] = t; + t = ibuf[12], obuf[12] = t ^ iv[12], iv[12] = t; + t = ibuf[13], obuf[13] = t ^ iv[13], iv[13] = t; + t = ibuf[14], obuf[14] = t ^ iv[14], iv[14] = t; + t = ibuf[15], obuf[15] = t ^ iv[15], iv[15] = t; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + cnt += AES_BLOCK_SIZE; + } +#endif + } + + while(cnt < len) + { uint8_t t; + + if(!b_pos && aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + + while(cnt < len && b_pos < AES_BLOCK_SIZE) + { + t = *ibuf++; + *obuf++ = t ^ iv[b_pos]; + iv[b_pos++] = t; + cnt++; + } + + b_pos = (b_pos == AES_BLOCK_SIZE ? 0 : b_pos); + } + + ctx->inf.b[2] = (uint8_t)b_pos; + return EXIT_SUCCESS; +} + +AES_RETURN aes_ofb_crypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *iv, aes_encrypt_ctx ctx[1]) +{ int cnt = 0, b_pos = (int)ctx->inf.b[2], nb; + + if(b_pos) /* complete any partial block */ + { + while(b_pos < AES_BLOCK_SIZE && cnt < len) + { + *obuf++ = iv[b_pos++] ^ *ibuf++; + cnt++; + } + + b_pos = (b_pos == AES_BLOCK_SIZE ? 
0 : b_pos); + } + + if((nb = (len - cnt) >> 4) != 0) /* process whole blocks */ + { +#if defined( USE_VIA_ACE_IF_PRESENT ) + + if(ctx->inf.b[1] == 0xff) + { int m; + uint8_t *ksp = (uint8_t*)(ctx->ks), *ivp = iv; + aligned_auto(uint8_t, liv, AES_BLOCK_SIZE, 16); + via_cwd(cwd, hybrid, enc, 2 * ctx->inf.b[0] - 192); + + if(ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; + + if(ALIGN_OFFSET( iv, 16 )) /* ensure an aligned iv */ + { + ivp = liv; + memcpy(liv, iv, AES_BLOCK_SIZE); + } + + if(!ALIGN_OFFSET( ibuf, 16 ) && !ALIGN_OFFSET( obuf, 16 )) + { + via_ofb_op6(ksp, cwd, ibuf, obuf, nb, ivp); + ibuf += nb * AES_BLOCK_SIZE; + obuf += nb * AES_BLOCK_SIZE; + cnt += nb * AES_BLOCK_SIZE; + } + else /* input, output or both are unaligned */ + { aligned_auto(uint8_t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); + uint8_t *ip, *op; + + while(nb) + { + m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb), nb -= m; + + ip = (ALIGN_OFFSET( ibuf, 16 ) ? buf : ibuf); + op = (ALIGN_OFFSET( obuf, 16 ) ? buf : obuf); + + if(ip != ibuf) + memcpy(buf, ibuf, m * AES_BLOCK_SIZE); + + via_ofb_op6(ksp, cwd, ip, op, m, ivp); + + if(op != obuf) + memcpy(obuf, buf, m * AES_BLOCK_SIZE); + + ibuf += m * AES_BLOCK_SIZE; + obuf += m * AES_BLOCK_SIZE; + cnt += m * AES_BLOCK_SIZE; + } + } + + if(ivp != iv) + memcpy(iv, ivp, AES_BLOCK_SIZE); + } +#else +# ifdef FAST_BUFFER_OPERATIONS + if(!ALIGN_OFFSET( ibuf, 4 ) && !ALIGN_OFFSET( obuf, 4 ) && !ALIGN_OFFSET( iv, 4 )) + while(cnt + AES_BLOCK_SIZE <= len) + { + assert(b_pos == 0); + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + lp32(obuf)[0] = lp32(iv)[0] ^ lp32(ibuf)[0]; + lp32(obuf)[1] = lp32(iv)[1] ^ lp32(ibuf)[1]; + lp32(obuf)[2] = lp32(iv)[2] ^ lp32(ibuf)[2]; + lp32(obuf)[3] = lp32(iv)[3] ^ lp32(ibuf)[3]; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + cnt += AES_BLOCK_SIZE; + } + else +# endif + while(cnt + AES_BLOCK_SIZE <= len) + { + assert(b_pos == 0); + if(aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + obuf[ 0] = iv[ 0] ^ ibuf[ 0]; obuf[ 1] = iv[ 1] ^ ibuf[ 1]; + obuf[ 2] = iv[ 2] ^ ibuf[ 2]; obuf[ 3] = iv[ 3] ^ ibuf[ 3]; + obuf[ 4] = iv[ 4] ^ ibuf[ 4]; obuf[ 5] = iv[ 5] ^ ibuf[ 5]; + obuf[ 6] = iv[ 6] ^ ibuf[ 6]; obuf[ 7] = iv[ 7] ^ ibuf[ 7]; + obuf[ 8] = iv[ 8] ^ ibuf[ 8]; obuf[ 9] = iv[ 9] ^ ibuf[ 9]; + obuf[10] = iv[10] ^ ibuf[10]; obuf[11] = iv[11] ^ ibuf[11]; + obuf[12] = iv[12] ^ ibuf[12]; obuf[13] = iv[13] ^ ibuf[13]; + obuf[14] = iv[14] ^ ibuf[14]; obuf[15] = iv[15] ^ ibuf[15]; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + cnt += AES_BLOCK_SIZE; + } +#endif + } + + while(cnt < len) + { + if(!b_pos && aes_encrypt(iv, iv, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + + while(cnt < len && b_pos < AES_BLOCK_SIZE) + { + *obuf++ = iv[b_pos++] ^ *ibuf++; + cnt++; + } + + b_pos = (b_pos == AES_BLOCK_SIZE ? 
0 : b_pos); + } + + ctx->inf.b[2] = (uint8_t)b_pos; + return EXIT_SUCCESS; +} + +#define BFR_LENGTH (BFR_BLOCKS * AES_BLOCK_SIZE) + +AES_RETURN aes_ctr_crypt(const unsigned char *ibuf, unsigned char *obuf, + int len, unsigned char *cbuf, cbuf_inc ctr_inc, aes_encrypt_ctx ctx[1]) +{ unsigned char *ip; + int i, blen, b_pos = (int)(ctx->inf.b[2]); + +#if defined( USE_VIA_ACE_IF_PRESENT ) + aligned_auto(uint8_t, buf, BFR_LENGTH, 16); + if(ctx->inf.b[1] == 0xff && ALIGN_OFFSET( ctx, 16 )) + return EXIT_FAILURE; +#else + uint8_t buf[BFR_LENGTH]; +#endif + + if(b_pos) + { + memcpy(buf, cbuf, AES_BLOCK_SIZE); + if(aes_ecb_encrypt(buf, buf, AES_BLOCK_SIZE, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + + while(b_pos < AES_BLOCK_SIZE && len) + { + *obuf++ = *ibuf++ ^ buf[b_pos++]; + --len; + } + + if(len) + ctr_inc(cbuf), b_pos = 0; + } + + while(len) + { + blen = (len > BFR_LENGTH ? BFR_LENGTH : len), len -= blen; + + for(i = 0, ip = buf; i < (blen >> 4); ++i) + { + memcpy(ip, cbuf, AES_BLOCK_SIZE); + ctr_inc(cbuf); + ip += AES_BLOCK_SIZE; + } + + if(blen & (AES_BLOCK_SIZE - 1)) + memcpy(ip, cbuf, AES_BLOCK_SIZE), i++; + +#if defined( USE_VIA_ACE_IF_PRESENT ) + if(ctx->inf.b[1] == 0xff) + { + via_cwd(cwd, hybrid, enc, 2 * ctx->inf.b[0] - 192); + via_ecb_op5((ctx->ks), cwd, buf, buf, i); + } + else +#endif + if(aes_ecb_encrypt(buf, buf, i * AES_BLOCK_SIZE, ctx) != EXIT_SUCCESS) + return EXIT_FAILURE; + + i = 0; ip = buf; +# ifdef FAST_BUFFER_OPERATIONS + if(!ALIGN_OFFSET( ibuf, 4 ) && !ALIGN_OFFSET( obuf, 4 ) && !ALIGN_OFFSET( ip, 4 )) + while(i + AES_BLOCK_SIZE <= blen) + { + lp32(obuf)[0] = lp32(ibuf)[0] ^ lp32(ip)[0]; + lp32(obuf)[1] = lp32(ibuf)[1] ^ lp32(ip)[1]; + lp32(obuf)[2] = lp32(ibuf)[2] ^ lp32(ip)[2]; + lp32(obuf)[3] = lp32(ibuf)[3] ^ lp32(ip)[3]; + i += AES_BLOCK_SIZE; + ip += AES_BLOCK_SIZE; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + else +#endif + while(i + AES_BLOCK_SIZE <= blen) + { + obuf[ 0] = ibuf[ 0] ^ ip[ 0]; obuf[ 1] = ibuf[ 1] ^ ip[ 1]; + obuf[ 2] = ibuf[ 2] ^ ip[ 2]; obuf[ 3] = ibuf[ 3] ^ ip[ 3]; + obuf[ 4] = ibuf[ 4] ^ ip[ 4]; obuf[ 5] = ibuf[ 5] ^ ip[ 5]; + obuf[ 6] = ibuf[ 6] ^ ip[ 6]; obuf[ 7] = ibuf[ 7] ^ ip[ 7]; + obuf[ 8] = ibuf[ 8] ^ ip[ 8]; obuf[ 9] = ibuf[ 9] ^ ip[ 9]; + obuf[10] = ibuf[10] ^ ip[10]; obuf[11] = ibuf[11] ^ ip[11]; + obuf[12] = ibuf[12] ^ ip[12]; obuf[13] = ibuf[13] ^ ip[13]; + obuf[14] = ibuf[14] ^ ip[14]; obuf[15] = ibuf[15] ^ ip[15]; + i += AES_BLOCK_SIZE; + ip += AES_BLOCK_SIZE; + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + + while(i++ < blen) + *obuf++ = *ibuf++ ^ ip[b_pos++]; + } + + ctx->inf.b[2] = (uint8_t)b_pos; + return EXIT_SUCCESS; +} + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/aescrypt.c b/aescrypt.c index a832c63681..83c77f0b66 100644 --- a/aescrypt.c +++ b/aescrypt.c @@ -1,421 +1,301 @@ /* - ------------------------------------------------------------------------- - Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. - All rights reserved. +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. 
- LICENSE TERMS +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and fitness for purpose. - ------------------------------------------------------------------------- - Issue Date: 29/07/2002 - - This file contains the code for implementing encryption and decryption - for AES (Rijndael) for block and key sizes of 16, 24 and 32 bytes. It - can optionally be replaced by code written in assembler using NASM. +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 */ #include "aesopt.h" +#include "aestab.h" -#if defined(BLOCK_SIZE) && (BLOCK_SIZE & 7) -#error An illegal block size has been specified. 
+#if defined( USE_INTEL_AES_IF_PRESENT ) +# include "aes_ni.h" +#else +/* map names here to provide the external API ('name' -> 'aes_name') */ +# define aes_xi(x) aes_ ## x #endif -#define unused 77 /* Sunset Strip */ +#if defined(__cplusplus) +extern "C" +{ +#endif -#define si(y,x,k,c) s(y,c) = word_in(x + 4 * c) ^ k[c] -#define so(y,x,c) word_out(y + 4 * c, s(x,c)) - -#if BLOCK_SIZE == 16 +#define si(y,x,k,c) (s(y,c) = word_in(x, c) ^ (k)[c]) +#define so(y,x,c) word_out(y, c, s(x,c)) #if defined(ARRAYS) #define locals(y,x) x[4],y[4] #else #define locals(y,x) x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3 - /* - the following defines prevent the compiler requiring the declaration - of generated but unused variables in the fwd_var and inv_var macros - */ -#define b04 unused -#define b05 unused -#define b06 unused -#define b07 unused -#define b14 unused -#define b15 unused -#define b16 unused -#define b17 unused #endif + #define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ s(y,2) = s(x,2); s(y,3) = s(x,3); #define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3) #define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) #define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3) -#elif BLOCK_SIZE == 24 +#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) -#if defined(ARRAYS) -#define locals(y,x) x[6],y[6] -#else -#define locals(y,x) x##0,x##1,x##2,x##3,x##4,x##5, \ - y##0,y##1,y##2,y##3,y##4,y##5 -#define b06 unused -#define b07 unused -#define b16 unused -#define b17 unused -#endif -#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ - s(y,2) = s(x,2); s(y,3) = s(x,3); \ - s(y,4) = s(x,4); s(y,5) = s(x,5); -#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); \ - si(y,x,k,3); si(y,x,k,4); si(y,x,k,5) -#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); \ - so(y,x,3); so(y,x,4); so(y,x,5) -#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); \ - rm(y,x,k,3); rm(y,x,k,4); rm(y,x,k,5) -#else - -#if defined(ARRAYS) -#define locals(y,x) x[8],y[8] -#else -#define locals(y,x) x##0,x##1,x##2,x##3,x##4,x##5,x##6,x##7, \ - y##0,y##1,y##2,y##3,y##4,y##5,y##6,y##7 -#endif -#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ - s(y,2) = s(x,2); s(y,3) = s(x,3); \ - s(y,4) = s(x,4); s(y,5) = s(x,5); \ - s(y,6) = s(x,6); s(y,7) = s(x,7); - -#if BLOCK_SIZE == 32 - -#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3); \ - si(y,x,k,4); si(y,x,k,5); si(y,x,k,6); si(y,x,k,7) -#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3); \ - so(y,x,4); so(y,x,5); so(y,x,6); so(y,x,7) -#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3); \ - rm(y,x,k,4); rm(y,x,k,5); rm(y,x,k,6); rm(y,x,k,7) -#else - -#define state_in(y,x,k) \ -switch(nc) \ -{ case 8: si(y,x,k,7); si(y,x,k,6); \ - case 6: si(y,x,k,5); si(y,x,k,4); \ - case 4: si(y,x,k,3); si(y,x,k,2); \ - si(y,x,k,1); si(y,x,k,0); \ -} - -#define state_out(y,x) \ -switch(nc) \ -{ case 8: so(y,x,7); so(y,x,6); \ - case 6: so(y,x,5); so(y,x,4); \ - case 4: so(y,x,3); so(y,x,2); \ - so(y,x,1); so(y,x,0); \ -} - -#if defined(FAST_VARIABLE) - -#define round(rm,y,x,k) \ -switch(nc) \ -{ case 8: rm(y,x,k,7); rm(y,x,k,6); \ - rm(y,x,k,5); rm(y,x,k,4); \ - rm(y,x,k,3); rm(y,x,k,2); \ - rm(y,x,k,1); rm(y,x,k,0); \ - break; \ - case 6: rm(y,x,k,5); rm(y,x,k,4); \ - rm(y,x,k,3); rm(y,x,k,2); \ - rm(y,x,k,1); rm(y,x,k,0); \ - break; \ - case 4: rm(y,x,k,3); rm(y,x,k,2); \ - rm(y,x,k,1); rm(y,x,k,0); \ - break; \ -} -#else - -#define round(rm,y,x,k) \ -switch(nc) \ -{ 
case 8: rm(y,x,k,7); rm(y,x,k,6); \ - case 6: rm(y,x,k,5); rm(y,x,k,4); \ - case 4: rm(y,x,k,3); rm(y,x,k,2); \ - rm(y,x,k,1); rm(y,x,k,0); \ -} - -#endif - -#endif -#endif - -#if defined(ENCRYPTION) - -/* I am grateful to Frank Yellin for the following construction - (and that for decryption) which, given the column (c) of the - output state variable, gives the input state variables which - are needed in its computation for each row (r) of the state. - - For the fixed block size options, compilers should be able to - reduce this complex expression (and the equivalent one for - decryption) to a static variable reference at compile time. - But for variable block size code, there will be some limbs on - which conditional clauses will be returned. +/* Visual C++ .Net v7.1 provides the fastest encryption code when using + Pentium optimiation with small code but this is poor for decryption + so we need to control this with the following VC++ pragmas */ -/* y = output word, x = input word, r = row, c = column for r = 0, - 1, 2 and 3 = column accessed for row r. +#if defined( _MSC_VER ) && !defined( _WIN64 ) +#pragma optimize( "s", on ) +#endif + +/* Given the column (c) of the output state variable, the following + macros give the input state variables which are needed in its + computation for each row (r) of the state. All the alternative + macros give the same end values but expand into different ways + of calculating these values. In particular the complex macro + used for dynamically variable block sizes is designed to expand + to a compile time constant whenever possible but will expand to + conditional clauses on some branches (I am grateful to Frank + Yellin for this construction) */ #define fwd_var(x,r,c)\ - ( r == 0 ? \ - ( c == 0 ? s(x,0) \ - : c == 1 ? s(x,1) \ - : c == 2 ? s(x,2) \ - : c == 3 ? s(x,3) \ - : c == 4 ? s(x,4) \ - : c == 5 ? s(x,5) \ - : c == 6 ? s(x,6) \ - : s(x,7))\ - : r == 1 ? \ - ( c == 0 ? s(x,1) \ - : c == 1 ? s(x,2) \ - : c == 2 ? s(x,3) \ - : c == 3 ? nc == 4 ? s(x,0) : s(x,4) \ - : c == 4 ? s(x,5) \ - : c == 5 ? nc == 8 ? s(x,6) : s(x,0) \ - : c == 6 ? s(x,7) \ - : s(x,0))\ - : r == 2 ? \ - ( c == 0 ? nc == 8 ? s(x,3) : s(x,2) \ - : c == 1 ? nc == 8 ? s(x,4) : s(x,3) \ - : c == 2 ? nc == 4 ? s(x,0) : nc == 8 ? s(x,5) : s(x,4) \ - : c == 3 ? nc == 4 ? s(x,1) : nc == 8 ? s(x,6) : s(x,5) \ - : c == 4 ? nc == 8 ? s(x,7) : s(x,0) \ - : c == 5 ? nc == 8 ? s(x,0) : s(x,1) \ - : c == 6 ? s(x,1) \ - : s(x,2))\ - : \ - ( c == 0 ? nc == 8 ? s(x,4) : s(x,3) \ - : c == 1 ? nc == 4 ? s(x,0) : nc == 8 ? s(x,5) : s(x,4) \ - : c == 2 ? nc == 4 ? s(x,1) : nc == 8 ? s(x,6) : s(x,5) \ - : c == 3 ? nc == 4 ? s(x,2) : nc == 8 ? s(x,7) : s(x,0) \ - : c == 4 ? nc == 8 ? s(x,0) : s(x,1) \ - : c == 5 ? nc == 8 ? s(x,1) : s(x,2) \ - : c == 6 ? s(x,2) \ - : s(x,3))) + ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ + : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ + : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ + : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? 
s(x,1) : s(x,2))) #if defined(FT4_SET) #undef dec_fmvars -#define dec_fmvars -#define fwd_rnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,ft_tab,fwd_var,rf1,c) +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c)) #elif defined(FT1_SET) #undef dec_fmvars -#define dec_fmvars -#define fwd_rnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,upr,ft_tab,fwd_var,rf1,c) +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_use(f,n),fwd_var,rf1,c)) #else -#define fwd_rnd(y,x,k,c) s(y,c) = fwd_mcol(no_table(x,s_box,fwd_var,rf1,c)) ^ (k)[c] +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_use(s,box),fwd_var,rf1,c))) #endif #if defined(FL4_SET) -#define fwd_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,fl_tab,fwd_var,rf1,c) +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,l),fwd_var,rf1,c)) #elif defined(FL1_SET) -#define fwd_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,ups,fl_tab,fwd_var,rf1,c) +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_use(f,l),fwd_var,rf1,c)) #else -#define fwd_lrnd(y,x,k,c) s(y,c) = no_table(x,s_box,fwd_var,rf1,c) ^ (k)[c] +#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_use(s,box),fwd_var,rf1,c)) #endif -aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kp = cx->k_sch; - dec_fmvars /* declare variables for fwd_mcol() if needed */ +AES_RETURN aes_xi(encrypt)(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]) +{ uint32_t locals(b0, b1); + const uint32_t *kp; +#if defined( dec_fmvars ) + dec_fmvars; /* declare variables for fwd_mcol() if needed */ +#endif - if(!(cx->n_blk & 1)) return aes_bad; + if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16) + return EXIT_FAILURE; - state_in(b0, in_blk, kp); + kp = cx->ks; + state_in(b0, in, kp); #if (ENC_UNROLL == FULL) - kp += (cx->n_rnd - 9) * nc; - - switch(cx->n_rnd) + switch(cx->inf.b[0]) { - case 14: round(fwd_rnd, b1, b0, kp - 4 * nc); - round(fwd_rnd, b0, b1, kp - 3 * nc); - case 12: round(fwd_rnd, b1, b0, kp - 2 * nc); - round(fwd_rnd, b0, b1, kp - nc); - case 10: round(fwd_rnd, b1, b0, kp ); - round(fwd_rnd, b0, b1, kp + nc); - round(fwd_rnd, b1, b0, kp + 2 * nc); - round(fwd_rnd, b0, b1, kp + 3 * nc); - round(fwd_rnd, b1, b0, kp + 4 * nc); - round(fwd_rnd, b0, b1, kp + 5 * nc); - round(fwd_rnd, b1, b0, kp + 6 * nc); - round(fwd_rnd, b0, b1, kp + 7 * nc); - round(fwd_rnd, b1, b0, kp + 8 * nc); - round(fwd_lrnd, b0, b1, kp + 9 * nc); + case 14 * 16: + round(fwd_rnd, b1, b0, kp + 1 * N_COLS); + round(fwd_rnd, b0, b1, kp + 2 * N_COLS); + kp += 2 * N_COLS; + case 12 * 16: + round(fwd_rnd, b1, b0, kp + 1 * N_COLS); + round(fwd_rnd, b0, b1, kp + 2 * N_COLS); + kp += 2 * N_COLS; + case 10 * 16: + round(fwd_rnd, b1, b0, kp + 1 * N_COLS); + round(fwd_rnd, b0, b1, kp + 2 * N_COLS); + round(fwd_rnd, b1, b0, kp + 3 * N_COLS); + round(fwd_rnd, b0, b1, kp + 4 * N_COLS); + round(fwd_rnd, b1, b0, kp + 5 * N_COLS); + round(fwd_rnd, b0, b1, kp + 6 * N_COLS); + round(fwd_rnd, b1, b0, kp + 7 * N_COLS); + round(fwd_rnd, b0, b1, kp + 8 * N_COLS); + round(fwd_rnd, b1, b0, kp + 9 * N_COLS); + round(fwd_lrnd, b0, b1, kp +10 * N_COLS); } + #else #if (ENC_UNROLL == PARTIAL) - { aes_32t rnd; - for(rnd = 0; rnd < (cx->n_rnd >> 1) - 1; ++rnd) + { uint32_t rnd; + for(rnd = 0; rnd < (cx->inf.b[0] >> 5) - 1; ++rnd) { - kp += nc; + kp += N_COLS; round(fwd_rnd, b1, b0, kp); - kp += nc; + kp += N_COLS; round(fwd_rnd, b0, b1, kp); } - kp 
+= nc; + kp += N_COLS; round(fwd_rnd, b1, b0, kp); #else - { aes_32t rnd, *p0 = b0, *p1 = b1, *pt; - for(rnd = 0; rnd < cx->n_rnd - 1; ++rnd) + { uint32_t rnd; + for(rnd = 0; rnd < (cx->inf.b[0] >> 4) - 1; ++rnd) { - kp += nc; - round(fwd_rnd, p1, p0, kp); - pt = p0, p0 = p1, p1 = pt; + kp += N_COLS; + round(fwd_rnd, b1, b0, kp); + l_copy(b0, b1); } #endif - kp += nc; + kp += N_COLS; round(fwd_lrnd, b0, b1, kp); } #endif - state_out(out_blk, b0); - return aes_good; + state_out(out, b0); + return EXIT_SUCCESS; } #endif -#if defined(DECRYPTION) +#if ( FUNCS_IN_C & DECRYPTION_IN_C) -#define inv_var(x,r,c) \ - ( r == 0 ? \ - ( c == 0 ? s(x,0) \ - : c == 1 ? s(x,1) \ - : c == 2 ? s(x,2) \ - : c == 3 ? s(x,3) \ - : c == 4 ? s(x,4) \ - : c == 5 ? s(x,5) \ - : c == 6 ? s(x,6) \ - : s(x,7))\ - : r == 1 ? \ - ( c == 0 ? nc == 4 ? s(x,3) : nc == 8 ? s(x,7) : s(x,5) \ - : c == 1 ? s(x,0) \ - : c == 2 ? s(x,1) \ - : c == 3 ? s(x,2) \ - : c == 4 ? s(x,3) \ - : c == 5 ? s(x,4) \ - : c == 6 ? s(x,5) \ - : s(x,6))\ - : r == 2 ? \ - ( c == 0 ? nc == 4 ? s(x,2) : nc == 8 ? s(x,5) : s(x,4) \ - : c == 1 ? nc == 4 ? s(x,3) : nc == 8 ? s(x,6) : s(x,5) \ - : c == 2 ? nc == 8 ? s(x,7) : s(x,0) \ - : c == 3 ? nc == 8 ? s(x,0) : s(x,1) \ - : c == 4 ? nc == 8 ? s(x,1) : s(x,2) \ - : c == 5 ? nc == 8 ? s(x,2) : s(x,3) \ - : c == 6 ? s(x,3) \ - : s(x,4))\ - : \ - ( c == 0 ? nc == 4 ? s(x,1) : nc == 8 ? s(x,4) : s(x,3) \ - : c == 1 ? nc == 4 ? s(x,2) : nc == 8 ? s(x,5) : s(x,4) \ - : c == 2 ? nc == 4 ? s(x,3) : nc == 8 ? s(x,6) : s(x,5) \ - : c == 3 ? nc == 8 ? s(x,7) : s(x,0) \ - : c == 4 ? nc == 8 ? s(x,0) : s(x,1) \ - : c == 5 ? nc == 8 ? s(x,1) : s(x,2) \ - : c == 6 ? s(x,2) \ - : s(x,3))) +/* Visual C++ .Net v7.1 provides the fastest encryption code when using + Pentium optimiation with small code but this is poor for decryption + so we need to control this with the following VC++ pragmas +*/ + +#if defined( _MSC_VER ) && !defined( _WIN64 ) +#pragma optimize( "t", on ) +#endif + +/* Given the column (c) of the output state variable, the following + macros give the input state variables which are needed in its + computation for each row (r) of the state. All the alternative + macros give the same end values but expand into different ways + of calculating these values. In particular the complex macro + used for dynamically variable block sizes is designed to expand + to a compile time constant whenever possible but will expand to + conditional clauses on some branches (I am grateful to Frank + Yellin for this construction) +*/ + +#define inv_var(x,r,c)\ + ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ + : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\ + : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ + : ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? 
s(x,3) : s(x,0))) #if defined(IT4_SET) #undef dec_imvars -#define dec_imvars -#define inv_rnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,it_tab,inv_var,rf1,c) +#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(i,n),inv_var,rf1,c)) #elif defined(IT1_SET) #undef dec_imvars -#define dec_imvars -#define inv_rnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,upr,it_tab,inv_var,rf1,c) +#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_use(i,n),inv_var,rf1,c)) #else -#define inv_rnd(y,x,k,c) s(y,c) = inv_mcol(no_table(x,inv_s_box,inv_var,rf1,c) ^ (k)[c]) +#define inv_rnd(y,x,k,c) (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_use(i,box),inv_var,rf1,c))) #endif #if defined(IL4_SET) -#define inv_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ four_tables(x,il_tab,inv_var,rf1,c) +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(i,l),inv_var,rf1,c)) #elif defined(IL1_SET) -#define inv_lrnd(y,x,k,c) s(y,c)= (k)[c] ^ one_table(x,ups,il_tab,inv_var,rf1,c) +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_use(i,l),inv_var,rf1,c)) #else -#define inv_lrnd(y,x,k,c) s(y,c) = no_table(x,inv_s_box,inv_var,rf1,c) ^ (k)[c] +#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_use(i,box),inv_var,rf1,c)) #endif -aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kp = cx->k_sch + nc * cx->n_rnd; - dec_imvars /* declare variables for inv_mcol() if needed */ +/* This code can work with the decryption key schedule in the */ +/* order that is used for encryption (where the 1st decryption */ +/* round key is at the high end of the schedule) or with a key */ +/* schedule that has been reversed to put the 1st decryption */ +/* round key at the low end of the schedule in memory (when */ +/* AES_REV_DKS is defined) */ - if(!(cx->n_blk & 2)) return aes_bad; +#ifdef AES_REV_DKS +#define key_ofs 0 +#define rnd_key(n) (kp + n * N_COLS) +#else +#define key_ofs 1 +#define rnd_key(n) (kp - n * N_COLS) +#endif - state_in(b0, in_blk, kp);
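To make the two layouts concrete, a sketch (illustrative only, not part of this diff; the helper name first_round_key is hypothetical) of where the first decryption round key sits for AES-128, where cx->inf.b[0] is 10 * 16 = 160 and the schedule holds 44 words:

#include "aes.h"

/* Illustrative only: the first decryption round key in both layouts. */
static const uint32_t *first_round_key(const aes_decrypt_ctx cx[1])
{
#ifdef AES_REV_DKS
    return cx->ks;                           /* reversed: low end of ks */
#else
    return cx->ks + (cx->inf.b[0] >> 2);     /* encryption order: word 40 */
#endif
}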
+AES_RETURN aes_xi(decrypt)(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]) +{ uint32_t locals(b0, b1); +#if defined( dec_imvars ) + dec_imvars; /* declare variables for inv_mcol() if needed */ +#endif + const uint32_t *kp; + + if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16) + return EXIT_FAILURE; + + kp = cx->ks + (key_ofs ? (cx->inf.b[0] >> 2) : 0); + state_in(b0, in, kp); #if (DEC_UNROLL == FULL) - kp = cx->k_sch + 9 * nc; - switch(cx->n_rnd) + kp = cx->ks + (key_ofs ? 0 : (cx->inf.b[0] >> 2)); switch(cx->inf.b[0]) { - case 14: round(inv_rnd, b1, b0, kp + 4 * nc); - round(inv_rnd, b0, b1, kp + 3 * nc); - case 12: round(inv_rnd, b1, b0, kp + 2 * nc); - round(inv_rnd, b0, b1, kp + nc ); - case 10: round(inv_rnd, b1, b0, kp ); - round(inv_rnd, b0, b1, kp - nc); - round(inv_rnd, b1, b0, kp - 2 * nc); - round(inv_rnd, b0, b1, kp - 3 * nc); - round(inv_rnd, b1, b0, kp - 4 * nc); - round(inv_rnd, b0, b1, kp - 5 * nc); - round(inv_rnd, b1, b0, kp - 6 * nc); - round(inv_rnd, b0, b1, kp - 7 * nc); - round(inv_rnd, b1, b0, kp - 8 * nc); - round(inv_lrnd, b0, b1, kp - 9 * nc); + case 14 * 16: + round(inv_rnd, b1, b0, rnd_key(-13)); + round(inv_rnd, b0, b1, rnd_key(-12)); + case 12 * 16: + round(inv_rnd, b1, b0, rnd_key(-11)); + round(inv_rnd, b0, b1, rnd_key(-10)); + case 10 * 16: + round(inv_rnd, b1, b0, rnd_key(-9)); + round(inv_rnd, b0, b1, rnd_key(-8)); + round(inv_rnd, b1, b0, rnd_key(-7)); + round(inv_rnd, b0, b1, rnd_key(-6)); + round(inv_rnd, b1, b0, rnd_key(-5)); + round(inv_rnd, b0, b1, rnd_key(-4)); + round(inv_rnd, b1, b0, rnd_key(-3)); + round(inv_rnd, b0, b1, rnd_key(-2)); + round(inv_rnd, b1, b0, rnd_key(-1)); + round(inv_lrnd, b0, b1, rnd_key( 0)); } + #else #if (DEC_UNROLL == PARTIAL) - { aes_32t rnd; - for(rnd = 0; rnd < (cx->n_rnd >> 1) - 1; ++rnd) + { uint32_t rnd; + for(rnd = 0; rnd < (cx->inf.b[0] >> 5) - 1; ++rnd) { - kp -= nc; + kp = rnd_key(1); round(inv_rnd, b1, b0, kp); - kp -= nc; + kp = rnd_key(1); round(inv_rnd, b0, b1, kp); } - kp -= nc; + kp = rnd_key(1); round(inv_rnd, b1, b0, kp); #else - { aes_32t rnd, *p0 = b0, *p1 = b1, *pt; - for(rnd = 0; rnd < cx->n_rnd - 1; ++rnd) + { uint32_t rnd; + for(rnd = 0; rnd < (cx->inf.b[0] >> 4) - 1; ++rnd) { - kp -= nc; - round(inv_rnd, p1, p0, kp); - pt = p0, p0 = p1, p1 = pt; + kp = rnd_key(1); + round(inv_rnd, b1, b0, kp); + l_copy(b0, b1); } #endif - kp -= nc; + kp = rnd_key(1); round(inv_lrnd, b0, b1, kp); - } + } #endif - state_out(out_blk, b0); - return aes_good; + state_out(out, b0); + return EXIT_SUCCESS; } #endif + +#if defined(__cplusplus) +} +#endif diff --git a/aeskey.c b/aeskey.c index b6dee48d3c..16e9607ff1 100644 --- a/aeskey.c +++ b/aeskey.c @@ -1,68 +1,46 @@ /* - ------------------------------------------------------------------------- - Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. - All rights reserved. +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. - LICENSE TERMS +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission.
- - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and fitness for purpose. - ------------------------------------------------------------------------- - Issue Date: 29/07/2002 - - This file contains the code for implementing the key schedule for AES - (Rijndael) for block and key sizes of 16, 24, and 32 bytes. +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 */ #include "aesopt.h" +#include "aestab.h" -#if defined(BLOCK_SIZE) && (BLOCK_SIZE & 7) -#error An illegal block size has been specified. +#if defined( USE_INTEL_AES_IF_PRESENT ) +# include "aes_ni.h" +#else +/* map names here to provide the external API ('name' -> 'aes_name') */ +# define aes_xi(x) aes_ ## x #endif -/* Subroutine to set the block size (if variable) in bytes, legal - values being 16, 24 and 32. -*/ +#ifdef USE_VIA_ACE_IF_PRESENT +# include "aes_via_ace.h" +#endif -#if !defined(BLOCK_SIZE) - -aes_rval aes_blk_len(unsigned int blen, aes_ctx cx[1]) +#if defined(__cplusplus) +extern "C" { -#if !defined(FIXED_TABLES) - if(!tab_init) gen_tabs(); -#endif - - if((blen & 7) || blen < 16 || blen > 32) - { - cx->n_blk = 0; return aes_bad; - } - - cx->n_blk = blen; - return aes_good; -} - #endif /* Initialise the key schedule from the user supplied key. The key - length is now specified in bytes - 16, 24 or 32 as appropriate. - This corresponds to bit lengths of 128, 192 and 256 bits, and - to Nk values of 4, 6 and 8 respectively. + length can be specified in bytes, with legal values of 16, 24 + and 32, or in bits, with legal values of 128, 192 and 256. These + values correspond with Nk values of 4, 6 and 8 respectively. The following macros implement a single cycle in the key schedule generation process. The number of cycles needed @@ -77,293 +55,500 @@ aes_rval aes_blk_len(unsigned int blen, aes_ctx cx[1]) cx->n_col = 8 29 23 19 17 14 */
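A caller-side sketch of the byte/bit convention just described (illustrative only, not part of this diff; the helper name set_256_bit_key is hypothetical):

#include "aes.h"

/* With AES_VAR, key_len 32 (bytes) and 256 (bits) select the same schedule;
   this build enables only AES_256, so the fixed-length entry point is used. */
static int set_256_bit_key(const unsigned char key[32], aes_encrypt_ctx cx[1])
{
#if defined( AES_VAR )
    return aes_encrypt_key(key, 256, cx);    /* equivalent: key_len = 32 */
#else
    return aes_encrypt_key256(key, cx);
#endif
}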
The number of cycles needed @@ -77,293 +55,500 @@ aes_rval aes_blk_len(unsigned int blen, aes_ctx cx[1]) cx->n_col = 8 29 23 19 17 14 */ +#if defined( REDUCE_CODE_SIZE ) +# define ls_box ls_sub + uint32_t ls_sub(const uint32_t t, const uint32_t n); +# define inv_mcol im_sub + uint32_t im_sub(const uint32_t x); +# ifdef ENC_KS_UNROLL +# undef ENC_KS_UNROLL +# endif +# ifdef DEC_KS_UNROLL +# undef DEC_KS_UNROLL +# endif +#endif + +#if (FUNCS_IN_C & ENC_KEYING_IN_C) + +#if defined(AES_128) || defined( AES_VAR ) + #define ke4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ +{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; \ + k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; \ + k[4*(i)+7] = ss[3] ^= ss[2]; \ } -#define kel4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ + +AES_RETURN aes_xi(encrypt_key128)(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ uint32_t ss[4]; + + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + +#ifdef ENC_KS_UNROLL + ke4(cx->ks, 0); ke4(cx->ks, 1); + ke4(cx->ks, 2); ke4(cx->ks, 3); + ke4(cx->ks, 4); ke4(cx->ks, 5); + ke4(cx->ks, 6); ke4(cx->ks, 7); + ke4(cx->ks, 8); +#else + { uint32_t i; + for(i = 0; i < 9; ++i) + ke4(cx->ks, i); + } +#endif + ke4(cx->ks, 9); + cx->inf.l = 0; + cx->inf.b[0] = 10 * 16; + +#ifdef USE_VIA_ACE_IF_PRESENT + if(VIA_ACE_AVAILABLE) + cx->inf.b[1] = 0xff; +#endif + return EXIT_SUCCESS; +} + +#endif + +#if defined(AES_192) || defined( AES_VAR ) + +#define kef6(k,i) \ +{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; \ + k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; \ + k[6*(i)+ 9] = ss[3] ^= ss[2]; \ } #define ke6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ - k[6*(i)+10] = ss[4] ^= ss[3]; k[6*(i)+11] = ss[5] ^= ss[4]; \ +{ kef6(k,i); \ + k[6*(i)+10] = ss[4] ^= ss[3]; \ + k[6*(i)+11] = ss[5] ^= ss[4]; \ } -#define kel6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ + +AES_RETURN aes_xi(encrypt_key192)(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ uint32_t ss[6]; + + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + cx->ks[4] = ss[4] = word_in(key, 4); + cx->ks[5] = ss[5] = word_in(key, 5); + +#ifdef ENC_KS_UNROLL + ke6(cx->ks, 0); ke6(cx->ks, 1); + ke6(cx->ks, 2); ke6(cx->ks, 3); + ke6(cx->ks, 4); ke6(cx->ks, 5); + ke6(cx->ks, 6); +#else + { uint32_t i; + for(i = 0; i < 7; ++i) + ke6(cx->ks, i); + } +#endif + kef6(cx->ks, 7); + cx->inf.l = 0; + cx->inf.b[0] = 12 * 16; + +#ifdef USE_VIA_ACE_IF_PRESENT + if(VIA_ACE_AVAILABLE) + cx->inf.b[1] = 0xff; +#endif + return EXIT_SUCCESS; +} + +#endif + +#if defined(AES_256) || defined( AES_VAR ) + +#define kef8(k,i) \ +{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; \ + k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; \ + k[8*(i)+11] = ss[3] ^= ss[2]; \ } #define ke8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] 
= ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ - k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); k[8*(i)+13] = ss[5] ^= ss[4]; \ - k[8*(i)+14] = ss[6] ^= ss[5]; k[8*(i)+15] = ss[7] ^= ss[6]; \ -} -#define kel8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ +{ kef8(k,i); \ + k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \ + k[8*(i)+13] = ss[5] ^= ss[4]; \ + k[8*(i)+14] = ss[6] ^= ss[5]; \ + k[8*(i)+15] = ss[7] ^= ss[6]; \ } -#if defined(ENCRYPTION_KEY_SCHEDULE) +AES_RETURN aes_xi(encrypt_key256)(const unsigned char *key, aes_encrypt_ctx cx[1]) +{ uint32_t ss[8]; -aes_rval aes_enc_key(const unsigned char in_key[], unsigned int klen, aes_ctx cx[1]) -{ aes_32t ss[8]; + cx->ks[0] = ss[0] = word_in(key, 0); + cx->ks[1] = ss[1] = word_in(key, 1); + cx->ks[2] = ss[2] = word_in(key, 2); + cx->ks[3] = ss[3] = word_in(key, 3); + cx->ks[4] = ss[4] = word_in(key, 4); + cx->ks[5] = ss[5] = word_in(key, 5); + cx->ks[6] = ss[6] = word_in(key, 6); + cx->ks[7] = ss[7] = word_in(key, 7); -#if !defined(FIXED_TABLES) - if(!tab_init) gen_tabs(); -#endif - -#if !defined(BLOCK_SIZE) - if(!cx->n_blk) cx->n_blk = 16; +#ifdef ENC_KS_UNROLL + ke8(cx->ks, 0); ke8(cx->ks, 1); + ke8(cx->ks, 2); ke8(cx->ks, 3); + ke8(cx->ks, 4); ke8(cx->ks, 5); #else - cx->n_blk = BLOCK_SIZE; -#endif - - cx->n_blk = (cx->n_blk & ~3) | 1; - - cx->k_sch[0] = ss[0] = word_in(in_key ); - cx->k_sch[1] = ss[1] = word_in(in_key + 4); - cx->k_sch[2] = ss[2] = word_in(in_key + 8); - cx->k_sch[3] = ss[3] = word_in(in_key + 12); - -#if (BLOCK_SIZE == 16) && (ENC_UNROLL != NONE) - - switch(klen) - { - case 16: ke4(cx->k_sch, 0); ke4(cx->k_sch, 1); - ke4(cx->k_sch, 2); ke4(cx->k_sch, 3); - ke4(cx->k_sch, 4); ke4(cx->k_sch, 5); - ke4(cx->k_sch, 6); ke4(cx->k_sch, 7); - ke4(cx->k_sch, 8); kel4(cx->k_sch, 9); - cx->n_rnd = 10; break; - case 24: cx->k_sch[4] = ss[4] = word_in(in_key + 16); - cx->k_sch[5] = ss[5] = word_in(in_key + 20); - ke6(cx->k_sch, 0); ke6(cx->k_sch, 1); - ke6(cx->k_sch, 2); ke6(cx->k_sch, 3); - ke6(cx->k_sch, 4); ke6(cx->k_sch, 5); - ke6(cx->k_sch, 6); kel6(cx->k_sch, 7); - cx->n_rnd = 12; break; - case 32: cx->k_sch[4] = ss[4] = word_in(in_key + 16); - cx->k_sch[5] = ss[5] = word_in(in_key + 20); - cx->k_sch[6] = ss[6] = word_in(in_key + 24); - cx->k_sch[7] = ss[7] = word_in(in_key + 28); - ke8(cx->k_sch, 0); ke8(cx->k_sch, 1); - ke8(cx->k_sch, 2); ke8(cx->k_sch, 3); - ke8(cx->k_sch, 4); ke8(cx->k_sch, 5); - kel8(cx->k_sch, 6); - cx->n_rnd = 14; break; - default: cx->n_rnd = 0; return aes_bad; - } -#else - { aes_32t i, l; - cx->n_rnd = ((klen >> 2) > nc ? 
(klen >> 2) : nc) + 6; - l = (nc * cx->n_rnd + nc - 1) / (klen >> 2); - - switch(klen) - { - case 16: for(i = 0; i < l; ++i) - ke4(cx->k_sch, i); - break; - case 24: cx->k_sch[4] = ss[4] = word_in(in_key + 16); - cx->k_sch[5] = ss[5] = word_in(in_key + 20); - for(i = 0; i < l; ++i) - ke6(cx->k_sch, i); - break; - case 32: cx->k_sch[4] = ss[4] = word_in(in_key + 16); - cx->k_sch[5] = ss[5] = word_in(in_key + 20); - cx->k_sch[6] = ss[6] = word_in(in_key + 24); - cx->k_sch[7] = ss[7] = word_in(in_key + 28); - for(i = 0; i < l; ++i) - ke8(cx->k_sch, i); - break; - default: cx->n_rnd = 0; return aes_bad; - } + { uint32_t i; + for(i = 0; i < 6; ++i) + ke8(cx->ks, i); } #endif + kef8(cx->ks, 6); + cx->inf.l = 0; + cx->inf.b[0] = 14 * 16; - return aes_good; +#ifdef USE_VIA_ACE_IF_PRESENT + if(VIA_ACE_AVAILABLE) + cx->inf.b[1] = 0xff; +#endif + return EXIT_SUCCESS; } #endif -#if defined(DECRYPTION_KEY_SCHEDULE) +#endif -#if (DEC_ROUND != NO_TABLES) -#define d_vars dec_imvars -#define ff(x) inv_mcol(x) +#if (FUNCS_IN_C & DEC_KEYING_IN_C) + +/* this is used to store the decryption round keys */ +/* in forward or reverse order */ + +#ifdef AES_REV_DKS +#define v(n,i) ((n) - (i) + 2 * ((i) & 3)) #else +#define v(n,i) (i) +#endif + +#if DEC_ROUND == NO_TABLES #define ff(x) (x) -#define d_vars +#else +#define ff(x) inv_mcol(x) +#if defined( dec_imvars ) +#define d_vars dec_imvars #endif +#endif + +#if defined(AES_128) || defined( AES_VAR ) + +#define k4e(k,i) \ +{ k[v(40,(4*(i))+4)] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; \ + k[v(40,(4*(i))+5)] = ss[1] ^= ss[0]; \ + k[v(40,(4*(i))+6)] = ss[2] ^= ss[1]; \ + k[v(40,(4*(i))+7)] = ss[3] ^= ss[2]; \ +} #if 1 + #define kdf4(k,i) \ -{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; ss[1] = ss[1] ^ ss[3]; ss[2] = ss[2] ^ ss[3]; ss[3] = ss[3]; \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; ss[i % 4] ^= ss[4]; \ - ss[4] ^= k[4*(i)]; k[4*(i)+4] = ff(ss[4]); ss[4] ^= k[4*(i)+1]; k[4*(i)+5] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+2]; k[4*(i)+6] = ff(ss[4]); ss[4] ^= k[4*(i)+3]; k[4*(i)+7] = ff(ss[4]); \ +{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ + ss[1] = ss[1] ^ ss[3]; \ + ss[2] = ss[2] ^ ss[3]; \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] ^= k[v(40,(4*(i)))]; k[v(40,(4*(i))+4)] = ff(ss[4]); \ + ss[4] ^= k[v(40,(4*(i))+1)]; k[v(40,(4*(i))+5)] = ff(ss[4]); \ + ss[4] ^= k[v(40,(4*(i))+2)]; k[v(40,(4*(i))+6)] = ff(ss[4]); \ + ss[4] ^= k[v(40,(4*(i))+3)]; k[v(40,(4*(i))+7)] = ff(ss[4]); \ } + #define kd4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ - k[4*(i)+4] = ss[4] ^= k[4*(i)]; k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ +{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; \ + ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ + k[v(40,(4*(i))+4)] = ss[4] ^= k[v(40,(4*(i)))]; \ + k[v(40,(4*(i))+5)] = ss[4] ^= k[v(40,(4*(i))+1)]; \ + k[v(40,(4*(i))+6)] = ss[4] ^= k[v(40,(4*(i))+2)]; \ + k[v(40,(4*(i))+7)] = ss[4] ^= k[v(40,(4*(i))+3)]; \ } + #define kdl4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; ss[i % 4] ^= ss[4]; \ - k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; k[4*(i)+5] = ss[1] ^ ss[3]; \ - k[4*(i)+6] = ss[0]; k[4*(i)+7] = ss[1]; \ +{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ + k[v(40,(4*(i))+4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ + k[v(40,(4*(i))+5)] = ss[1] ^ ss[3]; \ + k[v(40,(4*(i))+6)] = ss[0]; \ + k[v(40,(4*(i))+7)] = ss[1]; \ } + #else + #define kdf4(k,i) \ -{ 
ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; k[4*(i)+ 4] = ff(ss[0]); ss[1] ^= ss[0]; k[4*(i)+ 5] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ff(ss[2]); ss[3] ^= ss[2]; k[4*(i)+ 7] = ff(ss[3]); \ +{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[v(40,(4*(i))+ 4)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ff(ss[3]); \ } + #define kd4(k,i) \ -{ ss[4] = ls_box(ss[3],3) ^ rcon_tab[i]; \ - ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[4*(i)+ 4] = ss[4] ^= k[4*(i)]; \ - ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[4] ^= k[4*(i)+ 1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[4] ^= k[4*(i)+ 2]; \ - ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[4] ^= k[4*(i)+ 3]; \ +{ ss[4] = ls_box(ss[3],3) ^ t_use(r,c)[i]; \ + ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[v(40,(4*(i))+ 4)] = ss[4] ^= k[v(40,(4*(i)))]; \ + ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ss[4] ^= k[v(40,(4*(i))+ 1)]; \ + ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ss[4] ^= k[v(40,(4*(i))+ 2)]; \ + ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ss[4] ^= k[v(40,(4*(i))+ 3)]; \ } + #define kdl4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; k[4*(i)+ 4] = ss[0]; ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[2]; ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[3]; \ +{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[v(40,(4*(i))+ 4)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ss[3]; \ } + #endif +AES_RETURN aes_xi(decrypt_key128)(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ uint32_t ss[5]; +#if defined( d_vars ) + d_vars; +#endif + + cx->ks[v(40,(0))] = ss[0] = word_in(key, 0); + cx->ks[v(40,(1))] = ss[1] = word_in(key, 1); + cx->ks[v(40,(2))] = ss[2] = word_in(key, 2); + cx->ks[v(40,(3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + kdf4(cx->ks, 0); kd4(cx->ks, 1); + kd4(cx->ks, 2); kd4(cx->ks, 3); + kd4(cx->ks, 4); kd4(cx->ks, 5); + kd4(cx->ks, 6); kd4(cx->ks, 7); + kd4(cx->ks, 8); kdl4(cx->ks, 9); +#else + { uint32_t i; + for(i = 0; i < 10; ++i) + k4e(cx->ks, i); +#if !(DEC_ROUND == NO_TABLES) + for(i = N_COLS; i < 10 * N_COLS; ++i) + cx->ks[i] = inv_mcol(cx->ks[i]); +#endif + } +#endif + cx->inf.l = 0; + cx->inf.b[0] = 10 * 16; + +#ifdef USE_VIA_ACE_IF_PRESENT + if(VIA_ACE_AVAILABLE) + cx->inf.b[1] = 0xff; +#endif + return EXIT_SUCCESS; +} + +#endif + +#if defined(AES_192) || defined( AES_VAR ) + +#define k6ef(k,i) \ +{ k[v(48,(6*(i))+ 6)] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; \ + k[v(48,(6*(i))+ 7)] = ss[1] ^= ss[0]; \ + k[v(48,(6*(i))+ 8)] = ss[2] ^= ss[1]; \ + k[v(48,(6*(i))+ 9)] = ss[3] ^= ss[2]; \ +} + +#define k6e(k,i) \ +{ k6ef(k,i); \ + k[v(48,(6*(i))+10)] = ss[4] ^= ss[3]; \ + k[v(48,(6*(i))+11)] = ss[5] ^= ss[4]; \ +} + #define kdf6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; k[6*(i)+ 6] = ff(ss[0]); ss[1] ^= ss[0]; k[6*(i)+ 7] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ff(ss[2]); ss[3] ^= ss[2]; k[6*(i)+ 9] = ff(ss[3]); \ - ss[4] ^= ss[3]; k[6*(i)+10] = ff(ss[4]); ss[5] ^= ss[4]; k[6*(i)+11] = ff(ss[5]); \ +{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[v(48,(6*(i))+ 6)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ff(ss[3]); \ + ss[4] ^= ss[3]; k[v(48,(6*(i))+10)] = ff(ss[4]); \ + ss[5] ^= ss[4]; k[v(48,(6*(i))+11)] = ff(ss[5]); \ } + #define kd6(k,i) \ -{ ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \ - ss[0] ^= ss[6]; ss[6] = 
ff(ss[6]); k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ - ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ - ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ - ss[4] ^= ss[3]; k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ - ss[5] ^= ss[4]; k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ +{ ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \ + ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[v(48,(6*(i))+ 6)] = ss[6] ^= k[v(48,(6*(i)))]; \ + ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ss[6] ^= k[v(48,(6*(i))+ 1)]; \ + ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ss[6] ^= k[v(48,(6*(i))+ 2)]; \ + ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ss[6] ^= k[v(48,(6*(i))+ 3)]; \ + ss[4] ^= ss[3]; k[v(48,(6*(i))+10)] = ss[6] ^= k[v(48,(6*(i))+ 4)]; \ + ss[5] ^= ss[4]; k[v(48,(6*(i))+11)] = ss[6] ^= k[v(48,(6*(i))+ 5)]; \ } + #define kdl6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; k[6*(i)+ 6] = ss[0]; ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[2]; ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[3]; \ +{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[v(48,(6*(i))+ 6)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ss[3]; \ +} + +AES_RETURN aes_xi(decrypt_key192)(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ uint32_t ss[7]; +#if defined( d_vars ) + d_vars; +#endif + + cx->ks[v(48,(0))] = ss[0] = word_in(key, 0); + cx->ks[v(48,(1))] = ss[1] = word_in(key, 1); + cx->ks[v(48,(2))] = ss[2] = word_in(key, 2); + cx->ks[v(48,(3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + cx->ks[v(48,(4))] = ff(ss[4] = word_in(key, 4)); + cx->ks[v(48,(5))] = ff(ss[5] = word_in(key, 5)); + kdf6(cx->ks, 0); kd6(cx->ks, 1); + kd6(cx->ks, 2); kd6(cx->ks, 3); + kd6(cx->ks, 4); kd6(cx->ks, 5); + kd6(cx->ks, 6); kdl6(cx->ks, 7); +#else + cx->ks[v(48,(4))] = ss[4] = word_in(key, 4); + cx->ks[v(48,(5))] = ss[5] = word_in(key, 5); + { uint32_t i; + + for(i = 0; i < 7; ++i) + k6e(cx->ks, i); + k6ef(cx->ks, 7); +#if !(DEC_ROUND == NO_TABLES) + for(i = N_COLS; i < 12 * N_COLS; ++i) + cx->ks[i] = inv_mcol(cx->ks[i]); +#endif + } +#endif + cx->inf.l = 0; + cx->inf.b[0] = 12 * 16; + +#ifdef USE_VIA_ACE_IF_PRESENT + if(VIA_ACE_AVAILABLE) + cx->inf.b[1] = 0xff; +#endif + return EXIT_SUCCESS; +} + +#endif + +#if defined(AES_256) || defined( AES_VAR ) + +#define k8ef(k,i) \ +{ k[v(56,(8*(i))+ 8)] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; \ + k[v(56,(8*(i))+ 9)] = ss[1] ^= ss[0]; \ + k[v(56,(8*(i))+10)] = ss[2] ^= ss[1]; \ + k[v(56,(8*(i))+11)] = ss[3] ^= ss[2]; \ +} + +#define k8e(k,i) \ +{ k8ef(k,i); \ + k[v(56,(8*(i))+12)] = ss[4] ^= ls_box(ss[3],0); \ + k[v(56,(8*(i))+13)] = ss[5] ^= ss[4]; \ + k[v(56,(8*(i))+14)] = ss[6] ^= ss[5]; \ + k[v(56,(8*(i))+15)] = ss[7] ^= ss[6]; \ } #define kdf8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; k[8*(i)+ 8] = ff(ss[0]); ss[1] ^= ss[0]; k[8*(i)+ 9] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[8*(i)+10] = ff(ss[2]); ss[3] ^= ss[2]; k[8*(i)+11] = ff(ss[3]); \ - ss[4] ^= ls_box(ss[3],0); k[8*(i)+12] = ff(ss[4]); ss[5] ^= ss[4]; k[8*(i)+13] = ff(ss[5]); \ - ss[6] ^= ss[5]; k[8*(i)+14] = ff(ss[6]); ss[7] ^= ss[6]; k[8*(i)+15] = ff(ss[7]); \ +{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[v(56,(8*(i))+ 8)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ff(ss[3]); \ + ss[4] ^= ls_box(ss[3],0); k[v(56,(8*(i))+12)] = ff(ss[4]); \ + ss[5] ^= ss[4]; k[v(56,(8*(i))+13)] = 
ff(ss[5]); \ + ss[6] ^= ss[5]; k[v(56,(8*(i))+14)] = ff(ss[6]); \ + ss[7] ^= ss[6]; k[v(56,(8*(i))+15)] = ff(ss[7]); \ } + #define kd8(k,i) \ -{ aes_32t g = ls_box(ss[7],3) ^ rcon_tab[i]; \ - ss[0] ^= g; g = ff(g); k[8*(i)+ 8] = g ^= k[8*(i)]; \ - ss[1] ^= ss[0]; k[8*(i)+ 9] = g ^= k[8*(i)+ 1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = g ^= k[8*(i)+ 2]; \ - ss[3] ^= ss[2]; k[8*(i)+11] = g ^= k[8*(i)+ 3]; \ - g = ls_box(ss[3],0); \ - ss[4] ^= g; g = ff(g); k[8*(i)+12] = g ^= k[8*(i)+ 4]; \ - ss[5] ^= ss[4]; k[8*(i)+13] = g ^= k[8*(i)+ 5]; \ - ss[6] ^= ss[5]; k[8*(i)+14] = g ^= k[8*(i)+ 6]; \ - ss[7] ^= ss[6]; k[8*(i)+15] = g ^= k[8*(i)+ 7]; \ +{ ss[8] = ls_box(ss[7],3) ^ t_use(r,c)[i]; \ + ss[0] ^= ss[8]; ss[8] = ff(ss[8]); k[v(56,(8*(i))+ 8)] = ss[8] ^= k[v(56,(8*(i)))]; \ + ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ss[8] ^= k[v(56,(8*(i))+ 1)]; \ + ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ss[8] ^= k[v(56,(8*(i))+ 2)]; \ + ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ss[8] ^= k[v(56,(8*(i))+ 3)]; \ + ss[8] = ls_box(ss[3],0); \ + ss[4] ^= ss[8]; ss[8] = ff(ss[8]); k[v(56,(8*(i))+12)] = ss[8] ^= k[v(56,(8*(i))+ 4)]; \ + ss[5] ^= ss[4]; k[v(56,(8*(i))+13)] = ss[8] ^= k[v(56,(8*(i))+ 5)]; \ + ss[6] ^= ss[5]; k[v(56,(8*(i))+14)] = ss[8] ^= k[v(56,(8*(i))+ 6)]; \ + ss[7] ^= ss[6]; k[v(56,(8*(i))+15)] = ss[8] ^= k[v(56,(8*(i))+ 7)]; \ } + #define kdl8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; k[8*(i)+ 8] = ss[0]; ss[1] ^= ss[0]; k[8*(i)+ 9] = ss[1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = ss[2]; ss[3] ^= ss[2]; k[8*(i)+11] = ss[3]; \ +{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[v(56,(8*(i))+ 8)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ss[3]; \ } -aes_rval aes_dec_key(const unsigned char in_key[], unsigned int klen, aes_ctx cx[1]) -{ aes_32t ss[8]; - d_vars - -#if !defined(FIXED_TABLES) - if(!tab_init) gen_tabs(); +AES_RETURN aes_xi(decrypt_key256)(const unsigned char *key, aes_decrypt_ctx cx[1]) +{ uint32_t ss[9]; +#if defined( d_vars ) + d_vars; #endif -#if !defined(BLOCK_SIZE) - if(!cx->n_blk) cx->n_blk = 16; + cx->ks[v(56,(0))] = ss[0] = word_in(key, 0); + cx->ks[v(56,(1))] = ss[1] = word_in(key, 1); + cx->ks[v(56,(2))] = ss[2] = word_in(key, 2); + cx->ks[v(56,(3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + cx->ks[v(56,(4))] = ff(ss[4] = word_in(key, 4)); + cx->ks[v(56,(5))] = ff(ss[5] = word_in(key, 5)); + cx->ks[v(56,(6))] = ff(ss[6] = word_in(key, 6)); + cx->ks[v(56,(7))] = ff(ss[7] = word_in(key, 7)); + kdf8(cx->ks, 0); kd8(cx->ks, 1); + kd8(cx->ks, 2); kd8(cx->ks, 3); + kd8(cx->ks, 4); kd8(cx->ks, 5); + kdl8(cx->ks, 6); #else - cx->n_blk = BLOCK_SIZE; -#endif + cx->ks[v(56,(4))] = ss[4] = word_in(key, 4); + cx->ks[v(56,(5))] = ss[5] = word_in(key, 5); + cx->ks[v(56,(6))] = ss[6] = word_in(key, 6); + cx->ks[v(56,(7))] = ss[7] = word_in(key, 7); + { uint32_t i; - cx->n_blk = (cx->n_blk & ~3) | 2; - - cx->k_sch[0] = ss[0] = word_in(in_key ); - cx->k_sch[1] = ss[1] = word_in(in_key + 4); - cx->k_sch[2] = ss[2] = word_in(in_key + 8); - cx->k_sch[3] = ss[3] = word_in(in_key + 12); - -#if (BLOCK_SIZE == 16) && (DEC_UNROLL != NONE) - - switch(klen) - { - case 16: kdf4(cx->k_sch, 0); kd4(cx->k_sch, 1); - kd4(cx->k_sch, 2); kd4(cx->k_sch, 3); - kd4(cx->k_sch, 4); kd4(cx->k_sch, 5); - kd4(cx->k_sch, 6); kd4(cx->k_sch, 7); - kd4(cx->k_sch, 8); kdl4(cx->k_sch, 9); - cx->n_rnd = 10; break; - case 24: ss[4] = word_in(in_key + 16); - cx->k_sch[4] = ff(ss[4]); - ss[5] = word_in(in_key + 20); - 
cx->k_sch[5] = ff(ss[5]); - kdf6(cx->k_sch, 0); kd6(cx->k_sch, 1); - kd6(cx->k_sch, 2); kd6(cx->k_sch, 3); - kd6(cx->k_sch, 4); kd6(cx->k_sch, 5); - kd6(cx->k_sch, 6); kdl6(cx->k_sch, 7); - cx->n_rnd = 12; break; - case 32: ss[4] = word_in(in_key + 16); - cx->k_sch[4] = ff(ss[4]); - ss[5] = word_in(in_key + 20); - cx->k_sch[5] = ff(ss[5]); - ss[6] = word_in(in_key + 24); - cx->k_sch[6] = ff(ss[6]); - ss[7] = word_in(in_key + 28); - cx->k_sch[7] = ff(ss[7]); - kdf8(cx->k_sch, 0); kd8(cx->k_sch, 1); - kd8(cx->k_sch, 2); kd8(cx->k_sch, 3); - kd8(cx->k_sch, 4); kd8(cx->k_sch, 5); - kdl8(cx->k_sch, 6); - cx->n_rnd = 14; break; - default: cx->n_rnd = 0; return aes_bad; - } -#else - { aes_32t i, l; - cx->n_rnd = ((klen >> 2) > nc ? (klen >> 2) : nc) + 6; - l = (nc * cx->n_rnd + nc - 1) / (klen >> 2); - - switch(klen) - { - case 16: - for(i = 0; i < l; ++i) - ke4(cx->k_sch, i); - break; - case 24: cx->k_sch[4] = ss[4] = word_in(in_key + 16); - cx->k_sch[5] = ss[5] = word_in(in_key + 20); - for(i = 0; i < l; ++i) - ke6(cx->k_sch, i); - break; - case 32: cx->k_sch[4] = ss[4] = word_in(in_key + 16); - cx->k_sch[5] = ss[5] = word_in(in_key + 20); - cx->k_sch[6] = ss[6] = word_in(in_key + 24); - cx->k_sch[7] = ss[7] = word_in(in_key + 28); - for(i = 0; i < l; ++i) - ke8(cx->k_sch, i); - break; - default: cx->n_rnd = 0; return aes_bad; - } -#if (DEC_ROUND != NO_TABLES) - for(i = nc; i < nc * cx->n_rnd; ++i) - cx->k_sch[i] = inv_mcol(cx->k_sch[i]); + for(i = 0; i < 6; ++i) + k8e(cx->ks, i); + k8ef(cx->ks, 6); +#if !(DEC_ROUND == NO_TABLES) + for(i = N_COLS; i < 14 * N_COLS; ++i) + cx->ks[i] = inv_mcol(cx->ks[i]); #endif } #endif + cx->inf.l = 0; + cx->inf.b[0] = 14 * 16; - return aes_good; +#ifdef USE_VIA_ACE_IF_PRESENT + if(VIA_ACE_AVAILABLE) + cx->inf.b[1] = 0xff; +#endif + return EXIT_SUCCESS; } #endif + +#endif + +#if defined( AES_VAR ) + +AES_RETURN aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) +{ + switch(key_len) + { + case 16: case 128: return aes_encrypt_key128(key, cx); + case 24: case 192: return aes_encrypt_key192(key, cx); + case 32: case 256: return aes_encrypt_key256(key, cx); + default: return EXIT_FAILURE; + } +} + +AES_RETURN aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) +{ + switch(key_len) + { + case 16: case 128: return aes_decrypt_key128(key, cx); + case 24: case 192: return aes_decrypt_key192(key, cx); + case 32: case 256: return aes_decrypt_key256(key, cx); + default: return EXIT_FAILURE; + } +} + +#endif + +#if defined(__cplusplus) +} +#endif diff --git a/aesopt.h b/aesopt.h index 485d5ec3e7..d3ac0f9be1 100644 --- a/aesopt.h +++ b/aesopt.h @@ -1,294 +1,126 @@ /* - ------------------------------------------------------------------------- - Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. - All rights reserved. +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. - LICENSE TERMS +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; - 1. 
distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and fitness for purpose. - ------------------------------------------------------------------------- - Issue Date: 29/07/2002 +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 This file contains the compilation options for AES (Rijndael) and code that is common across encryption, key scheduling and table generation. - OPERATION + OPERATION - These source code files implement the AES algorithm Rijndael designed by - Joan Daemen and Vincent Rijmen. The version in aes.c is designed for - block and key sizes of 128, 192 and 256 bits (16, 24 and 32 bytes) while - that in aespp.c provides for block and keys sizes of 128, 160, 192, 224 - and 256 bits (16, 20, 24, 28 and 32 bytes). This file is a common header - file for these two implementations and for aesref.c, which is a reference - implementation. + These source code files implement the AES algorithm Rijndael designed by + Joan Daemen and Vincent Rijmen. This version is designed for the standard + block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 + and 32 bytes). - This version is designed for flexibility and speed using operations on - 32-bit words rather than operations on bytes. It provides aes_both fixed - and dynamic block and key lengths and can also run with either big or - little endian internal byte order (see aes.h). It inputs block and key - lengths in bytes with the legal values being 16, 24 and 32 for aes.c and - 16, 20, 24, 28 and 32 for aespp.c + This version is designed for flexibility and speed using operations on + 32-bit words rather than operations on bytes. It can be compiled with + either big or little endian internal byte order but is faster when the + native byte order for the processor is used. - THE CIPHER INTERFACE + THE CIPHER INTERFACE - aes_08t (an unsigned 8-bit type) - aes_32t (an unsigned 32-bit type) - aes_fret (a signed 16 bit type for function return values) - aes_good (value != 0, a good return) - aes_bad (value == 0, an error return) - struct aes_ctx (structure for the cipher encryption context) - struct aes_ctx (structure for the cipher decryption context) - aes_rval the function return type (aes_fret if not DLL) + The cipher interface is implemented as an array of bytes in which lower + AES bit sequence indexes map to higher numeric significance within bytes. 
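A minimal sketch of how the calls listed below fit together, assuming the
256-bit key option is compiled in (aes_init() is only needed when dynamic
tables are configured; error checking omitted):

    #include "aes.h"

    unsigned char key[32];          /* 256-bit key supplied by the caller */
    unsigned char pt[16], ct[16];   /* one 16-byte block in, one block out */
    aes_encrypt_ctx ecx[1];

    aes_init();                     /* builds the tables unless FIXED_TABLES is set */
    aes_encrypt_key256(key, ecx);   /* expand the key schedule */
    aes_encrypt(pt, ct, ecx);       /* encrypt a single block */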
- C subroutine calls:
+ uint8_t (an unsigned 8-bit type)
+ uint32_t (an unsigned 32-bit type)
+ struct aes_encrypt_ctx (structure for the cipher encryption context)
+ struct aes_decrypt_ctx (structure for the cipher decryption context)
+ AES_RETURN the function return type
- aes_rval aes_blk_len(unsigned int blen, aes_ctx cx[1]);
- aes_rval aes_enc_key(const unsigned char in_key[], unsigned int klen, aes_ctx cx[1]);
- aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
+ C subroutine calls:
- aes_rval aes_dec_len(unsigned int blen, aes_ctx cx[1]);
- aes_rval aes_dec_key(const unsigned char in_key[], unsigned int klen, aes_ctx cx[1]);
- aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
+ AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
+ AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]);
+ AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
+ AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out,
+ const aes_encrypt_ctx cx[1]);
- IMPORTANT NOTE: If you are using this C interface and your compiler does
- not set the memory used for objects to zero before use, you will need to
- ensure that cx.s_flg is set to zero before using these subroutine calls.
+ AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
+ AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]);
+ AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
+ AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out,
+ const aes_decrypt_ctx cx[1]);
- C++ aes class subroutines:
+ IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that
+ you call aes_init() before AES is used so that the tables are initialised.
- class AESclass for encryption
- class AESclass for decryption
+ C++ aes class subroutines:
- aes_rval len(unsigned int blen = 16);
- aes_rval key(const unsigned char in_key[], unsigned int klen);
- aes_rval blk(const unsigned char in_blk[], unsigned char out_blk[]);
+ Class AESencrypt for encryption
- aes_rval len(unsigned int blen = 16);
- aes_rval key(const unsigned char in_key[], unsigned int klen);
- aes_rval blk(const unsigned char in_blk[], unsigned char out_blk[]);
+ Constructors:
+ AESencrypt(void)
+ AESencrypt(const unsigned char *key) - 128 bit key
+ Members:
+ AES_RETURN key128(const unsigned char *key)
+ AES_RETURN key192(const unsigned char *key)
+ AES_RETURN key256(const unsigned char *key)
+ AES_RETURN encrypt(const unsigned char *in, unsigned char *out) const
- The block length inputs to set_block and set_key are in numbers of
- BYTES, not bits. The calls to subroutines must be made in the above
- order but multiple calls can be made without repeating earlier calls
- if their parameters have not changed. If the cipher block length is
- variable but set_blk has not been called before cipher operations a
- value of 16 is assumed (that is, the AES block size). In contrast to
- earlier versions the block and key length parameters are now checked
- for correctness and the encryption and decryption routines check to
- ensure that an appropriate key has been set before they are called.
-
- COMPILATION
-
- The files used to provide AES (Rijndael) are
-
- a. aes.h for the definitions needed for use in C.
- b. aescpp.h for the definitions needed for use in C++.
- c. aesopt.h for setting compilation options (also includes common
- code).
- d. aescrypt.c for encryption and decrytpion, or
- e. aescrypt.asm for encryption and decryption using assembler code.
- f. aeskey.c for key scheduling.
- g. aestab.c for table loading or generation.
-
- The assembler code uses the NASM assembler. The above files provice
- block and key lengths of 16, 24 and 32 bytes (128, 192 and 256 bits).
- If aescrypp.c and aeskeypp.c are used instead of aescrypt.c and
- aeskey.c respectively, the block and key lengths can then be 16, 20,
- 24, 28 or 32 bytes. However this code has not been optimised to the
- same extent and is hence slower (esepcially for the AES block size
- of 16 bytes).
-
- To compile AES (Rijndael) for use in C code use aes.h and exclude
- the AES_DLL define in aes.h
-
- To compile AES (Rijndael) for use in in C++ code use aescpp.h and
- exclude the AES_DLL define in aes.h
-
- To compile AES (Rijndael) in C as a Dynamic Link Library DLL) use
- aes.h, include the AES_DLL define and compile the DLL. If using
- the test files to test the DLL, exclude aes.c from the test build
- project and compile it with the same defines as used for the DLL
- (ensure that the DLL path is correct)
-
- CONFIGURATION OPTIONS (here and in aes.h)
-
- a. define BLOCK_SIZE in aes.h to set the cipher block size (16, 24
- or 32 for the standard code, or 16, 20, 24, 28 or 32 for the
- extended code) or leave this undefined for dynamically variable
- block size (this will result in much slower code).
- b. set AES_DLL in aes.h if AES (Rijndael) is to be compiled as a DLL
- c. You may need to set PLATFORM_BYTE_ORDER to define the byte order.
- d. If you want the code to run in a specific internal byte order, then
- INTERNAL_BYTE_ORDER must be set accordingly.
- e. set other configuration options decribed below.
+ Class AESdecrypt for decryption
+ Constructors:
+ AESdecrypt(void)
+ AESdecrypt(const unsigned char *key) - 128 bit key
+ Members:
+ AES_RETURN key128(const unsigned char *key)
+ AES_RETURN key192(const unsigned char *key)
+ AES_RETURN key256(const unsigned char *key)
+ AES_RETURN decrypt(const unsigned char *in, unsigned char *out) const
 */
-#ifndef _AESOPT_H
+#if !defined( _AESOPT_H )
 #define _AESOPT_H

-/* START OF CONFIGURATION OPTIONS
+#if defined( __cplusplus )
+#include "aescpp.h"
+#else
+#include "aes.h"
+#endif

- USE OF DEFINES
+/* PLATFORM SPECIFIC INCLUDES */
+
+#include "brg_endian.h"
+
+/* CONFIGURATION - THE USE OF DEFINES

 Later in this section there are a number of defines that control the
 operation of the code. In each section, the purpose of each define is
 explained so that the relevant form can be included or excluded by
 setting either 1's or 0's respectively on the branches of the related
- #if clauses.
+ #if clauses. The following local defines should not be changed.
 */

-/* 1.
PLATFORM SPECIFIC INCLUDES */ +#define ENCRYPTION_IN_C 1 +#define DECRYPTION_IN_C 2 +#define ENC_KEYING_IN_C 4 +#define DEC_KEYING_IN_C 8 -#include "aes.h" -#define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN +#define NO_TABLES 0 +#define ONE_TABLE 1 +#define FOUR_TABLES 4 +#define NONE 0 +#define PARTIAL 1 +#define FULL 2 -/* -#if defined( __CRYPTLIB__ ) && !defined( INC_ALL ) && !defined( INC_CHILD ) -#include "crypt/aes.h" -#else - #include "aes.h" -#endif +/* --- START OF USER CONFIGURED OPTIONS --- */ -#if defined(__GNUC__) || defined(__GNU_LIBRARY__) -//# include -//# include -#elif defined(__CRYPTLIB__) -# if defined( INC_ALL ) -# include "crypt.h" -# elif defined( INC_CHILD ) -# include "../crypt.h" -# else -# include "crypt.h" -# endif -# if defined(DATA_LITTLEENDIAN) -# define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -# else -# define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -# endif -#elif defined(_MSC_VER) -# include -#elif !defined(WIN32) -# include -# if !defined (_ENDIAN_H) -# include -# else -# include _ENDIAN_H -# endif -#endif -*/ - -/* 2. BYTE ORDER IN 32-BIT WORDS - - To obtain the highest speed on processors with 32-bit words, this code - needs to determine the order in which bytes are packed into such words. - The following block of code is an attempt to capture the most obvious - ways in which various environemnts define byte order. It may well fail, - in which case the definitions will need to be set by editing at the - points marked **** EDIT HERE IF NECESSARY **** below. -*/ -#define AES_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define AES_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if !defined(PLATFORM_BYTE_ORDER) -#if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) -# if defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# if defined(BYTE_ORDER) -# if (BYTE_ORDER == LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -# elif (BYTE_ORDER == BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -# endif -# endif -# elif defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -# endif -#elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) -# if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# if defined(_BYTE_ORDER) -# if (_BYTE_ORDER == _LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -# elif (_BYTE_ORDER == _BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -# endif -# endif -# elif defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -# endif -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -#define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -#define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -/* -#elif (('1234' >> 24) == '1') -# define PLATFORM_BYTE_ORDER AES_LITTLE_ENDIAN -#elif (('4321' >> 24) == '1') -# define PLATFORM_BYTE_ORDER AES_BIG_ENDIAN -*/ -#endif -#endif - -#if !defined(PLATFORM_BYTE_ORDER) -# error Please set undetermined byte order (lines 233 or 235 of aesopt.h). -#endif - -/* 3. ASSEMBLER SUPPORT - - If the assembler code is used for encryption and decryption this file only - provides key scheduling so the following defines are used -*/ -#ifdef AES_ASM -#define ENCRYPTION_KEY_SCHEDULE -#define DECRYPTION_KEY_SCHEDULE -#else - -/* 4. 
FUNCTIONS REQUIRED
-
- This implementation provides five main subroutines which provide for
- setting block length, setting encryption and decryption keys and for
- encryption and decryption. When the assembler code is not being used
- the following definition blocks allow the selection of the routines
- that are to be included in the compilation.
-*/
-#if 1
-#define ENCRYPTION_KEY_SCHEDULE
-#endif
-
-#if 1
-#define DECRYPTION_KEY_SCHEDULE
-#endif
-
-#if 1
-#define ENCRYPTION
-#endif
-
-#if 1
-#define DECRYPTION
-#endif
-
-#endif
-
-/* 5. BYTE ORDER WITHIN 32 BIT WORDS
+/* 1. BYTE ORDER WITHIN 32 BIT WORDS

 The fundamental data processing units in Rijndael are 8-bit bytes. The
 input, output and key input are all enumerated arrays of bytes in which
@@ -314,16 +146,111 @@
 machine on which it runs. Normally the internal byte order will be set
 to the order of the processor on which the code is to be run but this
 define can be used to reverse this in special situations
+
+ WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
+ This define will hence be redefined later (in section 4) if necessary
 */
+
 #if 1
-#define INTERNAL_BYTE_ORDER PLATFORM_BYTE_ORDER
-#elif defined(AES_LITTLE_ENDIAN)
-#define INTERNAL_BYTE_ORDER AES_LITTLE_ENDIAN
-#elif defined(AES_BIG_ENDIAN)
-#define INTERNAL_BYTE_ORDER AES_BIG_ENDIAN
+# define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#elif 0
+# define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0
+# define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+# error The algorithm byte order is not defined
 #endif

-/* 6. FAST INPUT/OUTPUT OPERATIONS.
+/* 2. Intel AES AND VIA ACE SUPPORT */
+
+#if defined( __GNUC__ ) && defined( __i386__ ) \
+ || defined(_WIN32) && defined(_M_IX86) \
+ && !(defined(_WIN64) || defined(_WIN32_WCE) || defined(_MSC_VER) && (_MSC_VER <= 800))
+# define VIA_ACE_POSSIBLE
+#endif
+
+/* Define this option if support for the Intel AESNI is required (not
+ currently available with GCC). If AESNI is known to be present, then
+ defining ASSUME_INTEL_AES_VIA_PRESENT will replace the ordinary
+ encryption/decryption. If USE_INTEL_AES_IF_PRESENT is defined then
+ AESNI will be used if it is detected (both present and enabled).
+
+ AESNI uses a decryption key schedule with the first decryption
+ round key at the high end of the key schedule with the following
+ round keys at lower positions in memory. So AES_REV_DKS must NOT
+ be defined when AESNI will be used. Although it is unlikely that
+ assembler code will be used with an AESNI build, if it is then
+ AES_REV_DKS must NOT be defined when such assembler files are
+ built
+*/
+#if 1 && defined( _WIN64 ) && defined( _MSC_VER )
+# define INTEL_AES_POSSIBLE
+#endif
+
+#if defined( INTEL_AES_POSSIBLE ) && !defined( USE_INTEL_AES_IF_PRESENT )
+# define USE_INTEL_AES_IF_PRESENT
+#endif
+
+/* Define this option if support for the VIA ACE is required. This uses
+ inline assembler instructions and is only implemented for the Microsoft,
+ Intel and GCC compilers. If VIA ACE is known to be present, then defining
+ ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
+ code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
+ it is detected (both present and enabled) but the normal AES code will
+ also be present.
+
+ When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
+ aligned; other input/output buffers do not need to be 16 byte aligned
+ but there are very large performance gains if this can be arranged.
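One possible way to meet the 16-byte context alignment requirement, sketched
here for a GCC-style compiler (MSVC builds would use __declspec(align(16))
instead):

    /* force the encryption context onto a 16-byte boundary */
    static aes_encrypt_ctx ecx[1] __attribute__((aligned(16)));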
+ VIA ACE also requires the decryption key schedule to be in reverse
+ order (which later checks below ensure).
+
+ AES_REV_DKS must be set for assembler code used with a VIA ACE build
+*/
+
+#if 1 && defined( VIA_ACE_POSSIBLE ) && !defined( USE_VIA_ACE_IF_PRESENT )
+# define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if 0 && defined( VIA_ACE_POSSIBLE ) && !defined( ASSUME_VIA_ACE_PRESENT )
+# define ASSUME_VIA_ACE_PRESENT
+# endif
+
+/* 3. ASSEMBLER SUPPORT
+
+ This define (which can be on the command line) enables the use of the
+ assembler code routines for encryption, decryption and key scheduling
+ as follows:
+
+ ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
+ encryption and decryption but with key scheduling in C
+ ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for
+ encryption, decryption and key scheduling
+ ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
+ encryption and decryption but with key scheduling in C
+ ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
+ encryption and decryption but with key scheduling in C
+
+ Change one 'if 0' below to 'if 1' to select the version or define
+ as a compilation option.
+*/
+
+#if 0 && !defined( ASM_X86_V1C )
+# define ASM_X86_V1C
+#elif 0 && !defined( ASM_X86_V2 )
+# define ASM_X86_V2
+#elif 0 && !defined( ASM_X86_V2C )
+# define ASM_X86_V2C
+#elif 0 && !defined( ASM_AMD64_C )
+# define ASM_AMD64_C
+#endif
+
+#if (defined ( ASM_X86_V1C ) || defined( ASM_X86_V2 ) || defined( ASM_X86_V2C )) \
+ && !defined( _M_IX86 ) || defined( ASM_AMD64_C ) && !defined( _M_X64 )
+# error Assembler code is only available for x86 and AMD64 systems
+#endif
+
+/* 4. FAST INPUT/OUTPUT OPERATIONS.

 On some machines it is possible to improve speed by transferring the
 bytes in the input and output arrays to and from the internal 32-bit
@@ -339,11 +266,11 @@
 assumed that access to byte arrays as if they are arrays of 32-bit
 words will not cause problems when such accesses are misaligned.
 */
-#if 1
-#define SAFE_IO
+#if 1 && !defined( _MSC_VER )
+# define SAFE_IO
 #endif

-/* 7. LOOP UNROLLING
+/* 5. LOOP UNROLLING

 The code for encryption and decrytpion cycles through a number of rounds
 that can be implemented either in a loop or by expanding the code into a
@@ -355,41 +282,39 @@
 to be set independently for encryption and decryption
 */
 #if 1
-#define ENC_UNROLL FULL
+# define ENC_UNROLL FULL
 #elif 0
-#define ENC_UNROLL PARTIAL
+# define ENC_UNROLL PARTIAL
 #else
-#define ENC_UNROLL NONE
+# define ENC_UNROLL NONE
 #endif

 #if 1
-#define DEC_UNROLL FULL
+# define DEC_UNROLL FULL
 #elif 0
-#define DEC_UNROLL PARTIAL
+# define DEC_UNROLL PARTIAL
 #else
-#define DEC_UNROLL NONE
+# define DEC_UNROLL NONE
 #endif

-/* 8. FIXED OR DYNAMIC TABLES
-
- When this section is included the tables used by the code are comipled
- statically into the binary file. Otherwise they are computed once when
- the code is first used.
-*/
 #if 1
-#define FIXED_TABLES
+# define ENC_KS_UNROLL
 #endif

-/* 9. FAST FINITE FIELD OPERATIONS
+#if 1
+# define DEC_KS_UNROLL
+#endif
+
+/* 6. FAST FINITE FIELD OPERATIONS

 If this section is included, tables are used to provide faster finite
 field arithmetic (this has no effect if FIXED_TABLES is defined).
 */
 #if 1
-#define FF_TABLES
+# define FF_TABLES
 #endif

-/* 10. INTERNAL STATE VARIABLE FORMAT
+/* 7. INTERNAL STATE VARIABLE FORMAT

 The internal state of Rijndael is stored in a number of local 32-bit
 word varaibles which can be defined either as an array or as individual
@@ -397,37 +322,54 @@
 varaibles in arrays. Otherwise individual local variables will be used.
 */
 #if 1
-#define ARRAYS
+# define ARRAYS
 #endif

-/* In this implementation the columns of the state array are each held in
- 32-bit words. The state array can be held in various ways: in an array
- of words, in a number of individual word variables or in a number of
- processor registers. The following define maps a variable name x and
- a column number c to the way the state array variable is to be held.
- The first define below maps the state into an array x[c] whereas the
- second form maps the state into a number of individual variables x0,
- x1, etc. Another form could map individual state colums to machine
- register names.
-*/
+/* 8. FIXED OR DYNAMIC TABLES

-#if defined(ARRAYS)
-#define s(x,c) x[c]
+ When this section is included the tables used by the code are compiled
+ statically into the binary file. Otherwise the subroutine aes_init()
+ must be called to compute them before the code is first used.
+*/
+#if 1 && !(defined( _MSC_VER ) && ( _MSC_VER <= 800 ))
+# define FIXED_TABLES
+#endif
+
+/* 9. MASKING OR CASTING FROM LONGER VALUES TO BYTES
+
+ In some systems it is better to mask longer values to extract bytes
+ rather than using a cast. This option allows this choice.
+*/
+#if 0
+# define to_byte(x) ((uint8_t)(x))
 #else
-#define s(x,c) x##c
+# define to_byte(x) ((x) & 0xff)
 #endif

-/* 11. VARIABLE BLOCK SIZE SPEED
+/* 10. TABLE ALIGNMENT

- This section is only relevant if you wish to use the variable block
- length feature of the code. Include this section if you place more
- emphasis on speed rather than code size.
+ On some systems speed will be improved by aligning the AES large lookup
+ tables on particular boundaries. This define should be set to a power of
+ two giving the desired alignment. It can be left undefined if alignment
+ is not needed. This option is specific to the Microsoft VC++ compiler -
+ it seems to sometimes cause trouble for the VC++ version 6 compiler.
 */
-#if 1
-#define FAST_VARIABLE
+
+#if 1 && defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
+# define TABLE_ALIGN 32
 #endif

-/* 12. INTERNAL TABLE CONFIGURATION
+/* 11. REDUCE CODE AND TABLE SIZE
+
+ This replaces some expanded macros with function calls if ASM_X86_V2 or
+ ASM_X86_V2C are defined
+*/
+
+#if 1 && (defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ))
+# define REDUCE_CODE_SIZE
+#endif
+
+/* 12. TABLE OPTIONS

 This cipher proceeds by repeating in a number of cycles known as 'rounds'
 which are implemented by a round function which can optionally be speeded
@@ -448,35 +390,35 @@
 */
 #if 1 /* set tables for the normal encryption round */
-#define ENC_ROUND FOUR_TABLES
+# define ENC_ROUND FOUR_TABLES
 #elif 0
-#define ENC_ROUND ONE_TABLE
+# define ENC_ROUND ONE_TABLE
 #else
-#define ENC_ROUND NO_TABLES
+# define ENC_ROUND NO_TABLES
 #endif

 #if 1 /* set tables for the last encryption round */
-#define LAST_ENC_ROUND FOUR_TABLES
+# define LAST_ENC_ROUND FOUR_TABLES
 #elif 0
-#define LAST_ENC_ROUND ONE_TABLE
+# define LAST_ENC_ROUND ONE_TABLE
 #else
-#define LAST_ENC_ROUND NO_TABLES
+# define LAST_ENC_ROUND NO_TABLES
 #endif

 #if 1 /* set tables for the normal decryption round */
-#define DEC_ROUND FOUR_TABLES
+# define DEC_ROUND FOUR_TABLES
 #elif 0
-#define DEC_ROUND ONE_TABLE
+# define DEC_ROUND ONE_TABLE
 #else
-#define DEC_ROUND NO_TABLES
+# define DEC_ROUND NO_TABLES
 #endif

 #if 1 /* set tables for the last decryption round */
-#define LAST_DEC_ROUND FOUR_TABLES
+# define LAST_DEC_ROUND FOUR_TABLES
 #elif 0
-#define LAST_DEC_ROUND ONE_TABLE
+# define LAST_DEC_ROUND ONE_TABLE
 #else
-#define LAST_DEC_ROUND NO_TABLES
+# define LAST_DEC_ROUND NO_TABLES
 #endif

 /* The decryption key schedule can be speeded up with tables in the same
@@ -484,58 +426,134 @@
 defines to set this requirement.
 */
 #if 1
-#define KEY_SCHED FOUR_TABLES
+# define KEY_SCHED FOUR_TABLES
 #elif 0
-#define KEY_SCHED ONE_TABLE
+# define KEY_SCHED ONE_TABLE
 #else
-#define KEY_SCHED NO_TABLES
+# define KEY_SCHED NO_TABLES
 #endif

+/* ---- END OF USER CONFIGURED OPTIONS ---- */
+
+/* VIA ACE support is only available for VC++ and GCC */
+
+#if !defined( _MSC_VER ) && !defined( __GNUC__ )
+# if defined( ASSUME_VIA_ACE_PRESENT )
+# undef ASSUME_VIA_ACE_PRESENT
+# endif
+# if defined( USE_VIA_ACE_IF_PRESENT )
+# undef USE_VIA_ACE_IF_PRESENT
+# endif
+#endif
+
+#if defined( ASSUME_VIA_ACE_PRESENT ) && !defined( USE_VIA_ACE_IF_PRESENT )
+# define USE_VIA_ACE_IF_PRESENT
+#endif
+
+/* define to reverse decryption key schedule */
+#if 1 || defined( USE_VIA_ACE_IF_PRESENT ) && !defined ( AES_REV_DKS )
+# define AES_REV_DKS
+#endif
+
+/* Intel AESNI uses a decryption key schedule in the encryption order */
+#if defined( USE_INTEL_AES_IF_PRESENT ) && defined ( AES_REV_DKS )
+# undef AES_REV_DKS
+#endif
+
+/* Assembler support requires the use of platform byte order */
+
+#if ( defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) ) \
+ && (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
+# undef ALGORITHM_BYTE_ORDER
+# define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#endif
+
+/* In this implementation the columns of the state array are each held in
+ 32-bit words. The state array can be held in various ways: in an array
+ of words, in a number of individual word variables or in a number of
+ processor registers. The following define maps a variable name x and
+ a column number c to the way the state array variable is to be held.
+ The first define below maps the state into an array x[c] whereas the
+ second form maps the state into a number of individual variables x0,
+ x1, etc. Another form could map individual state columns to machine
+ register names.
+*/
+
+#if defined( ARRAYS )
+# define s(x,c) x[c]
+#else
+# define s(x,c) x##c
+#endif
+
+/* This implementation provides subroutines for encryption, decryption
+ and for setting the three key lengths (separately) for encryption
+ and decryption.
Since not all functions are needed, masks are set + up here to determine which will be implemented in C +*/ + +#if !defined( AES_ENCRYPT ) +# define EFUNCS_IN_C 0 +#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) \ + || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) +# define EFUNCS_IN_C ENC_KEYING_IN_C +#elif !defined( ASM_X86_V2 ) +# define EFUNCS_IN_C ( ENCRYPTION_IN_C | ENC_KEYING_IN_C ) +#else +# define EFUNCS_IN_C 0 +#endif + +#if !defined( AES_DECRYPT ) +# define DFUNCS_IN_C 0 +#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) \ + || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) +# define DFUNCS_IN_C DEC_KEYING_IN_C +#elif !defined( ASM_X86_V2 ) +# define DFUNCS_IN_C ( DECRYPTION_IN_C | DEC_KEYING_IN_C ) +#else +# define DFUNCS_IN_C 0 +#endif + +#define FUNCS_IN_C ( EFUNCS_IN_C | DFUNCS_IN_C ) + /* END OF CONFIGURATION OPTIONS */ -#define NO_TABLES 0 /* DO NOT CHANGE */ -#define ONE_TABLE 1 /* DO NOT CHANGE */ -#define FOUR_TABLES 4 /* DO NOT CHANGE */ -#define NONE 0 /* DO NOT CHANGE */ -#define PARTIAL 1 /* DO NOT CHANGE */ -#define FULL 2 /* DO NOT CHANGE */ +#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) -#if defined(BLOCK_SIZE) && ((BLOCK_SIZE & 3) || BLOCK_SIZE < 16 || BLOCK_SIZE > 32) -#error An illegal block size has been specified. -#endif - -#if !defined(BLOCK_SIZE) -#define RC_LENGTH 29 -#else -#define RC_LENGTH 5 * BLOCK_SIZE / 4 - (BLOCK_SIZE == 16 ? 10 : 11) -#endif - -/* Disable at least some poor combinations of options */ +/* Disable or report errors on some combinations of options */ #if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND NO_TABLES +# undef LAST_ENC_ROUND +# define LAST_ENC_ROUND NO_TABLES #elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND ONE_TABLE +# undef LAST_ENC_ROUND +# define LAST_ENC_ROUND ONE_TABLE #endif #if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE -#undef ENC_UNROLL -#define ENC_UNROLL NONE +# undef ENC_UNROLL +# define ENC_UNROLL NONE #endif #if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND NO_TABLES +# undef LAST_DEC_ROUND +# define LAST_DEC_ROUND NO_TABLES #elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND ONE_TABLE +# undef LAST_DEC_ROUND +# define LAST_DEC_ROUND ONE_TABLE #endif #if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE -#undef DEC_UNROLL -#define DEC_UNROLL NONE +# undef DEC_UNROLL +# define DEC_UNROLL NONE +#endif + +#if defined( bswap32 ) +# define aes_sw32 bswap32 +#elif defined( bswap_32 ) +# define aes_sw32 bswap_32 +#else +# define brot(x,n) (((uint32_t)(x) << n) | ((uint32_t)(x) >> (32 - n))) +# define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) #endif /* upr(x,n): rotates bytes within words by n positions, moving bytes to @@ -544,54 +562,38 @@ words but without wrap around bval(x,n): extracts a byte from a word - NOTE: The definitions given here are intended only for use with + WARNING: The definitions given here are intended only for use with unsigned variables and with shift counts that are compile time constants */ -#if (INTERNAL_BYTE_ORDER == AES_LITTLE_ENDIAN) -#if defined(_MSC_VER) -#define upr(x,n) _lrotl((aes_32t)(x), 8 * (n)) +#if ( ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN ) +# define upr(x,n) (((uint32_t)(x) << (8 * (n))) | ((uint32_t)(x) >> (32 - 8 * (n)))) +# define ups(x,n) ((uint32_t) (x) << (8 * (n))) +# define 
bval(x,n) to_byte((x) >> (8 * (n))) +# define bytes2word(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) +#endif + +#if ( ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN ) +# define upr(x,n) (((uint32_t)(x) >> (8 * (n))) | ((uint32_t)(x) << (32 - 8 * (n)))) +# define ups(x,n) ((uint32_t) (x) >> (8 * (n))) +# define bval(x,n) to_byte((x) >> (24 - 8 * (n))) +# define bytes2word(b0, b1, b2, b3) \ + (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | ((uint32_t)(b2) << 8) | (b3)) +#endif + +#if defined( SAFE_IO ) +# define word_in(x,c) bytes2word(((const uint8_t*)(x)+4*c)[0], ((const uint8_t*)(x)+4*c)[1], \ + ((const uint8_t*)(x)+4*c)[2], ((const uint8_t*)(x)+4*c)[3]) +# define word_out(x,c,v) { ((uint8_t*)(x)+4*c)[0] = bval(v,0); ((uint8_t*)(x)+4*c)[1] = bval(v,1); \ + ((uint8_t*)(x)+4*c)[2] = bval(v,2); ((uint8_t*)(x)+4*c)[3] = bval(v,3); } +#elif ( ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER ) +# define word_in(x,c) (*((uint32_t*)(x)+(c))) +# define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v)) #else -#define upr(x,n) ((aes_32t)(x) << 8 * (n) | (aes_32t)(x) >> 32 - 8 * (n)) -#endif -#define ups(x,n) ((aes_32t)(x) << 8 * (n)) -#define bval(x,n) ((aes_08t)((x) >> 8 * (n))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b3) << 24) | ((aes_32t)(b2) << 16) | ((aes_32t)(b1) << 8) | (b0)) -#endif - -#if (INTERNAL_BYTE_ORDER == AES_BIG_ENDIAN) -#define upr(x,n) ((aes_32t)(x) >> 8 * (n) | (aes_32t)(x) << 32 - 8 * (n)) -#define ups(x,n) ((aes_32t)(x) >> 8 * (n))) -#define bval(x,n) ((aes_08t)((x) >> 24 - 8 * (n))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b0) << 24) | ((aes_32t)(b1) << 16) | ((aes_32t)(b2) << 8) | (b3)) -#endif - -#if defined(SAFE_IO) - -#define word_in(x) bytes2word((x)[0], (x)[1], (x)[2], (x)[3]) -#define word_out(x,v) { (x)[0] = bval(v,0); (x)[1] = bval(v,1); \ - (x)[2] = bval(v,2); (x)[3] = bval(v,3); } - -#elif (INTERNAL_BYTE_ORDER == PLATFORM_BYTE_ORDER) - -#define word_in(x) *(aes_32t*)(x) -#define word_out(x,v) *(aes_32t*)(x) = (v) - -#else - -#if !defined(bswap_32) -#if !defined(_MSC_VER) -#define _lrotl(x,n) ((aes_32t)(x) << n | (aes_32t)(x) >> 32 - n) -#endif -#define bswap_32(x) ((_lrotl((x),8) & 0x00ff00ff) | (_lrotl((x),24) & 0xff00ff00)) -#endif - -#define word_in(x) bswap_32(*(aes_32t*)(x)) -#define word_out(x,v) *(aes_32t*)(x) = bswap_32(v) - +# define word_in(x,c) aes_sw32(*((uint32_t*)(x)+(c))) +# define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = aes_sw32(v)) #endif /* the finite field modular polynomial and elements */ @@ -601,173 +603,101 @@ /* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ -#define m1 0x80808080 -#define m2 0x7f7f7f7f -#define FFmulX(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) +#define gf_c1 0x80808080 +#define gf_c2 0x7f7f7f7f +#define gf_mulx(x) ((((x) & gf_c2) << 1) ^ ((((x) & gf_c1) >> 7) * BPOLY)) -/* The following defines provide alternative definitions of FFmulX that might +/* The following defines provide alternative definitions of gf_mulx that might give improved performance if a fast 32-bit multiply is not available. Note - that a temporary variable u needs to be defined where FFmulX is used. + that a temporary variable u needs to be defined where gf_mulx is used. 
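As a worked check of the main form above (assuming BPOLY is the byte
reduction constant 0x1b): lanes holding 0x01 simply double, while lanes
holding 0x80 overflow and are reduced by the field polynomial, so

    gf_mulx(0x01010101) == 0x02020202
    gf_mulx(0x80808080) == 0x1b1b1b1b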
-#define FFmulX(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -#define m4 (0x01010101 * BPOLY) -#define FFmulX(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) +#define gf_mulx(x) (u = (x) & gf_c1, u |= (u >> 1), ((x) & gf_c2) << 1) ^ ((u >> 3) | (u >> 6)) +#define gf_c4 (0x01010101 * BPOLY) +#define gf_mulx(x) (u = (x) & gf_c1, ((x) & gf_c2) << 1) ^ ((u - (u >> 7)) & gf_c4) */ /* Work out which tables are needed for the different options */ -#ifdef AES_ASM -#ifdef ENC_ROUND -#undef ENC_ROUND -#endif -#define ENC_ROUND FOUR_TABLES -#ifdef LAST_ENC_ROUND -#undef LAST_ENC_ROUND -#endif -#define LAST_ENC_ROUND FOUR_TABLES -#ifdef DEC_ROUND -#undef DEC_ROUND -#endif -#define DEC_ROUND FOUR_TABLES -#ifdef LAST_DEC_ROUND -#undef LAST_DEC_ROUND -#endif -#define LAST_DEC_ROUND FOUR_TABLES -#ifdef KEY_SCHED -#undef KEY_SCHED -#define KEY_SCHED FOUR_TABLES -#endif +#if defined( ASM_X86_V1C ) +# if defined( ENC_ROUND ) +# undef ENC_ROUND +# endif +# define ENC_ROUND FOUR_TABLES +# if defined( LAST_ENC_ROUND ) +# undef LAST_ENC_ROUND +# endif +# define LAST_ENC_ROUND FOUR_TABLES +# if defined( DEC_ROUND ) +# undef DEC_ROUND +# endif +# define DEC_ROUND FOUR_TABLES +# if defined( LAST_DEC_ROUND ) +# undef LAST_DEC_ROUND +# endif +# define LAST_DEC_ROUND FOUR_TABLES +# if defined( KEY_SCHED ) +# undef KEY_SCHED +# define KEY_SCHED FOUR_TABLES +# endif #endif -#if defined(ENCRYPTION) || defined(AES_ASM) -#if ENC_ROUND == ONE_TABLE -#define FT1_SET -#elif ENC_ROUND == FOUR_TABLES -#define FT4_SET -#else -#define SBX_SET -#endif -#if LAST_ENC_ROUND == ONE_TABLE -#define FL1_SET -#elif LAST_ENC_ROUND == FOUR_TABLES -#define FL4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif +#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) || defined( ASM_X86_V1C ) +# if ENC_ROUND == ONE_TABLE +# define FT1_SET +# elif ENC_ROUND == FOUR_TABLES +# define FT4_SET +# else +# define SBX_SET +# endif +# if LAST_ENC_ROUND == ONE_TABLE +# define FL1_SET +# elif LAST_ENC_ROUND == FOUR_TABLES +# define FL4_SET +# elif !defined( SBX_SET ) +# define SBX_SET +# endif #endif -#if defined(DECRYPTION) || defined(AES_ASM) -#if DEC_ROUND == ONE_TABLE -#define IT1_SET -#elif DEC_ROUND == FOUR_TABLES -#define IT4_SET -#else -#define ISB_SET -#endif -#if LAST_DEC_ROUND == ONE_TABLE -#define IL1_SET -#elif LAST_DEC_ROUND == FOUR_TABLES -#define IL4_SET -#elif !defined(ISB_SET) -#define ISB_SET -#endif +#if ( FUNCS_IN_C & DECRYPTION_IN_C ) || defined( ASM_X86_V1C ) +# if DEC_ROUND == ONE_TABLE +# define IT1_SET +# elif DEC_ROUND == FOUR_TABLES +# define IT4_SET +# else +# define ISB_SET +# endif +# if LAST_DEC_ROUND == ONE_TABLE +# define IL1_SET +# elif LAST_DEC_ROUND == FOUR_TABLES +# define IL4_SET +# elif !defined(ISB_SET) +# define ISB_SET +# endif #endif -#if defined(ENCRYPTION_KEY_SCHEDULE) || defined(DECRYPTION_KEY_SCHEDULE) -#if KEY_SCHED == ONE_TABLE -#define LS1_SET -#define IM1_SET -#elif KEY_SCHED == FOUR_TABLES -#define LS4_SET -#define IM4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -#ifdef FIXED_TABLES -#define prefx extern const -#else -#define prefx extern -extern aes_08t tab_init; -void gen_tabs(void); -#endif - -prefx aes_32t rcon_tab[29]; - -#ifdef SBX_SET -prefx aes_08t s_box[256]; -#endif - -#ifdef ISB_SET -prefx aes_08t inv_s_box[256]; -#endif - -#ifdef FT1_SET -prefx aes_32t ft_tab[256]; -#endif - -#ifdef FT4_SET -prefx aes_32t ft_tab[4][256]; -#endif - -#ifdef FL1_SET -prefx aes_32t fl_tab[256]; -#endif - -#ifdef FL4_SET -prefx aes_32t fl_tab[4][256]; -#endif - 
-#ifdef IT1_SET -prefx aes_32t it_tab[256]; -#endif - -#ifdef IT4_SET -prefx aes_32t it_tab[4][256]; -#endif - -#ifdef IL1_SET -prefx aes_32t il_tab[256]; -#endif - -#ifdef IL4_SET -prefx aes_32t il_tab[4][256]; -#endif - -#ifdef LS1_SET -#ifdef FL1_SET -#undef LS1_SET -#else -prefx aes_32t ls_tab[256]; -#endif -#endif - -#ifdef LS4_SET -#ifdef FL4_SET -#undef LS4_SET -#else -prefx aes_32t ls_tab[4][256]; -#endif -#endif - -#ifdef IM1_SET -prefx aes_32t im_tab[256]; -#endif - -#ifdef IM4_SET -prefx aes_32t im_tab[4][256]; -#endif - -/* Set the number of columns in nc. Note that it is important - that nc is a constant which is known at compile time if the - highest speed version of the code is needed. -*/ - -#if defined(BLOCK_SIZE) -#define nc (BLOCK_SIZE >> 2) -#else -#define nc (cx->n_blk >> 2) +#if !(defined( REDUCE_CODE_SIZE ) && (defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ))) +# if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C)) +# if KEY_SCHED == ONE_TABLE +# if !defined( FL1_SET ) && !defined( FL4_SET ) +# define LS1_SET +# endif +# elif KEY_SCHED == FOUR_TABLES +# if !defined( FL4_SET ) +# define LS4_SET +# endif +# elif !defined( SBX_SET ) +# define SBX_SET +# endif +# endif +# if (FUNCS_IN_C & DEC_KEYING_IN_C) +# if KEY_SCHED == ONE_TABLE +# define IM1_SET +# elif KEY_SCHED == FOUR_TABLES +# define IM4_SET +# elif !defined( SBX_SET ) +# define SBX_SET +# endif +# endif #endif /* generic definitions of Rijndael macros that use tables */ @@ -792,45 +722,48 @@ prefx aes_32t im_tab[4][256]; #define vf1(x,r,c) (x) #define rf1(r,c) (r) -#define rf2(r,c) ((r-c)&3) +#define rf2(r,c) ((8+r-c)&3) /* perform forward and inverse column mix operation on four bytes in long word x in */ /* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. 
*/ -#define dec_fmvars -#if defined(FM4_SET) /* not currently used */ -#define fwd_mcol(x) four_tables(x,fm_tab,vf1,rf1,0) -#elif defined(FM1_SET) /* not currently used */ -#define fwd_mcol(x) one_table(x,upr,fm_tab,vf1,rf1,0) +#if !(defined( REDUCE_CODE_SIZE ) && (defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ))) + +#if defined( FM4_SET ) /* not currently used */ +# define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) +#elif defined( FM1_SET ) /* not currently used */ +# define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) #else -#undef dec_fmvars -#define dec_fmvars aes_32t f1, f2; -#define fwd_mcol(x) (f1 = (x), f2 = FFmulX(f1), f2 ^ upr(f1 ^ f2, 3) ^ upr(f1, 2) ^ upr(f1, 1)) +# define dec_fmvars uint32_t g2 +# define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) #endif -#define dec_imvars -#if defined(IM4_SET) -#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0) -#elif defined(IM1_SET) -#define inv_mcol(x) one_table(x,upr,im_tab,vf1,rf1,0) +#if defined( IM4_SET ) +# define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) +#elif defined( IM1_SET ) +# define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) #else -#undef dec_imvars -#define dec_imvars aes_32t f2, f4, f8, f9; -#define inv_mcol(x) \ - (f9 = (x), f2 = FFmulX(f9), f4 = FFmulX(f2), f8 = FFmulX(f4), f9 ^= f8, \ - f2 ^= f4 ^ f8 ^ upr(f2 ^ f9,3) ^ upr(f4 ^ f9,2) ^ upr(f9,1)) +# define dec_imvars uint32_t g2, g4, g9 +# define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ + (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) #endif -#if defined(FL4_SET) -#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c) -#elif defined(LS4_SET) -#define ls_box(x,c) four_tables(x,ls_tab,vf1,rf2,c) -#elif defined(FL1_SET) -#define ls_box(x,c) one_table(x,upr,fl_tab,vf1,rf2,c) -#elif defined(LS1_SET) -#define ls_box(x,c) one_table(x,upr,ls_tab,vf1,rf2,c) +#if defined( FL4_SET ) +# define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) +#elif defined( LS4_SET ) +# define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) +#elif defined( FL1_SET ) +# define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) +#elif defined( LS1_SET ) +# define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) #else -#define ls_box(x,c) no_table(x,s_box,vf1,rf2,c) +# define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) +#endif + +#endif + +#if defined( ASM_X86_V1C ) && defined( AES_DECRYPT ) && !defined( ISB_SET ) +# define ISB_SET #endif #endif diff --git a/aestab.c b/aestab.c index cbfa70d20d..8fd11f94ad 100644 --- a/aestab.c +++ b/aestab.c @@ -1,181 +1,143 @@ /* -------------------------------------------------------------------------- -Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. -All rights reserved. +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. -LICENSE TERMS +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: -The free distribution and use of this software in both source and binary -form is allowed (with or without changes) provided that: + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; -1. distributions of this source code include the above copyright -notice, this list of conditions and the following disclaimer; - -2. 
distributions in binary form include the above copyright -notice, this list of conditions and the following disclaimer -in the documentation and/or other associated materials; - -3. the copyright holder's name is not used to endorse products -built using this software without specific written permission. - -DISCLAIMER + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. This software is provided 'as is' with no explicit or implied warranties -in respect of its properties, including, but not limited to, correctness +in respect of its operation, including, but not limited to, correctness and fitness for purpose. -------------------------------------------------------------------------- -Issue Date: 29/07/2002 +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 */ +#define DO_TABLES + +#include "aes.h" #include "aesopt.h" -#if defined(FIXED_TABLES) || !defined(FF_TABLES) - -/* finite field arithmetic operations */ - -#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) -#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) -#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \ -^ (((x>>5) & 4) * WPOLY)) -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#endif - #if defined(FIXED_TABLES) -#define sb_data(w) \ -w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ -w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ -w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ -w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ -w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ -w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ -w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ -w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ -w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ -w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ -w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ -w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ -w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ -w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ -w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ -w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ -w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ -w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ -w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ -w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ -w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ -w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ -w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ -w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ -w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ -w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ -w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ -w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), 
w(0xc1), w(0x1d), w(0x9e),\ -w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ -w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ -w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ -w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) +#define sb_data(w) {\ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ + w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } -#define isb_data(w) \ -w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\ -w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\ -w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\ -w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\ -w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\ -w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\ -w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\ -w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\ -w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\ -w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\ -w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), 
w(0xed), w(0xb9), w(0xda),\ -w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84),\ -w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\ -w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\ -w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\ -w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\ -w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\ -w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\ -w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\ -w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\ -w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\ -w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\ -w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\ -w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\ -w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\ -w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\ -w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\ -w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\ -w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\ -w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\ -w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\ -w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d), +#define isb_data(w) {\ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\ + w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\ + w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\ + w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\ + w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\ + w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\ + w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\ + w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\ + w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\ + w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\ + w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\ + w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84),\ + w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\ + w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\ + w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\ + w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\ + w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\ + w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\ + w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\ + w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\ + w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\ + w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\ + w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\ + w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\ + w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\ + w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\ 
+ w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\ + w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\ + w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\ + w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\ + w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\ + w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) } -#define mm_data(w) \ -w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\ -w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\ -w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\ -w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\ -w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\ -w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\ -w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\ -w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\ -w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\ -w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\ -w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\ -w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\ -w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\ -w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\ -w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\ -w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\ -w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\ -w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\ -w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\ -w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\ -w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\ -w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\ -w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\ -w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\ -w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\ -w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\ -w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), w(0xd5), w(0xd6), w(0xd7),\ -w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\ -w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\ -w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\ -w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\ -w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) +#define mm_data(w) {\ + w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\ + w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\ + w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\ + w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\ + w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\ + w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\ + w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\ + w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\ + w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\ + w(0x48), 
w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\ + w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\ + w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\ + w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\ + w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\ + w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\ + w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\ + w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\ + w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\ + w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\ + w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\ + w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\ + w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\ + w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\ + w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\ + w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\ + w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\ + w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), w(0xd5), w(0xd6), w(0xd7),\ + w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\ + w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\ + w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\ + w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\ + w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) } + +#define rc_data(w) {\ + w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ + w(0x1b), w(0x36) } #define h0(x) (x) -/* These defines are used to ensure tables are generated in the -right format depending on the internal byte order required -*/ - #define w0(p) bytes2word(p, 0, 0, 0) #define w1(p) bytes2word(0, p, 0, 0) #define w2(p) bytes2word(0, 0, p, 0) #define w3(p) bytes2word(0, 0, 0, p) -/* Number of elements required in this table for different -block and key lengths is: - -Rcon Table key length (bytes) -Length 16 20 24 28 32 ---------------------- -block 16 | 10 9 8 7 7 -length 20 | 14 11 10 9 9 -(bytes) 24 | 19 15 12 11 11 -28 | 24 19 16 13 13 -32 | 29 23 19 17 14 - -this table can be a table of bytes if the key schedule -code is adjusted accordingly -*/ - #define u0(p) bytes2word(f2(p), p, p, f3(p)) #define u1(p) bytes2word(f3(p), f2(p), p, p) #define u2(p) bytes2word(p, f3(p), f2(p), p) @@ -186,309 +148,244 @@ code is adjusted accordingly #define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) #define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) -const aes_32t rcon_tab[29] = -{ -w0(0x01), w0(0x02), w0(0x04), w0(0x08), -w0(0x10), w0(0x20), w0(0x40), w0(0x80), -w0(0x1b), w0(0x36), w0(0x6c), w0(0xd8), -w0(0xab), w0(0x4d), w0(0x9a), w0(0x2f), -w0(0x5e), w0(0xbc), w0(0x63), w0(0xc6), -w0(0x97), w0(0x35), w0(0x6a), w0(0xd4), -w0(0xb3), w0(0x7d), w0(0xfa), w0(0xef), -w0(0xc5) -}; - -#ifdef SBX_SET -const aes_08t s_box[256] = { sb_data(h0) }; -#endif -#ifdef ISB_SET -const aes_08t inv_s_box[256] = { isb_data(h0) }; #endif -#ifdef FT1_SET -const aes_32t ft_tab[256] = { sb_data(u0) }; -#endif -#ifdef FT4_SET -const aes_32t ft_tab[4][256] = -{ { sb_data(u0) }, { sb_data(u1) }, { sb_data(u2) }, { sb_data(u3) } }; -#endif +#if defined(FIXED_TABLES) || !defined(FF_TABLES) -#ifdef FL1_SET -const 
aes_32t fl_tab[256] = { sb_data(w0) }; -#endif -#ifdef FL4_SET -const aes_32t fl_tab[4][256] = -{ { sb_data(w0) }, { sb_data(w1) }, { sb_data(w2) }, { sb_data(w3) } }; -#endif - -#ifdef IT1_SET -const aes_32t it_tab[256] = { isb_data(v0) }; -#endif -#ifdef IT4_SET -const aes_32t it_tab[4][256] = -{ { isb_data(v0) }, { isb_data(v1) }, { isb_data(v2) }, { isb_data(v3) } }; -#endif - -#ifdef IL1_SET -const aes_32t il_tab[256] = { isb_data(w0) }; -#endif -#ifdef IL4_SET -const aes_32t il_tab[4][256] = -{ { isb_data(w0) }, { isb_data(w1) }, { isb_data(w2) }, { isb_data(w3) } }; -#endif - -#ifdef LS1_SET -const aes_32t ls_tab[256] = { sb_data(w0) }; -#endif -#ifdef LS4_SET -const aes_32t ls_tab[4][256] = -{ { sb_data(w0) }, { sb_data(w1) }, { sb_data(w2) }, { sb_data(w3) } }; -#endif - -#ifdef IM1_SET -const aes_32t im_tab[256] = { mm_data(v0) }; -#endif -#ifdef IM4_SET -const aes_32t im_tab[4][256] = -{ { mm_data(v0) }, { mm_data(v1) }, { mm_data(v2) }, { mm_data(v3) } }; -#endif - -#else /* dynamic table generation */ - -aes_08t tab_init = 0; - -#define const - -aes_32t rcon_tab[RC_LENGTH]; - -#ifdef SBX_SET -aes_08t s_box[256]; -#endif -#ifdef ISB_SET -aes_08t inv_s_box[256]; -#endif - -#ifdef FT1_SET -aes_32t ft_tab[256]; -#endif -#ifdef FT4_SET -aes_32t ft_tab[4][256]; -#endif - -#ifdef FL1_SET -aes_32t fl_tab[256]; -#endif -#ifdef FL4_SET -aes_32t fl_tab[4][256]; -#endif - -#ifdef IT1_SET -aes_32t it_tab[256]; -#endif -#ifdef IT4_SET -aes_32t it_tab[4][256]; -#endif - -#ifdef IL1_SET -aes_32t il_tab[256]; -#endif -#ifdef IL4_SET -aes_32t il_tab[4][256]; -#endif - -#ifdef LS1_SET -aes_32t ls_tab[256]; -#endif -#ifdef LS4_SET -aes_32t ls_tab[4][256]; -#endif - -#ifdef IM1_SET -aes_32t im_tab[256]; -#endif -#ifdef IM4_SET -aes_32t im_tab[4][256]; -#endif - -#if !defined(FF_TABLES) - -/* Generate the tables for the dynamic table option - -It will generally be sensible to use tables to compute finite -field multiplies and inverses but where memory is scarse this -code might sometimes be better. But it only has effect during -initialisation so its pretty unimportant in overall terms. -*/ - -/* return 2 ^ (n - 1) where n is the bit number of the highest bit -set in x with x in the range 1 < x < 0x00000200. This form is -used so that locals within fi can be bytes rather than words -*/ - -static aes_08t hibit(const aes_32t x) -{ aes_08t r = (aes_08t)((x >> 1) | (x >> 2)); - -r |= (r >> 2); -r |= (r >> 4); -return (r + 1) >> 1; -} - -/* return the inverse of the finite field element x */ - -static aes_08t fi(const aes_08t x) -{ aes_08t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; - -if(x < 2) return x; - -for(;;) -{ -if(!n1) return v1; - -while(n2 >= n1) -{ -n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2); -} - -if(!n2) return v2; - -while(n1 >= n2) -{ -n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1); -} -} -} +#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) +#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) +#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \ + ^ (((x>>5) & 4) * WPOLY)) +#define f3(x) (f2(x) ^ x) +#define f9(x) (f8(x) ^ x) +#define fb(x) (f8(x) ^ f2(x) ^ x) +#define fd(x) (f8(x) ^ f4(x) ^ x) +#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) #else -/* define the finite field multiplies required for Rijndael */ - #define f2(x) ((x) ? pow[log[x] + 0x19] : 0) #define f3(x) ((x) ? pow[log[x] + 0x01] : 0) #define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) #define fb(x) ((x) ? pow[log[x] + 0x68] : 0) #define fd(x) ((x) ? 
pow[log[x] + 0xee] : 0) #define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) -#define fi(x) ((x) ? pow[255 - log[x]]: 0) + +#endif + +#include "aestab.h" + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#if defined(FIXED_TABLES) + +/* implemented in case of wrong call for fixed tables */ + +AES_RETURN aes_init(void) +{ + return EXIT_SUCCESS; +} + +#else /* Generate the tables for the dynamic table option */ + +#if defined(FF_TABLES) + +#define gf_inv(x) ((x) ? pow[ 255 - log[x]] : 0) + +#else + +/* It will generally be sensible to use tables to compute finite + field multiplies and inverses but where memory is scarse this + code might sometimes be better. But it only has effect during + initialisation so its pretty unimportant in overall terms. +*/ + +/* return 2 ^ (n - 1) where n is the bit number of the highest bit + set in x with x in the range 1 < x < 0x00000200. This form is + used so that locals within fi can be bytes rather than words +*/ + +static uint8_t hibit(const uint32_t x) +{ uint8_t r = (uint8_t)((x >> 1) | (x >> 2)); + + r |= (r >> 2); + r |= (r >> 4); + return (r + 1) >> 1; +} + +/* return the inverse of the finite field element x */ + +static uint8_t gf_inv(const uint8_t x) +{ uint8_t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; + + if(x < 2) + return x; + + for( ; ; ) + { + if(n1) + while(n2 >= n1) /* divide polynomial p2 by p1 */ + { + n2 /= n1; /* shift smaller polynomial left */ + p2 ^= (p1 * n2) & 0xff; /* and remove from larger one */ + v2 ^= v1 * n2; /* shift accumulated value and */ + n2 = hibit(p2); /* add into result */ + } + else + return v1; + + if(n2) /* repeat with values swapped */ + while(n1 >= n2) + { + n1 /= n2; + p1 ^= p2 * n1; + v1 ^= v2 * n1; + n1 = hibit(p1); + } + else + return v2; + } +} #endif /* The forward and inverse affine transformations used in the S-box */ +uint8_t fwd_affine(const uint8_t x) +{ uint32_t w = x; + w ^= (w << 1) ^ (w << 2) ^ (w << 3) ^ (w << 4); + return 0x63 ^ ((w ^ (w >> 8)) & 0xff); +} -#define fwd_affine(x) \ -(w = (aes_32t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(aes_08t)(w^(w>>8))) +uint8_t inv_affine(const uint8_t x) +{ uint32_t w = x; + w = (w << 1) ^ (w << 3) ^ (w << 6); + return 0x05 ^ ((w ^ (w >> 8)) & 0xff); +} -#define inv_affine(x) \ -(w = (aes_32t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(aes_08t)(w^(w>>8))) +static int init = 0; -void gen_tabs(void) -{ aes_32t i, w; +AES_RETURN aes_init(void) +{ uint32_t i, w; #if defined(FF_TABLES) -aes_08t pow[512], log[256]; + uint8_t pow[512], log[256]; -/* log and power tables for GF(2^8) finite field with -WPOLY as modular polynomial - the simplest primitive -root is 0x03, used here to generate the tables -*/ + if(init) + return EXIT_SUCCESS; + /* log and power tables for GF(2^8) finite field with + WPOLY as modular polynomial - the simplest primitive + root is 0x03, used here to generate the tables + */ -i = 0; w = 1; -do -{ -pow[i] = (aes_08t)w; -pow[i + 255] = (aes_08t)w; -log[w] = (aes_08t)i++; -w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0); -} -while (w != 1); + i = 0; w = 1; + do + { + pow[i] = (uint8_t)w; + pow[i + 255] = (uint8_t)w; + log[w] = (uint8_t)i++; + w ^= (w << 1) ^ (w & 0x80 ? 
WPOLY : 0); + } + while (w != 1); +#else + if(init) + return EXIT_SUCCESS; #endif -for(i = 0, w = 1; i < RC_LENGTH; ++i) -{ -rcon_tab[i] = bytes2word(w, 0, 0, 0); -w = f2(w); -} + for(i = 0, w = 1; i < RC_LENGTH; ++i) + { + t_set(r,c)[i] = bytes2word(w, 0, 0, 0); + w = f2(w); + } -for(i = 0; i < 256; ++i) -{ aes_08t b; + for(i = 0; i < 256; ++i) + { uint8_t b; -b = fwd_affine(fi((aes_08t)i)); -w = bytes2word(f2(b), b, b, f3(b)); + b = fwd_affine(gf_inv((uint8_t)i)); + w = bytes2word(f2(b), b, b, f3(b)); -#ifdef SBX_SET -s_box[i] = b; +#if defined( SBX_SET ) + t_set(s,box)[i] = b; #endif -#ifdef FT1_SET /* tables for a normal encryption round */ -ft_tab[i] = w; +#if defined( FT1_SET ) /* tables for a normal encryption round */ + t_set(f,n)[i] = w; #endif -#ifdef FT4_SET -ft_tab[0][i] = w; -ft_tab[1][i] = upr(w,1); -ft_tab[2][i] = upr(w,2); -ft_tab[3][i] = upr(w,3); +#if defined( FT4_SET ) + t_set(f,n)[0][i] = w; + t_set(f,n)[1][i] = upr(w,1); + t_set(f,n)[2][i] = upr(w,2); + t_set(f,n)[3][i] = upr(w,3); #endif -w = bytes2word(b, 0, 0, 0); + w = bytes2word(b, 0, 0, 0); -#ifdef FL1_SET /* tables for last encryption round (may also */ -fl_tab[i] = w; /* be used in the key schedule) */ +#if defined( FL1_SET ) /* tables for last encryption round (may also */ + t_set(f,l)[i] = w; /* be used in the key schedule) */ #endif -#ifdef FL4_SET -fl_tab[0][i] = w; -fl_tab[1][i] = upr(w,1); -fl_tab[2][i] = upr(w,2); -fl_tab[3][i] = upr(w,3); +#if defined( FL4_SET ) + t_set(f,l)[0][i] = w; + t_set(f,l)[1][i] = upr(w,1); + t_set(f,l)[2][i] = upr(w,2); + t_set(f,l)[3][i] = upr(w,3); #endif -#ifdef LS1_SET /* table for key schedule if fl_tab above is */ -ls_tab[i] = w; /* not of the required form */ +#if defined( LS1_SET ) /* table for key schedule if t_set(f,l) above is*/ + t_set(l,s)[i] = w; /* not of the required form */ #endif -#ifdef LS4_SET -ls_tab[0][i] = w; -ls_tab[1][i] = upr(w,1); -ls_tab[2][i] = upr(w,2); -ls_tab[3][i] = upr(w,3); +#if defined( LS4_SET ) + t_set(l,s)[0][i] = w; + t_set(l,s)[1][i] = upr(w,1); + t_set(l,s)[2][i] = upr(w,2); + t_set(l,s)[3][i] = upr(w,3); #endif -b = fi(inv_affine((aes_08t)i)); -w = bytes2word(fe(b), f9(b), fd(b), fb(b)); + b = gf_inv(inv_affine((uint8_t)i)); + w = bytes2word(fe(b), f9(b), fd(b), fb(b)); -#ifdef IM1_SET /* tables for the inverse mix column operation */ -im_tab[b] = w; +#if defined( IM1_SET ) /* tables for the inverse mix column operation */ + t_set(i,m)[b] = w; #endif -#ifdef IM4_SET -im_tab[0][b] = w; -im_tab[1][b] = upr(w,1); -im_tab[2][b] = upr(w,2); -im_tab[3][b] = upr(w,3); +#if defined( IM4_SET ) + t_set(i,m)[0][b] = w; + t_set(i,m)[1][b] = upr(w,1); + t_set(i,m)[2][b] = upr(w,2); + t_set(i,m)[3][b] = upr(w,3); #endif -#ifdef ISB_SET -inv_s_box[i] = b; +#if defined( ISB_SET ) + t_set(i,box)[i] = b; #endif -#ifdef IT1_SET /* tables for a normal decryption round */ -it_tab[i] = w; +#if defined( IT1_SET ) /* tables for a normal decryption round */ + t_set(i,n)[i] = w; #endif -#ifdef IT4_SET -it_tab[0][i] = w; -it_tab[1][i] = upr(w,1); -it_tab[2][i] = upr(w,2); -it_tab[3][i] = upr(w,3); +#if defined( IT4_SET ) + t_set(i,n)[0][i] = w; + t_set(i,n)[1][i] = upr(w,1); + t_set(i,n)[2][i] = upr(w,2); + t_set(i,n)[3][i] = upr(w,3); #endif -w = bytes2word(b, 0, 0, 0); -#ifdef IL1_SET /* tables for last decryption round */ -il_tab[i] = w; + w = bytes2word(b, 0, 0, 0); +#if defined( IL1_SET ) /* tables for last decryption round */ + t_set(i,l)[i] = w; #endif -#ifdef IL4_SET -il_tab[0][i] = w; -il_tab[1][i] = upr(w,1); -il_tab[2][i] = upr(w,2); -il_tab[3][i] = 
upr(w,3); +#if defined( IL4_SET ) + t_set(i,l)[0][i] = w; + t_set(i,l)[1][i] = upr(w,1); + t_set(i,l)[2][i] = upr(w,2); + t_set(i,l)[3][i] = upr(w,3); #endif -} - -tab_init = 1; + } + init = 1; + return EXIT_SUCCESS; } #endif + +#if defined(__cplusplus) +} +#endif + diff --git a/aestab.h b/aestab.h new file mode 100644 index 0000000000..46719d3304 --- /dev/null +++ b/aestab.h @@ -0,0 +1,173 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 + + This file contains the code for declaring the tables needed to implement + AES. The file aesopt.h is assumed to be included before this header file. + If there are no global variables, the definitions here can be used to put + the AES tables in a structure so that a pointer can then be added to the + AES context to pass them to the AES routines that need them. If this + facility is used, the calling program has to ensure that this pointer is + managed appropriately. In particular, the value of the t_dec(in,it) item + in the table structure must be set to zero in order to ensure that the + tables are initialised. In practice the three code sequences in aeskey.c + that control the calls to aes_init() and the aes_init() routine itself will + have to be changed for a specific implementation. If global variables are + available it will generally be preferable to use them with the precomputed + FIXED_TABLES option that uses static global tables. 
+ + The following defines can be used to control the way the tables + are defined, initialised and used in embedded environments that + require special features for these purposes + + the 't_dec' construction is used to declare fixed table arrays + the 't_set' construction is used to set fixed table values + the 't_use' construction is used to access fixed table values + + 256 byte tables: + + t_xxx(s,box) => forward S box + t_xxx(i,box) => inverse S box + + 256 32-bit word OR 4 x 256 32-bit word tables: + + t_xxx(f,n) => forward normal round + t_xxx(f,l) => forward last round + t_xxx(i,n) => inverse normal round + t_xxx(i,l) => inverse last round + t_xxx(l,s) => key schedule table + t_xxx(i,m) => key schedule table + + Other variables and tables: + + t_xxx(r,c) => the rcon table +*/ + +#if !defined( _AESTAB_H ) +#define _AESTAB_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#define t_dec(m,n) t_##m##n +#define t_set(m,n) t_##m##n +#define t_use(m,n) t_##m##n + +#if defined(FIXED_TABLES) +# if !defined( __GNUC__ ) && (defined( __MSDOS__ ) || defined( __WIN16__ )) +/* make tables far data to avoid using too much DGROUP space (PG) */ +# define CONST const far +# else +# define CONST const +# endif +#else +# define CONST +#endif + +#if defined(DO_TABLES) +# define EXTERN +#else +# define EXTERN extern +#endif + +#if defined(_MSC_VER) && defined(TABLE_ALIGN) +#define ALIGN __declspec(align(TABLE_ALIGN)) +#else +#define ALIGN +#endif + +#if defined( __WATCOMC__ ) && ( __WATCOMC__ >= 1100 ) +# define XP_DIR __cdecl +#else +# define XP_DIR +#endif + +#if defined(DO_TABLES) && defined(FIXED_TABLES) +#define d_1(t,n,b,e) EXTERN ALIGN CONST XP_DIR t n[256] = b(e) +#define d_4(t,n,b,e,f,g,h) EXTERN ALIGN CONST XP_DIR t n[4][256] = { b(e), b(f), b(g), b(h) } +EXTERN ALIGN CONST uint32_t t_dec(r,c)[RC_LENGTH] = rc_data(w0); +#else +#define d_1(t,n,b,e) EXTERN ALIGN CONST XP_DIR t n[256] +#define d_4(t,n,b,e,f,g,h) EXTERN ALIGN CONST XP_DIR t n[4][256] +EXTERN ALIGN CONST uint32_t t_dec(r,c)[RC_LENGTH]; +#endif + +#if defined( SBX_SET ) + d_1(uint8_t, t_dec(s,box), sb_data, h0); +#endif +#if defined( ISB_SET ) + d_1(uint8_t, t_dec(i,box), isb_data, h0); +#endif + +#if defined( FT1_SET ) + d_1(uint32_t, t_dec(f,n), sb_data, u0); +#endif +#if defined( FT4_SET ) + d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3); +#endif + +#if defined( FL1_SET ) + d_1(uint32_t, t_dec(f,l), sb_data, w0); +#endif +#if defined( FL4_SET ) + d_4(uint32_t, t_dec(f,l), sb_data, w0, w1, w2, w3); +#endif + +#if defined( IT1_SET ) + d_1(uint32_t, t_dec(i,n), isb_data, v0); +#endif +#if defined( IT4_SET ) + d_4(uint32_t, t_dec(i,n), isb_data, v0, v1, v2, v3); +#endif + +#if defined( IL1_SET ) + d_1(uint32_t, t_dec(i,l), isb_data, w0); +#endif +#if defined( IL4_SET ) + d_4(uint32_t, t_dec(i,l), isb_data, w0, w1, w2, w3); +#endif + +#if defined( LS1_SET ) +#if defined( FL1_SET ) +#undef LS1_SET +#else + d_1(uint32_t, t_dec(l,s), sb_data, w0); +#endif +#endif + +#if defined( LS4_SET ) +#if defined( FL4_SET ) +#undef LS4_SET +#else + d_4(uint32_t, t_dec(l,s), sb_data, w0, w1, w2, w3); +#endif +#endif + +#if defined( IM1_SET ) + d_1(uint32_t, t_dec(i,m), mm_data, v0); +#endif +#if defined( IM4_SET ) + d_4(uint32_t, t_dec(i,m), mm_data, v0, v1, v2, v3); +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/bip39.c b/bip39.c index aa0e89afb8..d12105935d 100644 --- a/bip39.c +++ b/bip39.c @@ -30,8 +30,6 @@ #include "pbkdf2.h" #include "bip39_english.h" -#define PBKDF2_ROUNDS 2048 - const char *mnemonic_generate(int 
strength)
{
	if (strength % 32 || strength < 128 || strength > 256) {
@@ -153,7 +151,7 @@ void mnemonic_to_seed(const char *mnemonic, const char *passphrase, uint8_t seed
 	memcpy(salt, "mnemonic", 8);
 	memcpy(salt + 8, passphrase, saltlen);
 	saltlen += 8;
-	pbkdf2((const uint8_t *)mnemonic, strlen(mnemonic), salt, saltlen, PBKDF2_ROUNDS, seed, 512 / 8, progress_callback);
+	pbkdf2((const uint8_t *)mnemonic, strlen(mnemonic), salt, saltlen, BIP39_PBKDF2_ROUNDS, seed, 512 / 8, progress_callback);
 }
 
 const char **mnemonic_wordlist(void)
diff --git a/bip39.h b/bip39.h
index 3bf73b532c..cad7a66d31 100644
--- a/bip39.h
+++ b/bip39.h
@@ -26,6 +26,8 @@
 #include <stdint.h>
 
+#define BIP39_PBKDF2_ROUNDS 2048
+
 const char *mnemonic_generate(int strength);	// strength in bits
 
 const char *mnemonic_from_data(const uint8_t *data, int len);
diff --git a/brg_endian.h b/brg_endian.h
new file mode 100644
index 0000000000..b44c5cbd66
--- /dev/null
+++ b/brg_endian.h
@@ -0,0 +1,126 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 20/12/2007
+*/

+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+# include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+# include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+# include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+# if !defined( __MINGW32__ ) && !defined( _AIX )
+#  include <endian.h>
+#  if !defined( __BEOS__ )
+#   include <byteswap.h>
+#  endif
+# endif
+#endif
+
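What the detection cascade that follows settles on can be confirmed with a tiny program (a sketch, not part of the patch; it assumes only that brg_endian.h is on the include path):

/* Editorial sketch only, not part of the patch: report which byte order
   the cascade below selected for this build. */
#include <stdio.h>
#include "brg_endian.h"

int main(void)
{
#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
    puts("PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN (byte 0 least significant)");
#elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
    puts("PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN (byte 0 most significant)");
#endif
    return 0;
}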
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( _BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( __BIG_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif defined( __BIG_ENDIAN__ )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* if the platform byte order could not be determined, then try to */
+/* set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
+    defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
+    defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
+    defined( vax ) || defined( vms ) || defined( VMS ) || \
+    defined( __VMS ) || defined( _M_X64 )
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
+      defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
+      defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
+      defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
+      defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
+      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
diff --git a/brg_types.h b/brg_types.h
new file mode 100644
index 0000000000..307319bf63
--- /dev/null
+++ b/brg_types.h
@@ -0,0 +1,191 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 20/12/2007
+
+  The unsigned integer types defined here are of the form uint<nn>_t where
+  <nn> is the length of the type; for example, the unsigned 32-bit type is
+  'uint32_t'.
These are NOT the same as the 'C99 integer types' that are
+  defined in the inttypes.h and stdint.h headers since attempts to use these
+  types have shown that support for them is still highly variable. However,
+  since the latter are of the form uint<nn>_t, a regular expression search
+  and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+  can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef _BRG_TYPES_H
+#define _BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+#include <stdint.h>
+
+#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
+# include <stdint.h>
+# define ptrint_t intptr_t
+#elif defined( __ECOS__ )
+# define intptr_t unsigned int
+# define ptrint_t intptr_t
+#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
+# define ptrint_t intptr_t
+#else
+# define ptrint_t int
+#endif
+
+#ifndef BRG_UI32
+# define BRG_UI32
+# if UINT_MAX == 4294967295u
+#  define li_32(h) 0x##h##u
+# elif ULONG_MAX == 4294967295u
+#  define li_32(h) 0x##h##ul
+# elif defined( _CRAY )
+#  error This code needs 32-bit data types, which Cray machines do not provide
+# else
+#  error Please define uint32_t as a 32-bit unsigned integer type in brg_types.h
+# endif
+#endif
+
+#ifndef BRG_UI64
+# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#  define BRG_UI64
+#  define li_64(h) 0x##h##ui64
+# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#  define BRG_UI64
+#  define li_64(h) 0x##h##ui64
+# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
+#  define BRG_UI64
+#  define li_64(h) 0x##h##ull
+# elif defined( __MVS__ )
+#  define BRG_UI64
+#  define li_64(h) 0x##h##ull
+# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#  if UINT_MAX == 18446744073709551615u
+#   define BRG_UI64
+#   define li_64(h) 0x##h##u
+#  endif
+# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#  if ULONG_MAX == 18446744073709551615ul
+#   define BRG_UI64
+#   define li_64(h) 0x##h##ul
+#  endif
+# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#  if ULLONG_MAX == 18446744073709551615ull
+#   define BRG_UI64
+#   define li_64(h) 0x##h##ull
+#  endif
+# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#  if ULONG_LONG_MAX == 18446744073709551615ull
+#   define BRG_UI64
+#   define li_64(h) 0x##h##ull
+#  endif
+# endif
+#endif
+
+#if !defined( BRG_UI64 )
+# if defined( NEED_UINT_64T )
+#  error Please define uint64_t as an unsigned 64 bit type in brg_types.h
+# endif
+#endif
+
+#ifndef RETURN_VALUES
+# define RETURN_VALUES
+# if defined( DLL_EXPORT )
+#  if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#   define VOID_RETURN __declspec( dllexport ) void __stdcall
+#   define INT_RETURN  __declspec( dllexport ) int  __stdcall
+#  elif defined( __GNUC__ )
+#   define VOID_RETURN __declspec( __dllexport__ ) void
+#   define INT_RETURN  __declspec( __dllexport__ ) int
+#  else
+#   error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#  endif
+# elif defined( DLL_IMPORT )
+#  if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#   define VOID_RETURN __declspec( dllimport ) void __stdcall
+#   define INT_RETURN  __declspec( dllimport ) int  __stdcall
+#  elif defined( __GNUC__ )
+#   define VOID_RETURN __declspec( __dllimport__ ) void
+#   define INT_RETURN  __declspec( __dllimport__ ) int
+#  else
+#   error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#  endif
+# elif defined( __WATCOMC__ )
+#  define VOID_RETURN void __cdecl
+#  define INT_RETURN  int  __cdecl
+# else
+#  define VOID_RETURN void
+#  define
INT_RETURN int +# endif +#endif + +/* These defines are used to detect and set the memory alignment of pointers. + Note that offsets are in bytes. + + ALIGN_OFFSET(x,n) return the positive or zero offset of + the memory addressed by the pointer 'x' + from an address that is aligned on an + 'n' byte boundary ('n' is a power of 2) + + ALIGN_FLOOR(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not higher than the memory address + pointed to by 'x' ('n' is a power of 2) + + ALIGN_CEIL(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not lower than the memory address + pointed to by 'x' ('n' is a power of 2) +*/ + +#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1)) +#define ALIGN_FLOOR(x,n) ((uint8_t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1))) +#define ALIGN_CEIL(x,n) ((uint8_t*)(x) + (-((ptrint_t)(x)) & ((n) - 1))) + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. In all these + defines 'size' must be a power of 2 and >= 8. NOTE that the + buffer size is in bytes but the type length is in bits + + UNIT_TYPEDEF(x,size) declares a variable 'x' of length + 'size' bits + + BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + UNIT_CAST(x,size) casts a variable to a type of + length 'size' bits + + UPTR_CAST(x,size) casts a pointer to a pointer to a + varaiable of length 'size' bits +*/ + +#define UI_TYPE(size) uint##size##_t +#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x +#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)] +#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x)) +#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x)) + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/tests.c b/tests.c index 0ba3fac394..69195cd8e6 100644 --- a/tests.c +++ b/tests.c @@ -44,8 +44,10 @@ uint8_t *fromhex(const char *str) c = 0; if (str[i*2] >= '0' && str[i*2] <= '9') c += (str[i*2] - '0') << 4; if (str[i*2] >= 'a' && str[i*2] <= 'f') c += (10 + str[i*2] - 'a') << 4; + if (str[i*2] >= 'A' && str[i*2] <= 'F') c += (10 + str[i*2] - 'A') << 4; if (str[i*2+1] >= '0' && str[i*2+1] <= '9') c += (str[i*2+1] - '0'); if (str[i*2+1] >= 'a' && str[i*2+1] <= 'f') c += (10 + str[i*2+1] - 'a'); + if (str[i*2+1] >= 'A' && str[i*2+1] <= 'F') c += (10 + str[i*2+1] - 'A'); buf[i] = c; } return buf; @@ -435,30 +437,162 @@ START_TEST(test_verify_speed) } END_TEST -#define test_aes(KEY, BLKLEN, IN, OUT) do { \ - sha256_Raw((uint8_t *)KEY, strlen(KEY), key); \ - aes_enc_key(key, 32, &ctx); \ - memcpy(in, fromhex(IN), BLKLEN); \ - aes_enc_blk(in, out, &ctx); \ - ck_assert_mem_eq(out, fromhex(OUT), BLKLEN); \ -} while (0) - -START_TEST(test_rijndael) +void aes_ctr_counter_inc(uint8_t *ctr) { - aes_ctx ctx; - uint8_t key[32], in[32], out[32]; + int i = 15; + while (i >= 0) { + ctr[i]++; + if (ctr[i]) return; // if there was no overflow + i--; + } +} - test_aes("mnemonic", 16, "00000000000000000000000000000000", "a3af8b7d326a2d47bd7576012e07d103"); -// test_aes("mnemonic", 24, "000000000000000000000000000000000000000000000000", "7b8704678f263c316ddd1746d8377a4046a99dd9e5687d59"); -// test_aes("mnemonic", 32, "0000000000000000000000000000000000000000000000000000000000000000", "7c0575db9badc9960441c6b8dcbd5ebdfec522ede5309904b7088d0e77c2bcef"); +// test vectors from 
http://www.inconteam.com/software-development/41-encryption/55-aes-test-vectors +START_TEST(test_aes) +{ + aes_encrypt_ctx ctxe; + aes_decrypt_ctx ctxd; + uint8_t ibuf[16], obuf[16], iv[16], cntr[16]; + const char **ivp, **plainp, **cipherp; - test_aes("mnemonic", 16, "686f6a6461686f6a6461686f6a6461686f6a6461", "9c3bb85af2122cc2df449033338beb56"); -// test_aes("mnemonic", 24, "686f6a6461686f6a6461686f6a6461686f6a6461686f6a64", "0d7009c589869eaa1d7398bffc7660cce32207a520d6cafe"); -// test_aes("mnemonic", 32, "686f6a6461686f6a6461686f6a6461686f6a6461686f6a6461686f6a6461686f", "b1a4d05e3827611c5986ea4c207679a6934f20767434218029c4b3b7a53806a3"); + // ECB + static const char *ecb_vector[] = { + // plain cipher + "6bc1bee22e409f96e93d7e117393172a", "f3eed1bdb5d2a03c064b5a7e3db181f8", + "ae2d8a571e03ac9c9eb76fac45af8e51", "591ccb10d410ed26dc5ba74a31362870", + "30c81c46a35ce411e5fbc1191a0a52ef", "b6ed21b99ca6f4f9f153e7b1beafed1d", + "f69f2445df4f9b17ad2b417be66c3710", "23304b7a39f9f3ff067d8d8f9e24ecc7", + 0, 0, + }; + plainp = ecb_vector; + cipherp = ecb_vector + 1; + while (*plainp && *cipherp) { + // encrypt + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + memcpy(ibuf, fromhex(*plainp), 16); + aes_ecb_encrypt(ibuf, obuf, 16, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*cipherp), 16); + // decrypt + aes_decrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxd); + memcpy(ibuf, fromhex(*cipherp), 16); + aes_ecb_decrypt(ibuf, obuf, 16, &ctxd); + ck_assert_mem_eq(obuf, fromhex(*plainp), 16); + plainp += 2; cipherp += 2; + } - test_aes("mnemonic", 16, "ffffffffffffffffffffffffffffffff", "e720f4474b7dabe382eec0529e2b1128"); -// test_aes("mnemonic", 24, "ffffffffffffffffffffffffffffffffffffffffffffffff", "14dfe4c7a93e14616dce6c793110baee0b8bb404f3bec6c5"); -// test_aes("mnemonic", 32, "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", "ccf498fd9a57f872a4d274549fab474cbacdbd9d935ca31b06e3025526a704fb"); + // CBC + static const char *cbc_vector[] = { + // iv plain cipher + "000102030405060708090A0B0C0D0E0F", "6bc1bee22e409f96e93d7e117393172a", "f58c4c04d6e5f1ba779eabfb5f7bfbd6", + "F58C4C04D6E5F1BA779EABFB5F7BFBD6", "ae2d8a571e03ac9c9eb76fac45af8e51", "9cfc4e967edb808d679f777bc6702c7d", + "9CFC4E967EDB808D679F777BC6702C7D", "30c81c46a35ce411e5fbc1191a0a52ef", "39f23369a9d9bacfa530e26304231461", + "39F23369A9D9BACFA530E26304231461", "f69f2445df4f9b17ad2b417be66c3710", "b2eb05e2c39be9fcda6c19078c6a9d1b", + 0, 0, 0, + }; + ivp = cbc_vector; + plainp = cbc_vector + 1; + cipherp = cbc_vector + 2; + while (*plainp && *cipherp) { + // encrypt + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + memcpy(iv, fromhex(*ivp), 16); + memcpy(ibuf, fromhex(*plainp), 16); + aes_cbc_encrypt(ibuf, obuf, 16, iv, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*cipherp), 16); + // decrypt + aes_decrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxd); + memcpy(iv, fromhex(*ivp), 16); + memcpy(ibuf, fromhex(*cipherp), 16); + aes_cbc_decrypt(ibuf, obuf, 16, iv, &ctxd); + ck_assert_mem_eq(obuf, fromhex(*plainp), 16); + ivp += 3; plainp += 3; cipherp += 3; + } + + // CFB + static const char *cfb_vector[] = { + "000102030405060708090A0B0C0D0E0F", "6bc1bee22e409f96e93d7e117393172a", "DC7E84BFDA79164B7ECD8486985D3860", + "DC7E84BFDA79164B7ECD8486985D3860", "ae2d8a571e03ac9c9eb76fac45af8e51", "39ffed143b28b1c832113c6331e5407b", + 
"39FFED143B28B1C832113C6331E5407B", "30c81c46a35ce411e5fbc1191a0a52ef", "df10132415e54b92a13ed0a8267ae2f9", + "DF10132415E54B92A13ED0A8267AE2F9", "f69f2445df4f9b17ad2b417be66c3710", "75a385741ab9cef82031623d55b1e471", + 0, 0, 0, + }; + ivp = cfb_vector; + plainp = cfb_vector + 1; + cipherp = cfb_vector + 2; + while (*plainp && *cipherp) { + // encrypt + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + memcpy(iv, fromhex(*ivp), 16); + memcpy(ibuf, fromhex(*plainp), 16); + aes_cfb_encrypt(ibuf, obuf, 16, iv, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*cipherp), 16); + // decrypt (uses encryption) + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + memcpy(iv, fromhex(*ivp), 16); + memcpy(ibuf, fromhex(*cipherp), 16); + aes_cfb_decrypt(ibuf, obuf, 16, iv, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*plainp), 16); + ivp += 3; plainp += 3; cipherp += 3; + } + + // OFB + static const char *ofb_vector[] = { + "000102030405060708090A0B0C0D0E0F", "6bc1bee22e409f96e93d7e117393172a", "dc7e84bfda79164b7ecd8486985d3860", + "B7BF3A5DF43989DD97F0FA97EBCE2F4A", "ae2d8a571e03ac9c9eb76fac45af8e51", "4febdc6740d20b3ac88f6ad82a4fb08d", + "E1C656305ED1A7A6563805746FE03EDC", "30c81c46a35ce411e5fbc1191a0a52ef", "71ab47a086e86eedf39d1c5bba97c408", + "41635BE625B48AFC1666DD42A09D96E7", "f69f2445df4f9b17ad2b417be66c3710", "0126141d67f37be8538f5a8be740e484", + 0, 0, 0, + }; + ivp = ofb_vector; + plainp = ofb_vector + 1; + cipherp = ofb_vector + 2; + while (*plainp && *cipherp) { + // encrypt + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + memcpy(iv, fromhex(*ivp), 16); + memcpy(ibuf, fromhex(*plainp), 16); + aes_ofb_encrypt(ibuf, obuf, 16, iv, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*cipherp), 16); + // decrypt (uses encryption) + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + memcpy(iv, fromhex(*ivp), 16); + memcpy(ibuf, fromhex(*cipherp), 16); + aes_ofb_decrypt(ibuf, obuf, 16, iv, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*plainp), 16); + ivp += 3; plainp += 3; cipherp += 3; + } + + // CTR + static const char *ctr_vector[] = { + // plain cipher + "6bc1bee22e409f96e93d7e117393172a", "601ec313775789a5b7a7f504bbf3d228", + "ae2d8a571e03ac9c9eb76fac45af8e51", "f443e3ca4d62b59aca84e990cacaf5c5", + "30c81c46a35ce411e5fbc1191a0a52ef", "2b0930daa23de94ce87017ba2d84988d", + "f69f2445df4f9b17ad2b417be66c3710", "dfc9c58db67aada613c2dd08457941a6", + 0, 0, + }; + // encrypt + plainp = ctr_vector; + cipherp = ctr_vector + 1; + memcpy(cntr, fromhex("f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"), 16); + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + while (*plainp && *cipherp) { + memcpy(ibuf, fromhex(*plainp), 16); + aes_ctr_encrypt(ibuf, obuf, 16, cntr, aes_ctr_counter_inc, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*cipherp), 16); + plainp += 2; cipherp += 2; + } + // decrypt (uses encryption) + plainp = ctr_vector; + cipherp = ctr_vector + 1; + memcpy(cntr, fromhex("f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"), 16); + aes_encrypt_key256(fromhex("603deb1015ca71be2b73aef0857d77811f352c073b6108d72d9810a30914dff4"), &ctxe); + while (*plainp && *cipherp) { + memcpy(ibuf, fromhex(*cipherp), 16); + aes_ctr_decrypt(ibuf, obuf, 16, cntr, aes_ctr_counter_inc, &ctxe); + ck_assert_mem_eq(obuf, fromhex(*plainp), 16); + plainp += 2; cipherp += 2; + } } END_TEST @@ -894,8 +1028,8 @@ 
Suite *test_suite(void) tcase_add_test(tc, test_ecdsa_der); suite_add_tcase(s, tc); - tc = tcase_create("rijndael"); - tcase_add_test(tc, test_rijndael); + tc = tcase_create("aes"); + tcase_add_test(tc, test_aes); suite_add_tcase(s, tc); tc = tcase_create("pbkdf2");
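Taken together, the entry points exercised by test_aes() give callers a small mode-level API: aes_init() (a stub returning EXIT_SUCCESS when FIXED_TABLES is defined, a table generator otherwise), aes_encrypt_key256()/aes_decrypt_key256() for key scheduling, and per-mode calls such as aes_cbc_encrypt(). A minimal standalone sketch using the first SP 800-38A CBC vector from the tests above (assumptions: linked against aescrypt.o, aeskey.o, aestab.o and aes_modes.o as in the Makefile; error returns ignored for brevity):

/* Editorial sketch only, not part of the patch. */
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "aes.h"

int main(void)
{   static const uint8_t key[32] = {
        0x60,0x3d,0xeb,0x10,0x15,0xca,0x71,0xbe,0x2b,0x73,0xae,0xf0,
        0x85,0x7d,0x77,0x81,0x1f,0x35,0x2c,0x07,0x3b,0x61,0x08,0xd7,
        0x2d,0x98,0x10,0xa3,0x09,0x14,0xdf,0xf4 };
    static const uint8_t plain[16] = {
        0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96,
        0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a };
    static const uint8_t cipher[16] = {
        0xf5,0x8c,0x4c,0x04,0xd6,0xe5,0xf1,0xba,
        0x77,0x9e,0xab,0xfb,0x5f,0x7b,0xfb,0xd6 };
    uint8_t iv[16], out[16];
    aes_encrypt_ctx ctx;
    int i;

    aes_init();                               /* no-op under FIXED_TABLES */
    aes_encrypt_key256(key, &ctx);
    for (i = 0; i < 16; i++)
        iv[i] = (uint8_t)i;                   /* 000102...0e0f, as in the test */
    aes_cbc_encrypt(plain, out, 16, iv, &ctx);  /* iv is updated in place */
    assert(memcmp(out, cipher, 16) == 0);
    return 0;
}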