feat(crypto): Add Brian Gladman's implementation of GCM.

2025-07-31 19:08:28 +00:00 · 2024-03-15 11:10:47 +01:00 · 2024-03-15 11:10:47 +01:00 · 6e207215e3
commit 6e207215e3
parent 89147ef493
9 changed files with 2599 additions and 2 deletions
--- a/2
+++ b/2
@ -65,7 +65,7 @@ yaml_check: ## check yaml formatting
 	yamllint .

 editor_check: ## check editorconfig formatting
-	editorconfig-checker -exclude '.*\.(so|dat|toif|der)'
+	editorconfig-checker -exclude '.*\.(so|dat|toif|der)|^crypto/aes/'

 cstyle_check: ## run code style check on low-level C code
 	clang-format --version
--- a/crypto/aes/aesgcm.c
+++ b/crypto/aes/aesgcm.c
@ -0,0 +1,547 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 30/03/2011
+
+ My thanks to:
+
+   Colin Sinclair for finding an error and suggesting a number of
+   improvements to this code. 
+ 
+   John Viega and David McGrew for their support in the development 
+   of this code and to David for testing it on a big-endIAN system.
+
+   Mark Rodenkirch and Jason Papadopoulos for their help in finding
+   a bug in the fast buffer operations on big endian systems.
+*/
+
+#include "gcm.h"
+#include "mode_hdr.h"
+
+#include <stdlib.h>
+
+/*  This GCM implementation needs a Galois Field multiplier for GF(2^128),
+    which operates on field elements using a polynomial field representation
+    x^127 + x^126 + ... + x^2 + x + 1 using the bits in a bit sequence that
+    will be numbered by the power of x that they represent. GCM uses the
+    polynomial x^128 + x^7 + x^2 + x + 1 as its basis for representation.
+
+    The obvious way of representing this in a computer system is to map GF
+    'x' to the binary integer '2' - but this was way too obvious for any
+    cryptographer to adopt!
+
+    Here bytes are numbered in memory order and  bits within bytes according
+    to their integer numeric significance. The term 'little endian' is then
+    used to describe mappings in which numeric (power of 2) or field (power
+    of x) significance increase with increasing bit or byte numbers with
+    'big endian' being used to describe the inverse situation.
+
+    GCM uses little endian byte ordering and big endian bit ordering, a
+    representation that will be described as LB. Hence the low end of the
+    field polynomial is in byte[0], which has the value 0xe1 rather than
+    0x87 in the more obvious mappings.
+
+    The related field multiplier can use this mapping but if you want to
+    use an alternative (e.g. hardware) multiplier that uses a different
+    polynomial field representation, you can do so by changing the form
+    used for the field elements when this alternative multiplier is used.
+
+    If GF_REPRESENTATION is defined as one of:
+
+        REVERSE_BITS                      // change to LL
+        REVERSE_BYTES | REVERSE_BITS      // change to BL
+        REVERSE_NONE                      // no change
+        REVERSE_BYTES                     // change to BB
+
+    then an appropriate change of representation will occur before and
+    after calls to your revised field multiplier. To use this you need
+    to add gf_convert.c to your application.  
+*/
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/*  Select the field element representation handed to the multiplier.
+    Exactly one branch should be enabled; the first one (no conversion)
+    is the default.  The four alternatives correspond, in order, to the
+    four values listed in the comment above.                            */
+#if 1
+#  undef GF_REPRESENTATION
+#elif 0
+#  define GF_REPRESENTATION REVERSE_BITS
+#elif 0
+#  define GF_REPRESENTATION (REVERSE_BYTES | REVERSE_BITS)
+#elif 0
+#  define GF_REPRESENTATION REVERSE_NONE
+#elif 0
+#  define GF_REPRESENTATION REVERSE_BYTES
+#endif
+
+#define BLOCK_SIZE      GCM_BLOCK_SIZE      /* block length                 */
+#define BLK_ADR_MASK    (BLOCK_SIZE - 1)    /* mask for 'in block' address  */
+#define CTR_POS         12
+
+#define inc_ctr(x)  \
+    {   int i = BLOCK_SIZE; while(i-- > CTR_POS && !++(UI8_PTR(x)[i])) ; }
+
+/*  Set the AES key for a GCM context and derive the hash key H = E(K, 0)
+    into ctx->ghash_h, building the multiplication table when one of the
+    TABLES_* options is defined.
+
+    Returns RETURN_GOOD on success and RETURN_ERROR if the AES key
+    schedule or the block encryption reports a failure (for example an
+    unsupported key length), in which case the context must not be used.
+*/
+ret_type gcm_init_and_key(                  /* initialise mode and set key  */
+            const unsigned char key[],      /* the key value                */
+            unsigned long key_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{
+    memset(ctx->ghash_h, 0, sizeof(ctx->ghash_h));
+
+    /* set the AES key; stop here on failure rather than carrying on    */
+    /* with an unusable AES context and an all zero hash key            */
+    if(aes_encrypt_key(key, key_len, ctx->aes) != EXIT_SUCCESS)
+        return RETURN_ERROR;
+
+    /* compute E(0) (for the hash function)     */
+    if(aes_encrypt(UI8_PTR(ctx->ghash_h), UI8_PTR(ctx->ghash_h), ctx->aes) != EXIT_SUCCESS)
+        return RETURN_ERROR;
+
+#if defined( GF_REPRESENTATION )
+    convert_representation(ctx->ghash_h, ctx->ghash_h, GF_REPRESENTATION);
+#endif
+
+#if defined( TABLES_64K )
+    init_64k_table(ctx->ghash_h, ctx->gf_t64k);
+#elif defined( TABLES_8K )
+    init_8k_table(ctx->ghash_h, ctx->gf_t8k);
+#elif defined( TABLES_4K )
+    init_4k_table(ctx->ghash_h, ctx->gf_t4k);
+#elif defined( TABLES_256 )
+    init_256_table(ctx->ghash_h, ctx->gf_t256);
+#endif
+#if defined(  GF_REPRESENTATION )
+    /* the same conversion is applied a second time once the table exists */
+    convert_representation(ctx->ghash_h, ctx->ghash_h, GF_REPRESENTATION);
+#endif
+    return RETURN_GOOD;
+}
+
+void gf_mul_hh(gf_t a, gcm_ctx ctx[1])      /* multiply a by the hash key
+                                               held in ctx using the
+                                               multiplier chosen at compile
+                                               time; callers in this file
+                                               read the product back from a.
+                                               The multipliers themselves
+                                               are defined outside this
+                                               file.                        */
+{
+#if defined( GF_REPRESENTATION ) || !defined( NO_TABLES )
+    gf_t    scr;                            /* work buffer for the calls below */
+#endif
+#if defined(  GF_REPRESENTATION )
+    convert_representation(a, a, GF_REPRESENTATION);    /* into the multiplier's form */
+#endif
+
+#if defined( TABLES_64K )
+    gf_mul_64k(a, ctx->gf_t64k, scr);
+#elif defined( TABLES_8K )
+    gf_mul_8k(a, ctx->gf_t8k, scr);
+#elif defined( TABLES_4K )
+    gf_mul_4k(a, ctx->gf_t4k, scr);
+#elif defined( TABLES_256 )
+    gf_mul_256(a, ctx->gf_t256, scr);
+#else
+# if defined( GF_REPRESENTATION )
+    convert_representation(scr, ctx->ghash_h, GF_REPRESENTATION);   /* scr = converted hash key */
+    gf_mul(a, scr);
+# else
+    gf_mul(a, ctx->ghash_h);                /* no tables: use the stored hash key directly */
+# endif
+#endif
+
+#if defined(  GF_REPRESENTATION )
+    convert_representation(a, a, GF_REPRESENTATION);    /* same conversion applied again on the way out */
+#endif
+}
+
+/*  Start a new message under the key already set in ctx.
+
+    Builds the initial counter block in ctx->ctr_val from the IV, saves
+    its last four bytes in ctx->y0_val for the tag computation, and
+    clears both hash accumulators and all byte counters.
+
+    Always returns RETURN_GOOD.
+*/
+ret_type gcm_init_message(                  /* initialise a new message     */
+            const unsigned char iv[],       /* the initialisation vector    */
+            unsigned long iv_len,           /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   uint32_t i, n_pos = 0;
+    uint8_t *p;
+
+    memset(ctx->ctr_val, 0, BLOCK_SIZE);
+    if(iv_len == CTR_POS)
+    {
+        /* IV of exactly CTR_POS bytes: IV followed by a counter of 1   */
+        memcpy(ctx->ctr_val, iv, CTR_POS); UI8_PTR(ctx->ctr_val)[15] = 0x01;
+    }
+    else
+    {   /* any other length: hash the IV into the counter block         */
+        n_pos = iv_len;
+        while(n_pos >= BLOCK_SIZE)
+        {
+            /* iv is a caller buffer whose alignment is not checked     */
+            /* here, so use the xor that the other routines in this     */
+            /* file use when alignment has not been established         */
+            xor_block(ctx->ctr_val, ctx->ctr_val, iv);
+            n_pos -= BLOCK_SIZE;
+            iv += BLOCK_SIZE;
+            gf_mul_hh((gf_t*)ctx->ctr_val, ctx);
+        }
+
+        if(n_pos)
+        {
+            /* trailing partial block (the missing bytes count as zero) */
+            p = UI8_PTR(ctx->ctr_val);
+            while(n_pos-- > 0)
+                *p++ ^= *iv++;
+            gf_mul_hh((gf_t*)ctx->ctr_val, ctx);
+        }
+
+        /* final block: the IV length in bits, stored from the last     */
+        /* byte of the block backwards                                  */
+        n_pos = (iv_len << 3);
+        for(i = BLOCK_SIZE - 1; n_pos; --i, n_pos >>= 8)
+            UI8_PTR(ctx->ctr_val)[i] ^= (unsigned char)n_pos;
+        gf_mul_hh((gf_t*)ctx->ctr_val, ctx);
+    }
+
+    /* remember the initial counter field; gcm_compute_tag restores it  */
+    ctx->y0_val = *UI32_PTR(UI8_PTR(ctx->ctr_val) + CTR_POS);
+    memset(ctx->hdr_ghv, 0, BLOCK_SIZE);
+    memset(ctx->txt_ghv, 0, BLOCK_SIZE);
+    ctx->hdr_cnt = 0;
+    ctx->txt_ccnt = ctx->txt_acnt = 0;
+    return RETURN_GOOD;
+}
+
+/*  Absorb hdr_len bytes of header data into the header hash buffer
+    ctx->hdr_ghv.  May be called repeatedly; ctx->hdr_cnt records how
+    many bytes have been absorbed so far.  A buffer that has just been
+    filled is only multiplied by the hash key when further data arrives
+    (or, finally, in gcm_compute_tag).  Always returns RETURN_GOOD.
+*/
+ret_type gcm_auth_header(                   /* authenticate the header      */
+            const unsigned char hdr[],      /* the header buffer            */
+            unsigned long hdr_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   uint8_t *ghv = UI8_PTR(ctx->hdr_ghv);   /* byte view of the hash buffer */
+    uint32_t done = 0;                      /* input bytes consumed         */
+    uint32_t fill = (uint32_t)ctx->hdr_cnt & BLK_ADR_MASK; /* buffer offset */
+
+    if(hdr_len == 0)
+        return RETURN_GOOD;
+
+    /* a full buffer was left pending by an earlier call */
+    if(fill == 0 && ctx->hdr_cnt)
+        gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
+
+    if(((hdr - (ghv + fill)) & BUF_ADRMASK) == 0)
+    {
+        /* input and buffer share alignment: bytes up to a unit boundary, */
+        /* then whole units, then whole blocks                            */
+        for( ; done < hdr_len && (fill & BUF_ADRMASK); ++done, ++fill)
+            ghv[fill] ^= hdr[done];
+
+        for( ; done + BUF_INC <= hdr_len && fill <= BLOCK_SIZE - BUF_INC;
+                                        done += BUF_INC, fill += BUF_INC)
+            *UNIT_PTR(ghv + fill) ^= *UNIT_PTR(hdr + done);
+
+        for( ; done + BLOCK_SIZE <= hdr_len; done += BLOCK_SIZE)
+        {
+            gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
+            xor_block_aligned(ctx->hdr_ghv, ctx->hdr_ghv, hdr + done);
+        }
+    }
+    else
+    {
+        /* differing alignment: complete the buffer bytewise, then use */
+        /* the block xor that makes no alignment assumption            */
+        for( ; done < hdr_len && fill < BLOCK_SIZE; ++done, ++fill)
+            ghv[fill] ^= hdr[done];
+
+        for( ; done + BLOCK_SIZE <= hdr_len; done += BLOCK_SIZE)
+        {
+            gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
+            xor_block(ctx->hdr_ghv, ctx->hdr_ghv, hdr + done);
+        }
+    }
+
+    /* whatever is left is shorter than a block */
+    for( ; done < hdr_len; ++done)
+    {
+        if(fill == BLOCK_SIZE)
+        {
+            gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
+            fill = 0;
+        }
+        ghv[fill++] ^= hdr[done];
+    }
+
+    ctx->hdr_cnt += done;
+    return RETURN_GOOD;
+}
+
+/*  Absorb data_len bytes of ciphertext into the text hash buffer
+    ctx->txt_ghv.  May be called repeatedly; ctx->txt_acnt records how
+    many bytes have been absorbed so far.  A buffer that has just been
+    filled is only multiplied by the hash key when further data arrives
+    (or, finally, in gcm_compute_tag).  Always returns RETURN_GOOD.
+*/
+ret_type gcm_auth_data(                     /* authenticate ciphertext data */
+            const unsigned char data[],     /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   uint8_t *ghv = UI8_PTR(ctx->txt_ghv);   /* byte view of the hash buffer */
+    uint32_t done = 0;                      /* input bytes consumed         */
+    uint32_t fill = (uint32_t)ctx->txt_acnt & BLK_ADR_MASK; /* buffer offset */
+
+    if(data_len == 0)
+        return RETURN_GOOD;
+
+    /* a full buffer was left pending by an earlier call */
+    if(fill == 0 && ctx->txt_acnt)
+        gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
+
+    if(((data - (ghv + fill)) & BUF_ADRMASK) == 0)
+    {
+        /* input and buffer share alignment: bytes up to a unit boundary, */
+        /* then whole units, then whole blocks                            */
+        for( ; done < data_len && (fill & BUF_ADRMASK); ++done, ++fill)
+            ghv[fill] ^= data[done];
+
+        for( ; done + BUF_INC <= data_len && fill <= BLOCK_SIZE - BUF_INC;
+                                        done += BUF_INC, fill += BUF_INC)
+            *UNIT_PTR(ghv + fill) ^= *UNIT_PTR(data + done);
+
+        for( ; done + BLOCK_SIZE <= data_len; done += BLOCK_SIZE)
+        {
+            gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
+            xor_block_aligned(ctx->txt_ghv, ctx->txt_ghv, data + done);
+        }
+    }
+    else
+    {
+        /* differing alignment: complete the buffer bytewise, then use */
+        /* the block xor that makes no alignment assumption            */
+        for( ; done < data_len && fill < BLOCK_SIZE; ++done, ++fill)
+            ghv[fill] ^= data[done];
+
+        for( ; done + BLOCK_SIZE <= data_len; done += BLOCK_SIZE)
+        {
+            gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
+            xor_block(ctx->txt_ghv, ctx->txt_ghv, data + done);
+        }
+    }
+
+    /* whatever is left is shorter than a block */
+    for( ; done < data_len; ++done)
+    {
+        if(fill == BLOCK_SIZE)
+        {
+            gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
+            fill = 0;
+        }
+        ghv[fill++] ^= data[done];
+    }
+
+    ctx->txt_acnt += done;
+    return RETURN_GOOD;
+}
+
+/*  XOR data_len bytes of data in place with the counter mode key
+    stream.  Unused key stream bytes from the previous call (in
+    ctx->enc_ctr) are consumed first; each further block is produced by
+    incrementing ctx->ctr_val and encrypting it.  ctx->txt_ccnt records
+    the bytes processed so far.  Always returns RETURN_GOOD.
+*/
+ret_type gcm_crypt_data(                    /* encrypt or decrypt data      */
+            unsigned char data[],           /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   uint8_t *ks = UI8_PTR(ctx->enc_ctr);    /* byte view of the key stream  */
+    uint32_t done = 0;                      /* data bytes processed         */
+    uint32_t used = (uint32_t)ctx->txt_ccnt & BLK_ADR_MASK; /* key stream offset */
+
+    if(data_len == 0)
+        return RETURN_GOOD;
+
+    if(((data - (ks + used)) & BUF_ADRMASK) == 0)
+    {
+        /* data and key stream share alignment */
+        if(used)
+        {
+            /* finish the partly used key stream block: bytes up to a */
+            /* unit boundary, then whole units                        */
+            for( ; done < data_len && (used & BUF_ADRMASK); ++done, ++used)
+                data[done] ^= ks[used];
+
+            for( ; done + BUF_INC <= data_len && used <= BLOCK_SIZE - BUF_INC;
+                                            done += BUF_INC, used += BUF_INC)
+                *UNIT_PTR(data + done) ^= *UNIT_PTR(ks + used);
+        }
+
+        for( ; done + BLOCK_SIZE <= data_len; done += BLOCK_SIZE)
+        {
+            inc_ctr(ctx->ctr_val);
+            aes_encrypt(UI8_PTR(ctx->ctr_val), ks, ctx->aes);
+            xor_block_aligned(data + done, data + done, ctx->enc_ctr);
+        }
+    }
+    else
+    {
+        /* differing alignment: bytewise for the partial block, then */
+        /* the block xor that makes no alignment assumption          */
+        if(used)
+        {
+            for( ; done < data_len && used < BLOCK_SIZE; ++done, ++used)
+                data[done] ^= ks[used];
+        }
+
+        for( ; done + BLOCK_SIZE <= data_len; done += BLOCK_SIZE)
+        {
+            inc_ctr(ctx->ctr_val);
+            aes_encrypt(UI8_PTR(ctx->ctr_val), ks, ctx->aes);
+            xor_block(data + done, data + done, ctx->enc_ctr);
+        }
+    }
+
+    /* trailing bytes; a fresh key stream block is needed whenever the */
+    /* current one is exhausted or none has been started               */
+    for( ; done < data_len; ++done)
+    {
+        if(used == 0 || used == BLOCK_SIZE)
+        {
+            inc_ctr(ctx->ctr_val);
+            aes_encrypt(UI8_PTR(ctx->ctr_val), ks, ctx->aes);
+            used = 0;
+        }
+        data[done] ^= ks[used++];
+    }
+
+    ctx->txt_ccnt += done;
+    return RETURN_GOOD;
+}
+
+/*  Complete the hash and write the first tag_len bytes of the
+    authentication tag to tag.
+
+    Steps: both hash buffers receive their pending multiplication by the
+    hash key; when header data was supplied, the header hash is further
+    multiplied by the hash key raised to the number of 16 byte text
+    blocks; the text hash and the two bit lengths (header in bytes 0..7,
+    text in bytes 8..15, most significant byte first) are XORed in; the
+    result is multiplied by the hash key once more and XORed with the
+    encryption of the initial counter block.
+
+    The hash buffers and ctx->enc_ctr are modified, so this is to be
+    called once per message.
+
+    Returns RETURN_ERROR if tag_len exceeds the block size or if data
+    was both encrypted and authenticated but with different lengths,
+    RETURN_WARN if data was authenticated but none was encrypted or
+    decrypted, and RETURN_GOOD otherwise.
+*/
+ret_type gcm_compute_tag(                   /* compute authentication tag   */
+            unsigned char tag[],            /* the buffer for the tag       */
+            unsigned long tag_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   uint32_t i, ln;
+    gf_t tbuf;
+
+    /* the tag is cut from a single block; a longer request would read */
+    /* beyond the end of hdr_ghv and enc_ctr                           */
+    if(tag_len > BLOCK_SIZE)
+        return RETURN_ERROR;
+
+    if(ctx->txt_acnt != ctx->txt_ccnt && ctx->txt_ccnt > 0)
+        return RETURN_ERROR;
+
+    /* the multiplication still owed for the last block of each hash   */
+    gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
+    gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
+
+    if(ctx->hdr_cnt)
+    {
+        /* ln = number of text blocks, counting a partial one          */
+        ln = (uint32_t)((ctx->txt_acnt + BLOCK_SIZE - 1) / BLOCK_SIZE);
+        if(ln)
+        {
+#if 1       /* alternative versions of the exponentiation operation */
+            memcpy(tbuf, ctx->ghash_h, BLOCK_SIZE);
+#       if defined(  GF_REPRESENTATION )
+            convert_representation(tbuf, tbuf, GF_REPRESENTATION);
+            convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION);
+#       endif
+            /* square and multiply over the bits of ln, low bit first  */
+            for( ; ; )
+            {
+                if(ln & 1)
+                {
+                    gf_mul((void*)ctx->hdr_ghv, tbuf);
+                }
+                if(!(ln >>= 1))
+                    break;
+                gf_mul(tbuf, tbuf);
+            }
+#else       /* this one seems slower on x86 and x86_64 :-( */
+            i = ln | ln >> 1; i |= i >> 2; i |= i >> 4;
+            i |= i >> 8; i |= i >> 16; i &= ~(i >> 1);
+            memset(tbuf, 0, BLOCK_SIZE);
+            UI8_PTR(tbuf)[0] = 0x80;
+            while(i)
+            {
+#           if defined(  GF_REPRESENTATION )
+                convert_representation(tbuf, tbuf, GF_REPRESENTATION);
+#           endif
+                gf_mul(tbuf, tbuf);
+#           if defined(  GF_REPRESENTATION )
+                convert_representation(tbuf, tbuf, GF_REPRESENTATION);
+#           endif
+                if(i & ln)
+                    gf_mul_hh((gf_t*)tbuf, ctx);
+                i >>= 1;
+            }
+#           if defined(  GF_REPRESENTATION )
+            convert_representation(tbuf, tbuf, GF_REPRESENTATION);
+            convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION);
+#           endif
+            gf_mul((void*)ctx->hdr_ghv, tbuf);
+#endif
+#if defined(  GF_REPRESENTATION )
+            convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION);
+#endif
+        }
+    }
+
+    /* fold in the text hash and the length block, last byte first     */
+    i = BLOCK_SIZE;
+#ifdef BRG_UI64
+    {   uint64_t tm = ((uint64_t)ctx->txt_acnt) << 3;
+        while(i-- > 0)
+        {
+            UI8_PTR(ctx->hdr_ghv)[i] ^= UI8_PTR(ctx->txt_ghv)[i] ^ (unsigned char)tm;
+            /* after byte 8 switch from the text to the header length  */
+            tm = (i == 8 ? (((uint64_t)ctx->hdr_cnt) << 3) : tm >> 8);
+        }
+    }
+#else
+    {   uint32_t tm = ctx->txt_acnt << 3;
+
+        while(i-- > 0)
+        {
+            UI8_PTR(ctx->hdr_ghv)[i] ^= UI8_PTR(ctx->txt_ghv)[i] ^ (unsigned char)tm;
+            /* without a 64 bit type each length is fed in two halves  */
+            if(i & 3)
+                tm >>= 8;
+            else if(i == 4)
+                tm = ctx->txt_acnt >> 29;
+            else if(i == 8)
+                tm = ctx->hdr_cnt << 3;
+            else
+                tm = ctx->hdr_cnt >> 29;
+        }
+    }
+#endif
+
+    gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
+
+    /* rebuild the initial counter block and encrypt it                */
+    memcpy(ctx->enc_ctr, ctx->ctr_val, BLOCK_SIZE);
+    *UI32_PTR(UI8_PTR(ctx->enc_ctr) + CTR_POS) = ctx->y0_val;
+    aes_encrypt(UI8_PTR(ctx->enc_ctr), UI8_PTR(ctx->enc_ctr), ctx->aes);
+    for(i = 0; i < (unsigned int)tag_len; ++i)
+        tag[i] = (unsigned char)(UI8_PTR(ctx->hdr_ghv)[i] ^ UI8_PTR(ctx->enc_ctr)[i]);
+
+    return (ctx->txt_ccnt == ctx->txt_acnt ? RETURN_GOOD : RETURN_WARN);
+}
+
+/*  Wipe the whole context, which holds the AES key schedule and the
+    hash key.  The bytes are cleared through a volatile pointer so that
+    the stores cannot be discarded as dead writes by the optimiser.
+    Always returns RETURN_GOOD.
+*/
+ret_type gcm_end(                           /* clean up and end operation   */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   volatile unsigned char *p = (volatile unsigned char *)ctx;
+    size_t n = sizeof(gcm_ctx);
+
+    while(n--)
+        *p++ = 0;
+    return RETURN_GOOD;
+}
+
+/*  Encrypt data in place and then authenticate the resulting
+    ciphertext.  Returns RETURN_GOOD, or the status of the first step
+    that reports anything else.
+*/
+ret_type gcm_encrypt(                       /* encrypt & authenticate data  */
+            unsigned char data[],           /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   ret_type rc;
+
+    /* encryption comes first: it is the ciphertext that is hashed     */
+    rc = gcm_crypt_data(data, data_len, ctx);
+    if(rc != RETURN_GOOD)
+        return rc;
+
+    return gcm_auth_data(data, data_len, ctx);
+}
+
+/*  Authenticate the ciphertext in data and then decrypt it in place.
+    Returns RETURN_GOOD, or the status of the first step that reports
+    anything else.  The tag itself is not checked here; that is the
+    caller's job after gcm_compute_tag.
+*/
+ret_type gcm_decrypt(                       /* authenticate & decrypt data  */
+            unsigned char data[],           /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   ret_type rc;
+
+    /* hashing comes first, while data still holds the ciphertext      */
+    rc = gcm_auth_data(data, data_len, ctx);
+    if(rc != RETURN_GOOD)
+        return rc;
+
+    return gcm_crypt_data(data, data_len, ctx);
+}
+
+/*  One-shot encryption: start a message with iv, authenticate hdr,
+    encrypt and authenticate msg in place, then write tag_len bytes of
+    tag.  Returns RETURN_GOOD when the tag computation reports zero and
+    RETURN_ERROR for any other result.
+*/
+ret_type gcm_encrypt_message(               /* encrypt an entire message    */
+            const unsigned char iv[],       /* the initialisation vector    */
+            unsigned long iv_len,           /* and its length in bytes      */
+            const unsigned char hdr[],      /* the header buffer            */
+            unsigned long hdr_len,          /* and its length in bytes      */
+            unsigned char msg[],            /* the message buffer           */
+            unsigned long msg_len,          /* and its length in bytes      */
+            unsigned char tag[],            /* the buffer for the tag       */
+            unsigned long tag_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   ret_type tag_status;
+
+    gcm_init_message(iv, iv_len, ctx);
+    gcm_auth_header(hdr, hdr_len, ctx);
+    gcm_encrypt(msg, msg_len, ctx);
+
+    tag_status = gcm_compute_tag(tag, tag_len, ctx);
+    if(tag_status)
+        return RETURN_ERROR;
+    return RETURN_GOOD;
+}
+
+/*  One-shot decryption: start a message with iv, authenticate hdr,
+    authenticate and decrypt msg in place, then compare the computed tag
+    with the tag_len bytes supplied in tag.
+
+    Returns RETURN_GOOD only if the tag computation succeeds and the
+    tags are equal; otherwise RETURN_ERROR.  tag_len must not exceed the
+    block size.  msg has already been decrypted in place when a mismatch
+    is reported, so on RETURN_ERROR its contents must be discarded.
+
+    The comparison examines every tag byte regardless of where the first
+    difference lies, so that its duration does not reveal how many
+    leading bytes of a forged tag were correct.
+*/
+ret_type gcm_decrypt_message(               /* decrypt an entire message    */
+            const unsigned char iv[],       /* the initialisation vector    */
+            unsigned long iv_len,           /* and its length in bytes      */
+            const unsigned char hdr[],      /* the header buffer            */
+            unsigned long hdr_len,          /* and its length in bytes      */
+            unsigned char msg[],            /* the message buffer           */
+            unsigned long msg_len,          /* and its length in bytes      */
+            const unsigned char tag[],      /* the buffer for the tag       */
+            unsigned long tag_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1])                 /* the mode context             */
+{   uint8_t local_tag[BLOCK_SIZE] = { 0 };
+    uint8_t diff = 0;
+    unsigned long i;
+    ret_type rr;
+
+    /* local_tag holds one block; reject anything that would not fit   */
+    if(tag_len > BLOCK_SIZE)
+        return RETURN_ERROR;
+
+    gcm_init_message(iv, iv_len, ctx);
+    gcm_auth_header(hdr, hdr_len, ctx);
+    gcm_decrypt(msg, msg_len, ctx);
+    rr = gcm_compute_tag(local_tag, tag_len, ctx);
+
+    /* accumulate all differences instead of stopping at the first     */
+    for(i = 0; i < tag_len; ++i)
+        diff |= (uint8_t)(tag[i] ^ local_tag[i]);
+
+    return (rr != RETURN_GOOD || diff != 0) ? RETURN_ERROR : RETURN_GOOD;
+}
+
+#if defined(__cplusplus)
+}
+#endif
--- a/crypto/aes/aesgcm.h
+++ b/crypto/aes/aesgcm.h
@ -0,0 +1,233 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 11/01/2011
+
+ I am grateful for the work done by Mark Rodenkirch and Jason Papadopoulos
+ in helping to remove a bug in the operation of this code on big endian
+ systems when fast buffer operations are enabled.
+ ---------------------------------------------------------------------------
+*/
+
+#ifndef _GCM_H
+#define _GCM_H
+
+#include "aes.h"
+#include "gf128mul.h"
+
+/*  USER DEFINABLE OPTIONS (Further options need to be set in gf128mul.h) */
+
+/*  UNIT_BITS sets the size of variables used to process 16 byte buffers
+    when the buffer alignment allows this.  When buffers are processed
+    in bytes, 16 individual operations are involved.  But if, say, such
+    a buffer is divided into 4 32 bit variables, it can then be processed
+    in 4 operations, making the code typically much faster. In general
+    it will pay to use the longest natively supported size, which will
+    probably be 32 or 64 bits in 32 and 64 bit systems respectively.
+*/
+
+#if defined( UNIT_BITS )                    /* a value supplied by the
+                                               includer is discarded here,
+                                               so the selection below is
+                                               always the one in effect     */
+# undef UNIT_BITS
+#endif
+
+#if !defined( UNIT_BITS )
+#  if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+#    if 0
+#      define UNIT_BITS   8
+#    elif 0
+#      define UNIT_BITS  32
+#    elif 1
+#      define UNIT_BITS  64                 /* the enabled big endian choice */
+#    endif
+#  elif defined( _WIN64 )
+#    define UNIT_BITS 64
+#  else
+#    define UNIT_BITS 32                    /* all remaining targets         */
+#  endif
+#endif
+
+#if UNIT_BITS == 64 && !defined( NEED_UINT_64T )
+#  define NEED_UINT_64T
+#endif
+
+/* END OF USER DEFINABLE OPTIONS */
+
+/*  After encryption or decryption operations the return value of
+    'compute tag' will be one of the values RETURN_GOOD, RETURN_WARN
+    or RETURN_ERROR, the latter indicating an error. A return value
+    RETURN_GOOD indicates that both encryption and authentication
+    have taken place and resulted in the returned tag value. If
+    the returned value is RETURN_WARN, the tag value is the result
+    of authentication alone without encryption (CCM) or decryption
+    (GCM and EAX).
+*/
+#ifndef RETURN_GOOD
+# define RETURN_WARN      1
+# define RETURN_GOOD      0
+# define RETURN_ERROR    -1
+#endif
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#ifndef RET_TYPE_DEFINED
+  typedef int  ret_type;
+#endif
+UNIT_TYPEDEF(gcm_unit_t, UNIT_BITS);
+BUFR_TYPEDEF(gcm_buf_t, UNIT_BITS, AES_BLOCK_SIZE);
+
+#define GCM_BLOCK_SIZE  AES_BLOCK_SIZE
+
+/* The GCM-AES  context  */
+
+typedef struct                              /* state for one key and for the
+                                               message in progress; a
+                                               multiplication table is
+                                               included only for the
+                                               TABLES_* options that are
+                                               defined, and the byte
+                                               counters are 32 bits wide    */
+{
+#if defined( TABLES_64K )
+    gf_t64k_a       gf_t64k;
+#endif
+#if defined( TABLES_8K )
+    gf_t8k_a        gf_t8k;
+#endif
+#if defined( TABLES_4K )
+    gf_t4k_a        gf_t4k;
+#endif
+#if defined( TABLES_256 )
+    gf_t256_a       gf_t256;
+#endif
+    gcm_buf_t       ctr_val;                /* CTR counter value            */
+    gcm_buf_t       enc_ctr;                /* encrypted CTR block          */
+    gcm_buf_t       hdr_ghv;                /* ghash buffer (header)        */
+    gcm_buf_t       txt_ghv;                /* ghash buffer (ciphertext)    */
+    gf_t            ghash_h;                /* ghash H value                */
+    aes_encrypt_ctx aes[1];                 /* AES encryption context       */
+    uint32_t        y0_val;                 /* initial counter value        */
+    uint32_t        hdr_cnt;                /* header bytes so far          */
+    uint32_t        txt_ccnt;               /* text bytes so far (encrypt)  */
+    uint32_t        txt_acnt;               /* text bytes so far (auth)     */
+} gcm_ctx;
+
+/* The following calls handle mode initialisation, keying and completion    */
+
+ret_type gcm_init_and_key(                  /* initialise mode and set key  */
+            const unsigned char key[],      /* the key value                */
+            unsigned long key_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+ret_type gcm_end(                           /* clean up and end operation   */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+/* The following calls handle complete messages in memory as one operation  */
+
+ret_type gcm_encrypt_message(               /* encrypt an entire message    */
+            const unsigned char iv[],       /* the initialisation vector    */
+            unsigned long iv_len,           /* and its length in bytes      */
+            const unsigned char hdr[],      /* the header buffer            */
+            unsigned long hdr_len,          /* and its length in bytes      */
+            unsigned char msg[],            /* the message buffer           */
+            unsigned long msg_len,          /* and its length in bytes      */
+            unsigned char tag[],            /* the buffer for the tag       */
+            unsigned long tag_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+                                /* RETURN_GOOD is returned if the input tag */
+                                /* matches that for the decrypted message   */
+ret_type gcm_decrypt_message(               /* decrypt an entire message    */
+            const unsigned char iv[],       /* the initialisation vector    */
+            unsigned long iv_len,           /* and its length in bytes      */
+            const unsigned char hdr[],      /* the header buffer            */
+            unsigned long hdr_len,          /* and its length in bytes      */
+            unsigned char msg[],            /* the message buffer           */
+            unsigned long msg_len,          /* and its length in bytes      */
+            const unsigned char tag[],      /* the buffer for the tag       */
+            unsigned long tag_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+/* The following calls handle messages in a sequence of operations followed */
+/* by tag computation after the sequence has been completed. In these calls */
+/* the user is responsible for verifying the computed tag on decryption     */
+
+ret_type gcm_init_message(                  /* initialise a new message     */
+            const unsigned char iv[],       /* the initialisation vector    */
+            unsigned long iv_len,           /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+ret_type gcm_auth_header(                   /* authenticate the header      */
+            const unsigned char hdr[],      /* the header buffer            */
+            unsigned long hdr_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+ret_type gcm_encrypt(                       /* encrypt & authenticate data  */
+            unsigned char data[],           /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+ret_type gcm_decrypt(                       /* authenticate & decrypt data  */
+            unsigned char data[],           /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+ret_type gcm_compute_tag(                   /* compute authentication tag   */
+            unsigned char tag[],            /* the buffer for the tag       */
+            unsigned long tag_len,          /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+/*  The use of the following calls should be avoided if possible because 
+    their use requires a very good understanding of the way this encryption 
+    mode works and the way in which this code implements it in order to use 
+    them correctly.
+
+    The gcm_auth_data routine is used to authenticate encrypted message data.
+    In message encryption gcm_crypt_data must be called before gcm_auth_data
+    is called since it is encrypted data that is authenticated.  In message
+    decryption authentication must occur before decryption and data can be
+    authenticated without being decrypted if necessary.
+
+    If these calls are used it is up to the user to ensure that these routines
+    are called in the correct order and that the correct data is passed to 
+    them.
+
+    When gcm_compute_tag is called it is assumed that an error in use has
+    occurred if both encryption (or decryption) and authentication have taken
+    place but the total lengths of the message data respectively authenticated
+    and encrypted are not the same. If authentication has taken place but 
+    there has been no corresponding encryption or decryption operations (none
+    at all) only a warning is issued. This should be treated as an error if it 
+    occurs during encryption but it is only signalled as a warning as it might 
+    be intentional when decryption operations are involved (this avoids having
+    different compute tag functions for encryption and decryption). Decryption
+    operations can be undertaken freely after authentication but if the tag is
+    computed after such operations an error will be signalled if the lengths
+    of the data authenticated and decrypted don't match.
+*/
+
+ret_type gcm_auth_data(                     /* authenticate ciphertext data */
+            const unsigned char data[],     /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+ret_type gcm_crypt_data(                    /* encrypt or decrypt data      */
+            unsigned char data[],           /* the data buffer              */
+            unsigned long data_len,         /* and its length in bytes      */
+            gcm_ctx ctx[1]);                /* the mode context             */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/crypto/aes/brg_endian.h
+++ b/crypto/aes/brg_endian.h
@ -0,0 +1,29 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 10/09/2018
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#endif
--- a/crypto/aes/gf128mul.c
+++ b/crypto/aes/gf128mul.c
@ -0,0 +1,471 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 20/12/2007
+
+ This file provides fast multiplication in GF(2^128) as required by several
+ cryptographic authentication modes (see gf128mul.h).
+*/
+
+/*  Speed critical loops can be unrolled to gain speed but consume more memory */
+#if 1
+#  define UNROLL_LOOPS
+#endif
+
+/* The order of these includes matters */
+#include "mode_hdr.h"
+#include "gf128mul.h"
+#include "gf_mul_lo.h"
+
+#if defined( GF_MODE_LL )
+#  define mode   _ll
+#elif defined( GF_MODE_BL )
+#  define mode   _bl
+#elif defined( GF_MODE_LB )
+#  define mode   _lb
+#elif defined( GF_MODE_BB )
+#  define mode   _bb
+#else
+#  error mode is not defined
+#endif
+
+#if defined( GF_MODE_LL) || defined( GF_MODE_LB )
+#  define GF_INDEX(i)  (i)
+#else
+#  define GF_INDEX(i)  (15 - (i))
+#endif
+
+/*  A slow field multiplier that needs no precomputed tables.
+
+    On return a holds the product of the field elements a and b; b is
+    only read.  p[0] is a copy of a and each p[i + 1] is produced from
+    p[i] by gf_mulx1.  The bytes of b are then taken from positions
+    GF_INDEX(15) down to GF_INDEX(0): for every bit X_n set in a byte
+    p[n] is xored into a, and between bytes a is multiplied by x^8.
+*/
+
+void gf_mul(gf_t a, const gf_t b)
+{   gf_t p[8];
+    const uint8_t *q;   /* byte view of the multiplier (never written) */
+    uint8_t ch;
+    int i;
+
+    copy_block_aligned(p[0], a);
+    for(i = 0; i < 7; ++i)
+        gf_mulx1(mode)(p[i + 1], p[i]);
+
+    /* a is zeroed below, so when b aliases a read the saved copy p[0] */
+    q = (const uint8_t*)(a == b ? p[0] : b);
+    memset(a, 0, GF_BYTE_LEN);
+    for(i = 15 ;  ; )
+    {
+        ch = q[GF_INDEX(i)];
+        if(ch & X_0)
+            xor_block_aligned(a, a, p[0]);
+        if(ch & X_1)
+            xor_block_aligned(a, a, p[1]);
+        if(ch & X_2)
+            xor_block_aligned(a, a, p[2]);
+        if(ch & X_3)
+            xor_block_aligned(a, a, p[3]);
+        if(ch & X_4)
+            xor_block_aligned(a, a, p[4]);
+        if(ch & X_5)
+            xor_block_aligned(a, a, p[5]);
+        if(ch & X_6)
+            xor_block_aligned(a, a, p[6]);
+        if(ch & X_7)
+            xor_block_aligned(a, a, p[7]);
+        /* leave after the last byte without a trailing multiply by x^8 */
+        if(!i--)
+            break;
+        gf_mulx8(mode)(a);
+    }
+}
+
+#if defined( TABLES_64K )
+
+/*  This version uses 64k bytes of table space on the stack.
+    An input variable field value in a[] has to be multiplied
+    by a key value in g[] that changes far less frequently.
+
+    To do this a[] is split up into 16 smaller field values,
+    each one byte in length. For the 256 values of each of
+    these smaller values, we can precompute the result of
+    multiplying g by this field value. We can then combine
+    these values to provide the full multiply. So for each
+    of 16 bytes we have a table of 256 field values each of
+    16 bytes - 64k bytes in total.
+*/
+
+/*  Build the table t (16 sub-tables of 256 field elements) used by
+    gf_mul_64k from the field element g.  Sub-table 0 is derived from g
+    itself; each later sub-table i is seeded from sub-table i - 1 with
+    gf_mulx8 and then completed by xoring entries together.
+*/
+void init_64k_table(const gf_t g, gf_t64k_t t)
+{   int i = 0, j, k;
+
+    /*
+    depending on the representation we have to process bits
+    within bytes high to low (0xe1 style ) or low to high
+    (0x87 style).  We start with g and its multiples by x, x^2
+    .. x^7 and put them in t[0][1], t[0][2] .. t[0][128] or in
+    t[0][128], t[0][64] .. t[0][1] depending on the bit order
+    in use.
+    */
+
+    /* clear the element for the zero field element */
+    memset(t[0][0], 0, GF_BYTE_LEN);
+
+#if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
+
+    /* g -> t[0][1], generate t[0][2] ...           */
+    memcpy(t[0][1], g, GF_BYTE_LEN);
+    for(j = 1; j <= 64; j <<= 1)
+        gf_mulx1(mode)(t[0][j + j], t[0][j]);
+#else
+
+    /* g -> t[0][128], generate t[0][64] ...        */
+    memcpy(t[0][128], g, GF_BYTE_LEN);
+    for(j = 64; j >= 1; j >>= 1)
+        gf_mulx1(mode)(t[0][j], t[0][j + j]);
+#endif
+
+    for( ; ; )
+    {
+        /*  if { n } stands for the field value represented by
+            the integer n, we can express higher multiplies in
+            the table as follows:
+
+                1. g * { 3} = g * {2} ^ g * {1}
+
+                2. g * { 5} = g * {4} ^ g * {1}
+                   g * { 6} = g * {4} ^ g * {2}
+                   g * { 7} = g * {4} ^ g * {3}
+
+                3. g * { 9} = g * {8} ^ g * {1}
+                   g * {10} = g * {8} ^ g * {2}
+                   ....
+
+           and so on.  This is what the following loops do.
+        */
+        for(j = 2; j < 256; j += j)
+            for(k = 1; k < j; ++k)
+                xor_block_aligned(t[i][j + k], t[i][j], t[i][k]);
+
+        if(++i == GF_BYTE_LEN)  /* all 16 byte positions done */
+            return;
+
+        /*  We now move to the next byte up and set up its eight
+            starting values by multiplying the values in the
+            lower table by x^8
+        */
+        memset(t[i][0], 0, GF_BYTE_LEN);
+        for(j = 128; j > 0; j >>= 1)
+        {
+            memcpy(t[i][j], t[i - 1][j], GF_BYTE_LEN);
+            gf_mulx8(mode)(t[i][j]);
+        }
+    }
+}
+
+#define xor_64k(i,ap,t,r) xor_block_aligned(r, r, t[i][ap[GF_INDEX(i)]])
+
+#if defined( UNROLL_LOOPS )
+
+void gf_mul_64k(gf_t a, const  gf_t64k_t t, gf_t r)
+{   uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    xor_64k(15, ap, t, r); xor_64k(14, ap, t, r);
+    xor_64k(13, ap, t, r); xor_64k(12, ap, t, r);
+    xor_64k(11, ap, t, r); xor_64k(10, ap, t, r);
+    xor_64k( 9, ap, t, r); xor_64k( 8, ap, t, r);
+    xor_64k( 7, ap, t, r); xor_64k( 6, ap, t, r);
+    xor_64k( 5, ap, t, r); xor_64k( 4, ap, t, r);
+    xor_64k( 3, ap, t, r); xor_64k( 2, ap, t, r);
+    xor_64k( 1, ap, t, r); xor_64k( 0, ap, t, r);
+    copy_block_aligned(a, r);
+}
+
+#else
+
+void gf_mul_64k(gf_t a, const  gf_t64k_t t, gf_t r)
+{   int i;
+    uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    for(i = 15; i >= 0; --i)
+    {
+        xor_64k(i,ap,t,r);
+    }
+    copy_block_aligned(a, r);
+}
+
+#endif
+
+#endif
+
+#if defined( TABLES_8K )
+
+/*  This version uses 8k bytes of table space on the stack.
+    An input field value in a[] has to be multiplied by a
+    key value in g[]. To do this a[] is split up into 32
+    smaller field values each 4-bits in length. For the
+    16 values of each of these smaller field values we can
+    precompute the result of multiplying g[] by the field
+    value in question. So for each of 32 nibbles we have a
+    table of 16 field values, each of 16 bytes - 8k bytes
+    in total.
+*/
+void init_8k_table(const gf_t g, gf_t8k_t t)
+{   int i = 0, j, k;
+
+    /*  do the low 4-bit nibble first - t[0][16] - and note
+        that the unit multiplier sits at 0x01 - t[0][1] in
+        the table. Then multiplies by x go at 2, 4, 8
+    */
+    /* set the table elements for a zero multiplier */
+    memset(t[0][0], 0, GF_BYTE_LEN);
+    memset(t[1][0], 0, GF_BYTE_LEN);
+
+#if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
+
+    /* t[0][1] = g, compute t[0][2], t[0][4], t[0][8]   */
+    memcpy(t[0][1], g, GF_BYTE_LEN);
+    for(j = 1; j <= 4; j <<= 1)
+        gf_mulx1(mode)(t[0][j + j], t[0][j]);
+    /* t[1][1] = t[0][1] * x^4 = t[0][8] * x            */
+    gf_mulx1(mode)(t[1][1], t[0][8]);
+    for(j = 1; j <= 4; j <<= 1)
+        gf_mulx1(mode)(t[1][j + j], t[1][j]);
+#else
+
+    /* g -> t[1][8], compute t[1][4], t[1][2], t[1][1]  */
+    memcpy(t[1][8], g, GF_BYTE_LEN);
+    for(j = 4; j >= 1; j >>= 1)
+        gf_mulx1(mode)(t[1][j], t[1][j + j]);
+    /* t[0][8] = t[1][8] * x^4 = t[1][1] * x            */
+    gf_mulx1(mode)(t[0][8], t[1][1]);
+    for(j = 4; j >= 1; j >>= 1)
+        gf_mulx1(mode)(t[0][j], t[0][j + j]);
+#endif
+
+    for( ; ; )
+    {
+        /* fill the remaining entries of table i: t[i][j + k] = t[i][j] ^ t[i][k] */
+        for(j = 2; j < 16; j += j)
+            for(k = 1; k < j; ++k)
+                xor_block_aligned(t[i][j + k], t[i][j], t[i][k]);
+
+        if(++i == 2 * GF_BYTE_LEN)
+            return;
+
+        /* tables 0 and 1 were seeded above; each later table i is seeded
+           from table i - 2 with its single-bit entries multiplied by x^8 */
+        if(i > 1)
+        {
+            memset(t[i][0], 0, GF_BYTE_LEN);
+            for(j = 8; j > 0; j >>= 1)
+            {
+                memcpy(t[i][j], t[i - 2][j], GF_BYTE_LEN);
+                gf_mulx8(mode)(t[i][j]);
+            }
+        }
+
+    }
+}
+
+/*  xor into r the table entries selected by the low and the high nibble
+    of byte GF_INDEX(i) of ap; wrapped in do/while(0) so that the macro
+    is a single statement wherever it is used
+*/
+#define xor_8k(i,ap,t,r)   do { \
+    xor_block_aligned(r, r, t[(i) + (i)][ap[GF_INDEX(i)] & 15]); \
+    xor_block_aligned(r, r, t[(i) + (i) + 1][ap[GF_INDEX(i)] >> 4]); \
+    } while(0)
+
+#if defined( UNROLL_LOOPS )
+
+void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r)
+{   uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    xor_8k(15, ap, t, r); xor_8k(14, ap, t, r);
+    xor_8k(13, ap, t, r); xor_8k(12, ap, t, r);
+    xor_8k(11, ap, t, r); xor_8k(10, ap, t, r);
+    xor_8k( 9, ap, t, r); xor_8k( 8, ap, t, r);
+    xor_8k( 7, ap, t, r); xor_8k( 6, ap, t, r);
+    xor_8k( 5, ap, t, r); xor_8k( 4, ap, t, r);
+    xor_8k( 3, ap, t, r); xor_8k( 2, ap, t, r);
+    xor_8k( 1, ap, t, r); xor_8k( 0, ap, t, r);
+    copy_block_aligned(a, r);
+}
+
+#else
+
+/*  Loop (non-unrolled) form of the 8k table multiply: r is cleared, the
+    table entries selected by the two nibbles of each byte of a are xored
+    into it, and the result is copied back into a.
+*/
+void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r)
+{   int i;
+    uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    for(i = 15; i >= 0; --i)
+    {
+        xor_8k(i,ap,t,r);
+    }
+    /* same block copy as the unrolled variant and the other table sizes */
+    copy_block_aligned(a, r);
+}
+
+#endif
+
+#endif
+
+#if defined( TABLES_4K )
+
+/*  This version uses 4k bytes of table space on the stack.
+    A 16 byte buffer has to be multiplied by a 16 byte key
+    value in GF(128).  If we consider a GF(128) value in a
+    single byte, we can construct a table of the 256 16
+    byte values that result from multiplying g by the 256
+    values of this byte.  This requires 4096 bytes.
+
+    If we take the highest byte in the buffer and use this
+    table to multiply it by g, we then have to multiply it
+    by x^120 to get the final value. For the next highest
+    byte the result has to be multiplied by x^112 and so on.
+
+    But we can do this by accumulating the result in an
+    accumulator starting with the result for the top byte.
+    We repeatedly multiply the accumulator value by x^8 and
+    then add in (i.e. xor) the 16 bytes of the next lower
+    byte in the buffer, stopping when we reach the lowest
+    byte. This requires a 4096 byte table.
+*/
+
+/*  Build the 256 entry table t used by gf_mul_4k from the field element g.
+    The eight single-bit entries are g and its successive gf_mulx1
+    multiples; every other entry is the xor of two entries already built
+    (t[0] is the zero element).
+*/
+void init_4k_table(const gf_t g, gf_t4k_t t)
+{   int j, k;
+
+    memset(t[0], 0, GF_BYTE_LEN);
+
+#if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
+
+    /* t[1] = g, then t[2], t[4] .. t[128], each from the previous one */
+    memcpy(t[1], g, GF_BYTE_LEN);
+    for(j = 1; j <= 64; j <<= 1)
+        gf_mulx1(mode)(t[j + j], t[j]);
+#else
+
+    /* t[128] = g, then t[64], t[32] .. t[1], each from the previous one */
+    memcpy(t[128], g, GF_BYTE_LEN);
+    for(j = 64; j >= 1; j >>= 1)
+        gf_mulx1(mode)(t[j], t[j + j]);
+#endif
+
+    /* t[j + k] = t[j] ^ t[k] for every power of two j and every k < j */
+    for(j = 2; j < 256; j += j)
+        for(k = 1; k < j; ++k)
+            xor_block_aligned(t[j + k], t[j], t[k]);
+}
+
+/*  multiply r by x^8 and then xor in the table entry selected by byte
+    GF_INDEX(i) of ap; do/while(0) makes the two steps a single statement
+*/
+#define xor_4k(i,ap,t,r) do { gf_mulx8(mode)(r); \
+    xor_block_aligned(r, r, t[ap[GF_INDEX(i)]]); } while(0)
+
+#if defined( UNROLL_LOOPS )
+
+/*  Unrolled 4k table multiply.  r is cleared and then, for positions 15
+    down to 0, xor_4k multiplies r by x^8 and xors in the entry of t that
+    is selected by byte GF_INDEX(i) of a; finally r is copied back into a,
+    so both a and r hold the result on return.
+*/
+void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r)
+{   uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    xor_4k(15, ap, t, r); xor_4k(14, ap, t, r);
+    xor_4k(13, ap, t, r); xor_4k(12, ap, t, r);
+    xor_4k(11, ap, t, r); xor_4k(10, ap, t, r);
+    xor_4k( 9, ap, t, r); xor_4k( 8, ap, t, r);
+    xor_4k( 7, ap, t, r); xor_4k( 6, ap, t, r);
+    xor_4k( 5, ap, t, r); xor_4k( 4, ap, t, r);
+    xor_4k( 3, ap, t, r); xor_4k( 2, ap, t, r);
+    xor_4k( 1, ap, t, r); xor_4k( 0, ap, t, r);
+    copy_block_aligned(a, r);
+}
+
+#else
+
+/*  Loop (non-unrolled) form of the 4k table multiply: r is cleared, then
+    for positions 15 down to 0 xor_4k multiplies r by x^8 and xors in the
+    table entry selected by byte GF_INDEX(i) of a; r is finally copied
+    back into a.
+*/
+void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r)
+{   int i;      /* set by the for statement below */
+    uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    for(i = 15; i >= 0; --i)
+    {
+        xor_4k(i, ap, t, r);
+    }
+    copy_block_aligned(a, r);
+}
+
+#endif
+
+#endif
+
+#if defined( TABLES_256 )
+
+/*  This version uses 256 bytes of table space on the stack.
+    A 16 byte buffer has to be multiplied by a 16 byte key
+    value in GF(128).  If we consider a GF(128) value in a
+    single 4-bit nibble, we can construct a table of the 16
+    16 byte values that result from the 16 values of this
+    nibble.  This requires 256 bytes. If we take the highest
+    4-bit nibble in the buffer and use this table to get the
+    result, we then have to multiply by x^124 to get the
+    final value. For the next highest nibble the result has to
+    be multiplied by x^120 and so on. But we can do this by
+    accumulating the result in an accumulator starting with
+    the result for the top nibble.  We repeatedly multiply
+    the accumulator value by x^4 and then add in (i.e. xor)
+    the 16 bytes of the next lower nibble in the buffer,
+    stopping when we reach the lowest nibble. This uses a
+    256 byte table.
+*/
+
+/*  Build the 16 entry table t used by gf_mul_256 from the field element g.
+    The four single-bit entries are g and its successive gf_mulx1
+    multiples; every other entry is the xor of two entries already built
+    (t[0] is the zero element).
+*/
+void init_256_table(const gf_t g, gf_t256_t t)
+{   int j, k;
+
+    memset(t[0], 0, GF_BYTE_LEN);
+
+#if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
+
+    /* t[1] = g, then t[2], t[4], t[8], each from the previous one */
+    memcpy(t[1], g, GF_BYTE_LEN);
+    for(j = 1; j <= 4; j <<= 1)
+        gf_mulx1(mode)(t[j + j], t[j]);
+#else
+
+    /* t[8] = g, then t[4], t[2], t[1], each from the previous one */
+    memcpy(t[8], g, GF_BYTE_LEN);
+    for(j = 4; j >= 1; j >>= 1)
+        gf_mulx1(mode)(t[j], t[j + j]);
+#endif
+
+    /* t[j + k] = t[j] ^ t[k] for every power of two j and every k < j */
+    for(j = 2; j < 16; j += j)
+        for(k = 1; k < j; ++k)
+            xor_block_aligned(t[j + k], t[j], t[k]);
+}
+
+/*  x_lo / x_hi multiply r by x^4 and then xor in the table entry selected
+    by the low / high nibble of byte GF_INDEX(i) of ap.  xor_256 applies
+    both to one byte, in the nibble order the representation requires.
+    Each macro is wrapped in do/while(0) so that it is a single statement.
+*/
+#define x_lo(i,ap,t,r) do { gf_mulx4(mode)(r); \
+    xor_block_aligned(r, r, t[ap[GF_INDEX(i)] & 0x0f]); } while(0)
+#define x_hi(i,ap,t,r) do { gf_mulx4(mode)(r); \
+    xor_block_aligned(r, r, t[ap[GF_INDEX(i)] >> 4]); } while(0)
+
+#if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
+#define xor_256(a,b,c,d)    do { x_hi(a,b,c,d);  x_lo(a,b,c,d); } while(0)
+#else
+#define xor_256(a,b,c,d)    do { x_lo(a,b,c,d);  x_hi(a,b,c,d); } while(0)
+#endif
+
+#if defined( UNROLL_LOOPS )
+
+void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r)
+{   uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    xor_256(15, ap, t, r); xor_256(14, ap, t, r);
+    xor_256(13, ap, t, r); xor_256(12, ap, t, r);
+    xor_256(11, ap, t, r); xor_256(10, ap, t, r);
+    xor_256( 9, ap, t, r); xor_256( 8, ap, t, r);
+    xor_256( 7, ap, t, r); xor_256( 6, ap, t, r);
+    xor_256( 5, ap, t, r); xor_256( 4, ap, t, r);
+    xor_256( 3, ap, t, r); xor_256( 2, ap, t, r);
+    xor_256( 1, ap, t, r); xor_256( 0, ap, t, r);
+    copy_block_aligned(a, r);
+}
+
+#else
+
+void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r)
+{   int i;
+    uint8_t *ap = (uint8_t*)a;
+    memset(r, 0, GF_BYTE_LEN);
+    for(i = 15; i >= 0; --i)
+    {
+        xor_256(i, ap, t, r);
+    }
+    copy_block_aligned(a, r);
+}
+
+#endif
+
+#endif
--- a/crypto/aes/gf128mul.h
+++ b/crypto/aes/gf128mul.h
@ -0,0 +1,215 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 11/01/2011
+
+ I am grateful for the work done by Mark Rodenkirch and Jason Papadopoulos
+ in helping to remove a bug in the operation of this code on big endian
+ systems when fast buffer operations are enabled.
+ ---------------------------------------------------------------------------
+
+ An implementation of field multiplication in the Galois Field GF(2^128)
+
+ A polynomial representation is used for the field with the coefficients
+ held in bit sequences in which the bit numbers are the powers of x that
+ a bit represents. The field polynomial used is (x^128+x^7+x^2+x+1).
+ 
+ The obvious way of representing field elements in a computer system is 
+ to map 'x' in the field to the binary integer '2'. But this was way too
+ obvious for cryptographers!
+ 
+ Here bytes are numbered in their memory order and bits within bytes are
+ numbered according to their integer numeric significance (that is as is 
+ now normal with bit 0 representing unity). The term 'little endian' 
+ will then be used to describe mappings where numeric (power of 2) or field
+ (power of x) significance increases with increasing bit or byte numbers 
+ with 'big endian' being used to describe the inverse situation.  
+
+ The GF bit sequence can then be mapped onto 8-bit bytes in computer 
+ memory in one of four simple ways:
+
+     A mapping in which x maps to the integer 2 in little endian 
+     form for both bytes and bits within bytes:
+     
+         LL: bit for x^n ==> bit for 2^(n % 8) in byte[n / 8]
+
+     A mapping in which x maps to the integer 2 in big endian form 
+     for both bytes and bits within bytes:
+
+         BL: bit for x^n ==> bit for 2^(n % 8) in byte[15 - n / 8]
+ 
+     A little endian mapping for bytes but with the bits within 
+     bytes in reverse order (big endian bits):
+
+         LB: bit for x^n ==> bit for 2^(7 - n % 8) in byte[n / 8]
+
+     A big endian mapping for bytes but with the bits within 
+     bytes in reverse order (big endian bits):
+
+         BB: bit for x^n ==> bit for 2^(7 - n % 8) in byte[15 - n / 8]
+
+ 128-bit field elements are represented by 16 byte buffers but for
+ processing efficiency reasons it is often desirable to process arrays 
+ of bytes using longer types such as, for example, unsigned long values. 
+ The type used for representing these buffers will be called a 'gf_unit' 
+ and the buffer itself will be referred to as a 'gf_t' type.
+
+ The field multiplier is based on the assumption that one of the two
+ field elements involved in multiplication will change only relatively
+ infrequently, making it worthwhile to precompute tables to speed up
+ multiplication by this value. 
+*/
+
+#ifndef _GF128MUL_H
+#define _GF128MUL_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "brg_endian.h"
+
+/* USER DEFINABLE OPTIONS */
+/*  UNIT_BITS sets the size of variables used to process 16 byte buffers
+    when the buffer alignment allows this.  When buffers are processed
+    in bytes, 16 individual operations are involved.  But if, say, such
+    a buffer is divided into 4 32 bit variables, it can then be processed 
+    in 4 operations, making the code typically much faster. In general
+    it will pay to use the longest natively supported size, which will
+    probably be 32 or 64 bits in 32 and 64 bit systems respectively.
+*/
+
+#if defined( UNIT_BITS )
+# undef UNIT_BITS
+#endif
+
+#if !defined( UNIT_BITS )
+#  if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+#    if 0
+#      define UNIT_BITS   8
+#    elif 0
+#      define UNIT_BITS  32
+#    elif 1
+#      define UNIT_BITS  64
+#    endif
+#  elif defined( _WIN64 )
+#    define UNIT_BITS 64
+#  else
+#    define UNIT_BITS 32
+#  endif
+#endif
+
+#if UNIT_BITS == 64 && !defined( NEED_UINT_64T )
+#  define NEED_UINT_64T
+#endif
+
+#include "brg_types.h"
+
+/* Choose the Galois Field representation to use (see above) */
+#if 0
+#  define GF_MODE_LL
+#elif 0
+#  define GF_MODE_BL
+#elif 1
+#  define GF_MODE_LB    /* the representation used by GCM */
+#elif 0
+#  define GF_MODE_BB
+#else
+#  error mode is not defined
+#endif
+
+/*  Table sizes for GF(128) Multiply.  Normally larger tables give 
+    higher speed but cache loading might change this. Normally only 
+    one table size (or none at all) will be specified here
+*/
+#if 0
+#  define TABLES_64K
+#endif
+#if 0
+#  define TABLES_8K
+#endif
+#if 1
+#  define TABLES_4K
+#endif
+#if 0
+#  define TABLES_256
+#endif
+
+/* END OF USER DEFINABLE OPTIONS */
+
+#if !(defined( TABLES_64K ) || defined( TABLES_8K ) \
+    || defined( TABLES_4K ) || defined( TABLES_256 ))
+#  define NO_TABLES
+#endif
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#define GF_BYTE_LEN 16
+#define GF_UNIT_LEN (GF_BYTE_LEN / (UNIT_BITS >> 3))
+
+UNIT_TYPEDEF(gf_unit_t, UNIT_BITS);
+BUFR_TYPEDEF(gf_t, UNIT_BITS, GF_BYTE_LEN);
+
+/*  Code for conversion between the four different galois field representations 
+    is optionally available using gf_convert.c
+*/
+
+typedef enum { REVERSE_NONE = 0, REVERSE_BITS = 1, REVERSE_BYTES = 2 } transform;
+
+void convert_representation(gf_t dest, const gf_t source, transform rev);
+
+void gf_mul(gf_t a, const gf_t b);      /* slow field multiply  */  
+
+/* types and calls for 64k table driven field multiplier        */
+
+typedef gf_t    gf_t64k_a[16][256]; 
+typedef gf_t    (*gf_t64k_t)[256];
+
+void init_64k_table(const gf_t g, gf_t64k_t t);
+/* r is the working buffer: it is cleared, accumulates the product and is
+   copied back into a; typed gf_t to match the definition in gf128mul.c  */
+void gf_mul_64k(gf_t a, const gf_t64k_t t, gf_t r);
+
+/* types and calls for 8k table driven field multiplier        */
+
+typedef gf_t    gf_t8k_a[32][16];
+typedef gf_t    (*gf_t8k_t)[16];
+
+void init_8k_table(const gf_t g, gf_t8k_t t);
+void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r);
+
+/* types and calls for 4k table driven field multiplier        */
+
+typedef gf_t    gf_t4k_a[256];
+typedef gf_t    (*gf_t4k_t);
+
+void init_4k_table(const gf_t g, gf_t4k_t t);
+void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r);
+
+/* types and calls for 256 byte table driven field multiplier  */
+
+typedef gf_t    gf_t256_a[16];
+typedef gf_t    (*gf_t256_t);
+
+void init_256_table(const gf_t g, gf_t256_t t);
+void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/crypto/aes/gf_mul_lo.h
+++ b/crypto/aes/gf_mul_lo.h
@ -0,0 +1,773 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 18/02/2014
+
+ This file provides the low level primitives needed for Galois Field 
+ operations in GF(2^128) for the four most likely field representations.
+*/
+
+#ifndef _GF_MUL_LO_H
+#define _GF_MUL_LO_H
+
+#if defined( USE_INLINING )
+#  if defined( _MSC_VER )
+#    define gf_decl __inline
+#  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#    define gf_decl static inline
+#  else
+#    define gf_decl static
+#  endif
+#endif
+
+#if 0   /* used for testing only: t1(UNIT_BITS), t2(UNIT_BITS)  */
+#  define _t1(n) bswap ## n ## _block(x, x)
+#  define  t1(n) _t1(n)
+#  define _t2(n) bswap ## n ## _block(x, x); bswap ## n ## _block(r, r) 
+#  define  t2(n) _t2(n)
+#endif
+
+#define gf_m(n,x)    gf_mulx ## n ## x
+#define gf_mulx1(x)  gf_m(1,x)
+#define gf_mulx4(x)  gf_m(4,x)
+#define gf_mulx8(x)  gf_m(8,x)
+
+#define MASK(x) ((x) * (UNIT_CAST(-1,UNIT_BITS) / 0xff))
+
+#define DATA_256(q) {\
+    q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
+    q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
+    q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
+    q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
+    q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
+    q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
+    q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
+    q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
+    q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
+    q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
+    q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
+    q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
+    q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
+    q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
+    q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
+    q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
+    q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
+    q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
+    q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
+    q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
+    q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
+    q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
+    q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
+    q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
+    q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
+    q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
+    q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
+    q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
+    q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
+    q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
+    q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
+    q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) }
+
+/*  Within the 16 bytes of the field element the top and bottom field bits
+    are within bytes as follows (bit numbers in bytes 0 from ls up) for
+    each of the four field representations supported (see gf128mul.txt):
+
+    GF_BIT   127 126 125 124 123 122 121 120     .....  7 6 5 4 3 2 1 0
+                                                  0x87  1 0 0 0 0 1 1 1
+    BL x[ 0]   7   6   5   4   3   2   1   0     x[15]  7 6 5 4 3 2 1 0
+    LL x[15]   7   6   5   4   3   2   1   0     x[ 0]  7 6 5 4 3 2 1 0
+
+    GF_BIT   120 121 122 123 124 125 126 127     .....  0 1 2 3 4 5 6 7
+                                                  0xe1  1 1 1 0 0 0 0 1
+    BB x[ 0]   7   6   5   4   3   2   1   0     x[15]  7 6 5 4 3 2 1 0
+    LB x[15]   7   6   5   4   3   2   1   0     x[ 0]  7 6 5 4 3 2 1 0
+
+    When the field element is multiplied by x^n, the high bits overflow
+    and are used to form an overflow byte. For the BL and LL modes this
+    byte has the lowest overflow bit in bit 0 whereas for the BB and LB
+    modes this bit is in bit 7.  So we have for this byte:
+
+    bit (bit n = 2^n)    7   6   5   4   3   2   1   0
+    BL and LL          x^7 x^6 x^5 x^4 x^3 x^2 x^1 x^0  
+    BB and LB          x^0 x^1 x^2 x^3 x^4 x^5 x^6 x^7  
+    
+    This byte then has to be multiplied by the low bits of the field
+    polynomial, which produces a value of 16 bits to be xored into the 
+    left shifted field value. For the BL and LL modes bit 0 gives the
+    word value 0x0087, bit 1 gives 0x010e (0x87 left shifted 1), 0x021c
+    (0x87 left shifted 2), ... For the BB and LB modes, bit 7 gives the
+    value 0x00e1, bit 6 gives 0x8070, bit 5 gives 0x4038, ... Each bit
+    in the overflow byte is expanded in this way and is xored into the
+    overall result, so each of the 256 byte values will produce a
+    corresponding word value that is computed by the gf_uint16_xor(i)
+    macros below.
+
+    These word values have to be xored into the low 16 bits of the 
+    field value. If the byte endianness of the mode matches that of
+    the architecture xoring the word value will be correct. But if
+    the mode has the opposite endianness, the word value has to be
+    xored in byte reversed order. This is done by the ord() macro.
+*/
+
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN \
+      && (defined( GF_MODE_LB ) || defined( GF_MODE_LL )) || \
+    PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN \
+      && (defined( GF_MODE_BB ) || defined( GF_MODE_BL ))
+#  define ord(hi, lo)   0x##hi##lo
+#else 
+#  define ord(hi, lo)   0x##lo##hi
+#endif
+
+#if defined( GF_MODE_BL ) || defined( GF_MODE_LL )
+
+/* field and numeric bit significance correspond */
+
+#define gf_uint16_xor(i) ( \
+    (i & 0x01 ? ord(00,87) : 0) ^ (i & 0x02 ? ord(01,0e) : 0) ^ \
+    (i & 0x04 ? ord(02,1c) : 0) ^ (i & 0x08 ? ord(04,38) : 0) ^ \
+    (i & 0x10 ? ord(08,70) : 0) ^ (i & 0x20 ? ord(10,e0) : 0) ^ \
+    (i & 0x40 ? ord(21,c0) : 0) ^ (i & 0x80 ? ord(43,80) : 0) )
+
+enum x_bit 
+{ 
+    X_0 = 0x01, X_1 = 0x02, X_2 = 0x04, X_3 = 0x08, 
+    X_4 = 0x10, X_5 = 0x20, X_6 = 0x40, X_7 = 0x80
+};
+
+#elif defined( GF_MODE_BB ) || defined( GF_MODE_LB )
+
+/* field and numeric bit significance are in reverse */
+
+#define gf_uint16_xor(i) ( \
+    (i & 0x80 ? ord(00,e1) : 0) ^ (i & 0x40 ? ord(80,70) : 0) ^ \
+    (i & 0x20 ? ord(40,38) : 0) ^ (i & 0x10 ? ord(20,1c) : 0) ^ \
+    (i & 0x08 ? ord(10,0e) : 0) ^ (i & 0x04 ? ord(08,07) : 0) ^ \
+    (i & 0x02 ? ord(84,03) : 0) ^ (i & 0x01 ? ord(c2,01) : 0) )
+
+enum x_bit 
+{ 
+    X_0 = 0x80, X_1 = 0x40, X_2 = 0x20, X_3 = 0x10, 
+    X_4 = 0x08, X_5 = 0x04, X_6 = 0x02, X_7 = 0x01
+};
+
+#else
+#error Galois Field representation has not been set
+#endif
+
+/*  gf_tab[i] holds gf_uint16_xor(i) for each of the 256 byte values i.
+    The table is static because this header defines it (rather than just
+    declaring it): each including source file gets its own private copy
+    instead of a duplicate external definition.
+*/
+static const uint16_t gf_tab[256] = DATA_256(gf_uint16_xor);
+
+/* LL Mode Galois Field operations 
+
+  x[0]     x[1]     x[2]     x[3]     x[4]     x[5]     x[6]    x[7]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+10000111 ........ ........ ........ ........ ........ ........ ........
+07....00 15....08 23....16 31....24 39....32 47....40 55....48 63....56
+  x[8]    x[9]   x[10]   x[11]   x[12]   x[13]   x[14]  x[15]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+........ ........ ........ ........ ........ ........ ........ M.......
+71....64 79....72 87....80 95....88 103...96 111..104 119..112 127..120
+*/
+
+#if UNIT_BITS == 64
+
+/* LL mode shift steps for 64-bit units (n = unit index, r = destination,
+   x = source).  Unit n also takes the bits handed over from unit n-1; unit 0
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit so that the second branch can
+   shift inside each byte - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_ll(n,r,x)   r[n] = (x[n] << 1) | (n ? x[n-1] >> 63 : 0)
+#define f4_ll(n,r,x)   r[n] = (x[n] << 4) | (n ? x[n-1] >> 60 : 0)
+#define f8_ll(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 56 : 0)
+#else
+#define f1_ll(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
+                            | (n ? x[n-1] << 49 : 0)) & MASK(0x01))
+#define f4_ll(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
+                            | (n ? x[n-1] << 52 : 0)) & MASK(0x0f))
+#define f8_ll(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 56 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_ll on both 64-bit units).  The bit of x that
+   the move pushes out selects the gf_tab value XORed into unit 0 of r. */
+gf_decl void gf_mulx1_ll(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[1] >> 63) & 0x01];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] >> 7) & 0x01])) << 48;
+#endif
+
+    /* unit 1 before unit 0: unit 1 reads x[0], which must still be unmodified
+       if r and x are the same buffer */
+    rep2_d2(f1_ll, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[0] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_ll on both 64-bit units).  The four
+   bits pushed out select the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx4_ll(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[1] >> 60) & 0x0f];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] >> 4) & 0x0f])) << 48;
+#endif
+
+    /* descending order keeps x[0] intact until unit 1 has read it */
+    rep2_d2(f4_ll, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_ll on both 64-bit units).  The byte
+   pushed out selects the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx8_ll(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[UNIT_PTR(x)[1] >> 56];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[1] & 0xff])) << 48;
+#endif
+
+    /* descending order keeps x[0] intact until unit 1 has read it */
+    rep2_d2(f8_ll, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+#elif UNIT_BITS == 32
+
+/* LL mode shift steps for 32-bit units (n = unit index, r = destination,
+   x = source).  Unit n also takes the bits handed over from unit n-1; unit 0
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_ll(n,r,x)   r[n] = (x[n] << 1) | (n ? x[n-1] >> 31 : 0)
+#define f4_ll(n,r,x)   r[n] = (x[n] << 4) | (n ? x[n-1] >> 28 : 0)
+#define f8_ll(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 24 : 0)
+#else
+#define f1_ll(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
+                            | (n ? x[n-1] << 17 : 0)) & MASK(0x01))
+#define f4_ll(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
+                            | (n ? x[n-1] << 20 : 0)) & MASK(0x0f))
+#define f8_ll(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 24 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_ll on all four 32-bit units).  The bit of x
+   that the move pushes out selects the gf_tab value XORed into unit 0 of r. */
+gf_decl void gf_mulx1_ll(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[3] >> 31) & 0x01];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] >> 7) & 0x01])) << 16;
+#endif
+
+    /* highest unit first: each unit reads its lower neighbour, which must
+       still be unmodified if r and x are the same buffer */
+    rep2_d4(f1_ll, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[0] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_ll on all four 32-bit units).  The
+   four bits pushed out select the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx4_ll(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[3] >> 28) & 0x0f];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] >> 4) & 0x0f])) << 16;
+#endif
+
+    /* descending order: every unit reads its lower neighbour before that
+       neighbour is rewritten */
+    rep2_d4(f4_ll, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_ll on all four 32-bit units).  The
+   byte pushed out selects the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx8_ll(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[UNIT_PTR(x)[3] >> 24];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[3] & 0xff])) << 16;
+#endif
+
+    /* descending order: every unit reads its lower neighbour before that
+       neighbour is rewritten */
+    rep2_d4(f8_ll, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+#else
+
+/* LL mode shift steps for byte units: byte n also takes the top bit(s) of
+   byte n-1; byte 0 takes nothing.  There is no f8 form here because
+   gf_mulx8_ll moves whole bytes with memmove instead. */
+#define f1_ll(n,r,x)   r[n] = (x[n] << 1) | (n ? x[n-1] >> 7 : 0)
+#define f4_ll(n,r,x)   r[n] = (x[n] << 4) | (n ? x[n-1] >> 4 : 0)
+
+/* r = x moved on by one bit (f1_ll on all sixteen bytes).  The top bit of
+   byte 15 selects the gf_tab value; one byte of it is XORed into r[0]. */
+gf_decl void gf_mulx1_ll(gf_t r, const gf_t x)
+{
+    const uint16_t fold = gf_tab[(UNIT_PTR(x)[15] >> 7) & 0x01];
+
+    /* byte 15 first: each byte reads its lower neighbour, which must still
+       be unmodified if r and x are the same buffer */
+    rep2_d16(f1_ll, UNIT_PTR(r), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(r)[0] ^= fold & 0xff;
+#else
+    UNIT_PTR(r)[0] ^= fold >> 8;
+#endif
+}
+
+/* In place: x moved on by four bits (f4_ll on all sixteen bytes).  The top
+   nibble of byte 15 selects the gf_tab value, whose two bytes are XORed into
+   bytes 0 and 1.  Both bytes hold freshly shifted data at that point, so both
+   must be combined with XOR; the big-endian branch previously overwrote
+   byte 0 and lost the bits shifted into it. */
+gf_decl void gf_mulx4_ll(gf_t x)
+{   uint16_t _tt;
+    _tt = gf_tab[(UNIT_PTR(x)[15] >> 4) & 0x0f];
+    /* byte 15 first so each byte reads its lower neighbour before it changes */
+    rep2_d16(f4_ll, UNIT_PTR(x), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[1] ^= _tt >> 8;
+    UNIT_PTR(x)[0] ^= _tt & 0xff;
+#else
+    UNIT_PTR(x)[1] ^= _tt & 0xff;
+    UNIT_PTR(x)[0] ^= _tt >> 8;
+#endif
+}
+
+/* In place: x moved on by one whole byte.  Byte 15 selects the gf_tab value;
+   after the memmove byte 0 is stale, so it is overwritten with one byte of
+   that value while the other byte is XORed into byte 1. */
+gf_decl void gf_mulx8_ll(gf_t x)
+{
+    const uint16_t fold = gf_tab[UNIT_PTR(x)[15]];
+
+    memmove(UNIT_PTR(x) + 1, UNIT_PTR(x), 15);
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[0] =  fold & 0xff;
+    UNIT_PTR(x)[1] ^= fold >> 8;
+#else
+    UNIT_PTR(x)[0] =  fold >> 8;
+    UNIT_PTR(x)[1] ^= fold & 0xff;
+#endif
+}
+
+#endif
+
+/* BL Mode Galois Field operations 
+
+  x[0]     x[1]     x[2]     x[3]     x[4]     x[5]     x[6]     x[7]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+M....... ........ ........ ........ ........ ........ ........ ........
+127..120 119..112 111..104 103...96 95....88 87....80 79....72 71....64
+  x[8]     x[9]    x[10]    x[11]    x[12]    x[13]    x[14]    x[15]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+........ ........ ........ ........ ........ ........ ........ 10000111
+63....56 55....48 47....40 39....32 31....24 23....16 15....08 07....00
+*/
+
+#if UNIT_BITS == 64
+
+/* BL mode shift steps for 64-bit units (n = unit index, r = destination,
+   x = source).  Unit 0 also takes the bits handed over from unit 1; unit 1
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit so that the first branch can
+   shift inside each byte - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_bl(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
+                            | (!n ? x[n+1] << 49 : 0)) & MASK(0x01))
+#define f4_bl(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
+                            | (!n ? x[n+1] << 52 : 0)) & MASK(0x0f))
+#define f8_bl(n,r,x)   r[n] = (x[n] >> 8) | (!n ? x[n+1] << 56 : 0)
+#else
+#define f1_bl(n,r,x)   r[n] = (x[n] << 1) | (!n ? x[n+1] >> 63 : 0)
+#define f4_bl(n,r,x)   r[n] = (x[n] << 4) | (!n ? x[n+1] >> 60 : 0)
+#define f8_bl(n,r,x)   r[n] = (x[n] << 8) | (!n ? x[n+1] >> 56 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_bl on both 64-bit units).  The bit of x that
+   the move pushes out selects the gf_tab value XORed into unit 1 of r. */
+gf_decl void gf_mulx1_bl(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01])) << 48;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 63) & 0x01];
+#endif
+
+    /* unit 0 before unit 1: unit 0 reads x[1], which must still be unmodified
+       if r and x are the same buffer */
+    rep2_u2(f1_bl, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[1] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_bl on both 64-bit units).  The four
+   bits pushed out select the gf_tab value XORed into unit 1. */
+gf_decl void gf_mulx4_bl(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f])) << 48;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 60) & 0x0f];
+#endif
+
+    /* ascending order keeps x[1] intact until unit 0 has read it */
+    rep2_u2(f4_bl, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[1] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_bl on both 64-bit units).  The byte
+   pushed out selects the gf_tab value XORed into unit 1. */
+gf_decl void gf_mulx8_bl(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 48;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 56) & 0xff];
+#endif
+
+    /* ascending order keeps x[1] intact until unit 0 has read it */
+    rep2_u2(f8_bl, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[1] ^= fold;
+}
+
+#elif UNIT_BITS == 32
+
+/* BL mode shift steps for 32-bit units (n = unit index, r = destination,
+   x = source).  Unit n also takes the bits handed over from unit n+1; unit 3
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_bl(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
+                            | (n < 3 ? x[n+1] << 17 : 0)) & MASK(0x01))
+#define f4_bl(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
+                            | (n < 3 ? x[n+1] << 20 : 0)) & MASK(0x0f))
+#define f8_bl(n,r,x)   r[n] = (x[n] >> 8) | (n < 3 ? x[n+1] << 24 : 0)
+#else
+#define f1_bl(n,r,x)   r[n] = (x[n] << 1) | (n < 3 ? x[n+1] >> 31 : 0)
+#define f4_bl(n,r,x)   r[n] = (x[n] << 4) | (n < 3 ? x[n+1] >> 28 : 0)
+#define f8_bl(n,r,x)   r[n] = (x[n] << 8) | (n < 3 ? x[n+1] >> 24 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_bl on all four 32-bit units).  The bit of x
+   that the move pushes out selects the gf_tab value XORed into unit 3 of r. */
+gf_decl void gf_mulx1_bl(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01])) << 16;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 31) & 0x01];
+#endif
+
+    /* lowest unit first: each unit reads its upper neighbour, which must
+       still be unmodified if r and x are the same buffer */
+    rep2_u4(f1_bl, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[3] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_bl on all four 32-bit units).  The
+   four bits pushed out select the gf_tab value XORed into unit 3. */
+gf_decl void gf_mulx4_bl(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f])) << 16;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 28) & 0x0f];
+#endif
+
+    /* ascending order: every unit reads its upper neighbour before that
+       neighbour is rewritten */
+    rep2_u4(f4_bl, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[3] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_bl on all four 32-bit units).  The
+   byte pushed out selects the gf_tab value XORed into unit 3. */
+gf_decl void gf_mulx8_bl(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 16;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 24) & 0xff];
+#endif
+
+    /* ascending order: every unit reads its upper neighbour before that
+       neighbour is rewritten */
+    rep2_u4(f8_bl, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[3] ^= fold;
+}
+
+#else
+
+/* BL mode shift steps for byte units: byte n also takes the top bit(s) of
+   byte n+1; byte 15 takes nothing.  There is no f8 form here because
+   gf_mulx8_bl moves whole bytes with memmove instead. */
+#define f1_bl(n,r,x)   r[n] = (x[n] << 1) | (n < 15 ? x[n+1] >> 7 : 0)
+#define f4_bl(n,r,x)   r[n] = (x[n] << 4) | (n < 15 ? x[n+1] >> 4 : 0)
+
+/* r = x moved on by one bit (f1_bl on all sixteen bytes).  The top bit of
+   byte 0 selects the gf_tab value; one byte of it is XORed into r[15]. */
+gf_decl void gf_mulx1_bl(gf_t r, const gf_t x)
+{
+    const uint16_t fold = gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01];
+
+    /* byte 0 first: each byte reads its upper neighbour, which must still be
+       unmodified if r and x are the same buffer */
+    rep2_u16(f1_bl, UNIT_PTR(r), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(r)[15] ^= fold >> 8;
+#else
+    UNIT_PTR(r)[15] ^= fold & 0xff;
+#endif
+}
+
+/* In place: x moved on by four bits (f4_bl on all sixteen bytes).  The top
+   nibble of byte 0 selects the gf_tab value, whose two bytes are XORed into
+   bytes 14 and 15.  Both bytes hold freshly shifted data at that point, so
+   both must be combined with XOR; the big-endian branch previously overwrote
+   byte 15 and lost the bits shifted into it. */
+gf_decl void gf_mulx4_bl(gf_t x)
+{   uint16_t _tt;
+    _tt = gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f];
+    /* byte 0 first so each byte reads its upper neighbour before it changes */
+    rep2_u16(f4_bl, UNIT_PTR(x), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[14] ^= _tt & 0xff;
+    UNIT_PTR(x)[15] ^= _tt >> 8;
+#else
+    UNIT_PTR(x)[14] ^= _tt >> 8;
+    UNIT_PTR(x)[15] ^= _tt & 0xff;
+#endif
+}
+
+/* In place: x moved on by one whole byte.  Byte 0 selects the gf_tab value;
+   after the memmove byte 15 is stale, so it is overwritten with one byte of
+   that value while the other byte is XORed into byte 14. */
+gf_decl void gf_mulx8_bl(gf_t x)
+{
+    const uint16_t fold = gf_tab[UNIT_PTR(x)[0]];
+
+    memmove(UNIT_PTR(x), UNIT_PTR(x) + 1, 15);
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[15]  = fold >> 8;
+    UNIT_PTR(x)[14] ^= fold & 0xff;
+#else
+    UNIT_PTR(x)[15]  = fold & 0xff;
+    UNIT_PTR(x)[14] ^= fold >> 8;
+#endif
+}
+
+#endif
+
+/* LB Mode Galois Field operations 
+
+   x[0]    x[1]     x[2]     x[3]     x[4]     x[5]     x[6]     x[7]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+11100001 ........ ........ ........ ........ ........ ........ ........
+00....07 08....15 16....23 24....31 32....39 40....47 48....55 56....63
+   x[8]    x[9]    x[10]    x[11]    x[12]    x[13]    x[14]    x[15]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+........ ........ ........ ........ ........ ........ ........ .......M
+64....71 72....79 80....87 88....95 96...103 104..111 112..119 120..127
+*/
+
+#if UNIT_BITS == 64
+
+/* LB mode shift steps for 64-bit units (n = unit index, r = destination,
+   x = source).  Unit n also takes the bits handed over from unit n-1; unit 0
+   takes nothing.  Every step writes r[n]; the big-endian f8_lb used to write
+   x[n], which only worked because its callers pass the same buffer twice.
+   NOTE(review): MASK() is defined elsewhere; it appears to repeat its byte
+   argument across the unit - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_lb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
+                            | (n ? x[n-1] >> 49 : 0)) & MASK(0x80))
+#define f4_lb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
+                            | (n ? x[n-1] >> 52 : 0)) & MASK(0xf0))
+#define f8_lb(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 56 : 0)
+#else
+#define f1_lb(n,r,x)   r[n] = (x[n] >> 1) | (n ? x[n-1] << 63 : 0)
+#define f4_lb(n,r,x)   r[n] = (x[n] >> 4) | (n ? x[n-1] << 60 : 0)
+#define f8_lb(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 56 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_lb on both 64-bit units).  The bit of x that
+   the move pushes out selects the gf_tab value XORed into unit 0 of r. */
+gf_decl void gf_mulx1_lb(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[1] >> 49) & MASK(0x80)];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] << 7) & 0xff])) << 48;
+#endif
+
+    /* unit 1 before unit 0: unit 1 reads x[0], which must still be unmodified
+       if r and x are the same buffer */
+    rep2_d2(f1_lb, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[0] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_lb on both 64-bit units).  The four
+   bits pushed out select the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx4_lb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[1] >> 52) & MASK(0xf0)];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] << 4) & 0xff])) << 48;
+#endif
+
+    /* descending order keeps x[0] intact until unit 1 has read it */
+    rep2_d2(f4_lb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_lb on both 64-bit units).  The byte
+   pushed out selects the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx8_lb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[UNIT_PTR(x)[1] >> 56];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[1] & 0xff])) << 48;
+#endif
+
+    /* descending order keeps x[0] intact until unit 1 has read it */
+    rep2_d2(f8_lb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+#elif UNIT_BITS == 32
+
+/* LB mode shift steps for 32-bit units (n = unit index, r = destination,
+   x = source).  Unit n also takes the bits handed over from unit n-1; unit 0
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_lb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
+                            | (n ? x[n-1] >> 17 : 0)) & MASK(0x80))
+#define f4_lb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
+                            | (n ? x[n-1] >> 20 : 0)) & MASK(0xf0))
+#define f8_lb(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 24 : 0)
+#else
+#define f1_lb(n,r,x)   r[n] = (x[n] >> 1) | (n ? x[n-1] << 31 : 0)
+#define f4_lb(n,r,x)   r[n] = (x[n] >> 4) | (n ? x[n-1] << 28 : 0)
+#define f8_lb(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 24 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_lb on all four 32-bit units).  The bit of x
+   that the move pushes out selects the gf_tab value XORed into unit 0 of r. */
+gf_decl void gf_mulx1_lb(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[3] >> 17) & MASK(0x80)];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] << 7) & 0xff])) << 16;
+#endif
+
+    /* highest unit first: each unit reads its lower neighbour, which must
+       still be unmodified if r and x are the same buffer */
+    rep2_d4(f1_lb, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[0] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_lb on all four 32-bit units).  The
+   four bits pushed out select the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx4_lb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[3] >> 20) & MASK(0xf0)];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] << 4) & 0xff])) << 16;
+#endif
+
+    /* descending order: every unit reads its lower neighbour before that
+       neighbour is rewritten */
+    rep2_d4(f4_lb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_lb on all four 32-bit units).  The
+   byte pushed out selects the gf_tab value XORed into unit 0. */
+gf_decl void gf_mulx8_lb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = gf_tab[UNIT_PTR(x)[3] >> 24];
+#else
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[3] & 0xff])) << 16;
+#endif
+
+    /* descending order: every unit reads its lower neighbour before that
+       neighbour is rewritten */
+    rep2_d4(f8_lb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[0] ^= fold;
+}
+
+#else
+
+/* LB mode shift steps for byte units: byte n also takes the low bit(s) of
+   byte n-1; byte 0 takes nothing.  There is no f8 form here because
+   gf_mulx8_lb moves whole bytes with memmove instead. */
+#define f1_lb(n,r,x)   r[n] = (x[n] >> 1) | (n ? x[n-1] << 7 : 0)
+#define f4_lb(n,r,x)   r[n] = (x[n] >> 4) | (n ? x[n-1] << 4 : 0)
+
+/* r = x moved on by one bit (f1_lb on all sixteen bytes).  The low bit of
+   byte 15 selects the gf_tab value; one byte of it is XORed into r[0]. */
+gf_decl void gf_mulx1_lb(gf_t r, const gf_t x)
+{
+    const uint16_t fold = gf_tab[(UNIT_PTR(x)[15] << 7) & 0x80];
+
+    /* byte 15 first: each byte reads its lower neighbour, which must still
+       be unmodified if r and x are the same buffer */
+    rep2_d16(f1_lb, UNIT_PTR(r), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(r)[0] ^= fold;
+#else
+    UNIT_PTR(r)[0] ^= fold >> 8;
+#endif
+}
+
+/* In place: x moved on by four bits (f4_lb on all sixteen bytes).  The low
+   nibble of byte 15 selects the gf_tab value, whose two bytes are XORed into
+   bytes 0 and 1. */
+gf_decl void gf_mulx4_lb(gf_t x)
+{
+    const uint16_t fold = gf_tab[(UNIT_PTR(x)[15] << 4) & 0xf0];
+
+    /* byte 15 first so each byte reads its lower neighbour before it changes */
+    rep2_d16(f4_lb, UNIT_PTR(x), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[0] ^= fold & 0xff;
+    UNIT_PTR(x)[1] ^= fold >> 8;
+#else
+    UNIT_PTR(x)[0] ^= fold >> 8;
+    UNIT_PTR(x)[1] ^= fold & 0xff;
+#endif
+}
+
+/* In place: x moved on by one whole byte.  Byte 15 selects the gf_tab value;
+   after the memmove byte 0 is stale, so it is overwritten with one byte of
+   that value while the other byte is XORed into byte 1. */
+gf_decl void gf_mulx8_lb(gf_t x)
+{
+    const uint16_t fold = gf_tab[UNIT_PTR(x)[15]];
+
+    memmove(UNIT_PTR(x) + 1, UNIT_PTR(x), 15);
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[0] = fold & 0xff;
+    UNIT_PTR(x)[1] ^= fold >> 8;
+#else
+    UNIT_PTR(x)[0] = fold >> 8;
+    UNIT_PTR(x)[1] ^= fold & 0xff;
+#endif
+}
+
+#endif
+
+/* BB Mode Galois Field operations 
+
+  x[0]     x[1]     x[2]     x[3]     x[4]     x[5]     x[6]     x[7]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+.......M ........ ........ ........ ........ ........ ........ ........
+120..127 112..119 104..111 96...103 88....95 80....87 72....79 64....71
+  x[8]     x[9]     x[10]    x[11]    x[12]    x[13]    x[14]   x[15]
+ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
+........ ........ ........ ........ ........ ........ ........ 11100001
+56....63 48....55 40....47 32....39 24....31 16....23 08....15 00....07
+*/
+
+#if UNIT_BITS == 64
+
+/* BB mode shift steps for 64-bit units (n = unit index, r = destination,
+   x = source).  Unit 0 also takes the bits handed over from unit 1; unit 1
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit so that the second branch can
+   shift inside each byte - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_bb(n,r,x)   r[n] = (x[n] >> 1) | (!n ? x[n+1] << 63 : 0)
+#define f4_bb(n,r,x)   r[n] = (x[n] >> 4) | (!n ? x[n+1] << 60 : 0)
+#define f8_bb(n,r,x)   r[n] = (x[n] >> 8) | (!n ? x[n+1] << 56 : 0)
+#else
+#define f1_bb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
+                            | (!n ? x[n+1] >> 49 : 0)) & MASK(0x80))
+#define f4_bb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
+                            | (!n ? x[n+1] >> 52 : 0)) & MASK(0xf0))
+#define f8_bb(n,r,x)   r[n] = (x[n] << 8) | (!n ? x[n+1] >> 56 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_bb on both 64-bit units).  The bit of x that
+   the move pushes out selects the gf_tab value XORed into unit 1 of r. */
+gf_decl void gf_mulx1_bb(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = (( gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80])) << 48;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 49) & 0x80];
+#endif
+
+    /* unit 0 before unit 1: unit 0 reads x[1], which must still be unmodified
+       if r and x are the same buffer */
+    rep2_u2(f1_bb, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[1] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_bb on both 64-bit units).  The four
+   bits pushed out select the gf_tab value XORed into unit 1. */
+gf_decl void gf_mulx4_bb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0])) << 48;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 52) & 0xf0];
+#endif
+
+    /* ascending order keeps x[1] intact until unit 0 has read it */
+    rep2_u2(f4_bb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[1] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_bb on both 64-bit units).  The byte
+   pushed out selects the gf_tab value XORed into unit 1. */
+gf_decl void gf_mulx8_bb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 48;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 56) & 0xff];
+#endif
+
+    /* ascending order keeps x[1] intact until unit 0 has read it */
+    rep2_u2(f8_bb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[1] ^= fold;
+}
+
+#elif UNIT_BITS == 32
+
+/* BB mode shift steps for 32-bit units (n = unit index, r = destination,
+   x = source).  Unit n also takes the bits handed over from unit n+1; unit 3
+   takes nothing.  NOTE(review): MASK() is defined elsewhere; it appears to
+   repeat its byte argument across the unit - confirm. */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+#define f1_bb(n,r,x)   r[n] = (x[n] >> 1) | (n < 3 ? x[n+1] << 31 : 0)
+#define f4_bb(n,r,x)   r[n] = (x[n] >> 4) | (n < 3 ? x[n+1] << 28 : 0)
+#define f8_bb(n,r,x)   r[n] = (x[n] >> 8) | (n < 3 ? x[n+1] << 24 : 0)
+#else
+#define f1_bb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
+                            | (n < 3 ? x[n+1] >> 17 : 0)) & MASK(0x80))
+#define f4_bb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
+                            | (n < 3 ? x[n+1] >> 20 : 0)) & MASK(0xf0))
+#define f8_bb(n,r,x)   r[n] = (x[n] << 8) | (n < 3 ? x[n+1] >> 24 : 0)
+#endif
+
+/* r = x moved on by one bit (f1_bb on all four 32-bit units).  The bit of x
+   that the move pushes out selects the gf_tab value XORed into unit 3 of r. */
+gf_decl void gf_mulx1_bb(gf_t r, const gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80])) << 16;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 17) & 0x80];
+#endif
+
+    /* lowest unit first: each unit reads its upper neighbour, which must
+       still be unmodified if r and x are the same buffer */
+    rep2_u4(f1_bb, UNIT_PTR(r), UNIT_PTR(x));
+    UNIT_PTR(r)[3] ^= fold;
+}
+
+/* In place: x moved on by four bits (f4_bb on all four 32-bit units).  The
+   four bits pushed out select the gf_tab value XORed into unit 3. */
+gf_decl void gf_mulx4_bb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0])) << 16;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 20) & 0xf0];
+#endif
+
+    /* ascending order: every unit reads its upper neighbour before that
+       neighbour is rewritten */
+    rep2_u4(f4_bb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[3] ^= fold;
+}
+
+/* In place: x moved on by eight bits (f8_bb on all four 32-bit units).  The
+   byte pushed out selects the gf_tab value XORed into unit 3. */
+gf_decl void gf_mulx8_bb(gf_t x)
+{
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 16;
+#else
+    const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[0] >> 24) & 0xff];
+#endif
+
+    /* ascending order: every unit reads its upper neighbour before that
+       neighbour is rewritten */
+    rep2_u4(f8_bb, UNIT_PTR(x), UNIT_PTR(x));
+    UNIT_PTR(x)[3] ^= fold;
+}
+
+#else
+
+/* BB mode shift steps for byte units: byte n also takes the low bit(s) of
+   byte n+1; byte 15 takes nothing.  There is no f8 form here because
+   gf_mulx8_bb moves whole bytes with memmove instead. */
+#define f1_bb(n,r,x)   r[n] = (x[n] >> 1) | (n < 15 ? x[n+1] << 7 : 0)
+#define f4_bb(n,r,x)   r[n] = (x[n] >> 4) | (n < 15 ? x[n+1] << 4 : 0)
+
+/* r = x moved on by one bit (f1_bb on all sixteen bytes).  The low bit of
+   byte 0 selects the gf_tab value; one byte of it is XORed into r[15]. */
+gf_decl void gf_mulx1_bb(gf_t r, const gf_t x)
+{
+    const uint16_t fold = gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80];
+
+    /* byte 0 first: each byte reads its upper neighbour, which must still be
+       unmodified if r and x are the same buffer */
+    rep2_u16(f1_bb, UNIT_PTR(r), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(r)[15] ^= fold >> 8;
+#else
+    UNIT_PTR(r)[15] ^= fold;
+#endif
+}
+
+/* In place: x moved on by four bits (f4_bb on all sixteen bytes).  The low
+   nibble of byte 0 selects the gf_tab value, whose two bytes are XORed into
+   bytes 14 and 15. */
+gf_decl void gf_mulx4_bb(gf_t x)
+{
+    const uint16_t fold = gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0];
+
+    /* byte 0 first so each byte reads its upper neighbour before it changes */
+    rep2_u16(f4_bb, UNIT_PTR(x), UNIT_PTR(x));
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[15] ^= fold >> 8;
+    UNIT_PTR(x)[14] ^= fold & 0xff;
+#else
+    UNIT_PTR(x)[15] ^= fold & 0xff;
+    UNIT_PTR(x)[14] ^= fold >> 8;
+#endif
+}
+
+/* In place: x moved on by one whole byte.  Byte 0 selects the gf_tab value;
+   after the memmove byte 15 is stale, so it is overwritten with one byte of
+   that value while the other byte is XORed into byte 14. */
+gf_decl void gf_mulx8_bb(gf_t x)
+{
+    const uint16_t fold = gf_tab[UNIT_PTR(x)[0]];
+
+    memmove(UNIT_PTR(x), UNIT_PTR(x) + 1, 15);
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+    UNIT_PTR(x)[15] = fold >> 8;
+    UNIT_PTR(x)[14] ^= fold & 0xff;
+#else
+    UNIT_PTR(x)[15] = fold & 0xff;
+    UNIT_PTR(x)[14] ^= fold >> 8;
+#endif
+}
+
+#endif
+
+#endif
--- a/crypto/aes/mode_hdr.h
+++ b/crypto/aes/mode_hdr.h
@ -0,0 +1,329 @@
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2014, Brian Gladman, Worcester, UK. All rights reserved.
+
+The redistribution and use of this software (with or without changes)
+is allowed without the payment of fees or royalties provided that:
+
+  source code distributions include the above copyright notice, this
+  list of conditions and the following disclaimer;
+
+  binary distributions include the above copyright notice, this list
+  of conditions and the following disclaimer in their documentation.
+
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its operation, including, but not limited to, correctness
+and fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 18/02/2014
+
+This header file is an INTERNAL file which supports mode implementation
+*/
+
+#ifndef _MODE_HDR_H
+#define _MODE_HDR_H
+
+#include <string.h>
+#include <limits.h>
+
+#include "brg_endian.h"
+
+/*  This define sets the units in which buffers are processed.  This code
+    can provide significant speed gains if buffers can be processed in
+    32 or 64 bit chunks rather than in bytes.  This define sets the units
+    in which buffers will be accessed if possible
+*/
+/* Default unit size, used only when the build has not already defined
+   UNIT_BITS: big-endian platforms take whichever arm of the inner '#if 0 /
+   #elif 1' pair is enabled (64 as written), _WIN64 builds take 64, and every
+   other build takes 32. */
+#if !defined( UNIT_BITS )
+#  if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+#    if 0
+#      define UNIT_BITS  32
+#    elif 1
+#      define UNIT_BITS  64
+#    endif
+#  elif defined( _WIN64 )
+#    define UNIT_BITS 64
+#  else
+#    define UNIT_BITS 32
+#  endif
+#endif
+
+/* 64-bit units need a 64-bit integer type; NEED_UINT_64T is set before
+   brg_types.h is included (presumably that header acts on it - confirm) */
+#if UNIT_BITS == 64 && !defined( NEED_UINT_64T )
+#  define NEED_UINT_64T
+#endif
+
+#include "brg_types.h"
+
+/*  Use of inlines is preferred but code blocks can also be expanded inline
+    using 'defines'.  But the latter approach will typically generate a LOT
+    of code and is not recommended. 
+*/
+/* change the '1' to '0' to stop this header from enabling USE_INLINING itself;
+   note that mh_decl below is only defined when USE_INLINING is set */
+#if 1 && !defined( USE_INLINING )
+#  define USE_INLINING
+#endif
+
+/* With Microsoft compilers the rotate and byte swap helpers map onto compiler
+   intrinsics, which also suppresses the portable definitions further down
+   (they are guarded by !defined).  rotr64 must map to _rotr64: it was
+   previously mapped to _rotl64 and so rotated in the wrong direction. */
+#if defined( _MSC_VER )
+#  if _MSC_VER >= 1400
+#    include <stdlib.h>
+#    include <intrin.h>
+#    pragma intrinsic(memset)
+#    pragma intrinsic(memcpy)
+#    define rotl32        _rotl
+#    define rotr32        _rotr
+#    define rotl64        _rotl64
+#    define rotr64        _rotr64
+#    define bswap_16(x)   _byteswap_ushort(x)
+#    define bswap_32(x)   _byteswap_ulong(x)
+#    define bswap_64(x)   _byteswap_uint64(x)
+#  else
+#    define rotl32 _lrotl
+#    define rotr32 _lrotr
+#  endif
+#endif
+
+/* mh_decl is the declaration prefix placed on every helper function in this
+   header.  It is left undefined when USE_INLINING is not set, in which case
+   the function definitions below would not compile. */
+#if defined( USE_INLINING )
+#  if defined( _MSC_VER )
+#    define mh_decl __inline
+#  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#    define mh_decl static inline
+#  else
+#    define mh_decl static
+#  endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Casting helpers layered on UPTR_CAST / UNIT_CAST, which are defined outside
+   this header: the number in each name is the width passed through, and the
+   UNIT_ forms pass UNIT_BITS.  The _PTR forms are used below as pointers that
+   are indexed; the _VAL forms are applied to computed values. */
+#define  UI8_PTR(x)     UPTR_CAST(x,  8)
+#define UI16_PTR(x)     UPTR_CAST(x, 16)
+#define UI32_PTR(x)     UPTR_CAST(x, 32)
+#define UI64_PTR(x)     UPTR_CAST(x, 64)
+#define UNIT_PTR(x)     UPTR_CAST(x, UNIT_BITS)
+
+#define  UI8_VAL(x)     UNIT_CAST(x,  8)
+#define UI16_VAL(x)     UNIT_CAST(x, 16)
+#define UI32_VAL(x)     UNIT_CAST(x, 32)
+#define UI64_VAL(x)     UNIT_CAST(x, 64)
+#define UNIT_VAL(x)     UNIT_CAST(x, UNIT_BITS)
+
+/* BUF_INC is the unit size in bytes; BUF_ADRMASK is that size minus one */
+#define BUF_INC          (UNIT_BITS >> 3)
+#define BUF_ADRMASK     ((UNIT_BITS >> 3) - 1)
+
+/* Unrolled repetition of a per-index macro f.  The digit after 'rep' is the
+   number of buffer arguments (rep2: r,x; rep3: r,x,y plus a cast/convert
+   macro c); 'u' visits indices 0 upwards, 'd' visits them from the top down;
+   the trailing number is the index count (2, 4 or 16).  Each expansion is a
+   plain sequence of statements separated by ';' with no enclosing block, so
+   these must not be used as the unbraced body of an if/for/while. */
+#define rep2_u2(f,r,x)    f( 0,r,x); f( 1,r,x) 
+#define rep2_u4(f,r,x)    f( 0,r,x); f( 1,r,x); f( 2,r,x); f( 3,r,x) 
+#define rep2_u16(f,r,x)   f( 0,r,x); f( 1,r,x); f( 2,r,x); f( 3,r,x); \
+                          f( 4,r,x); f( 5,r,x); f( 6,r,x); f( 7,r,x); \
+                          f( 8,r,x); f( 9,r,x); f(10,r,x); f(11,r,x); \
+                          f(12,r,x); f(13,r,x); f(14,r,x); f(15,r,x)
+
+#define rep2_d2(f,r,x)    f( 1,r,x); f( 0,r,x) 
+#define rep2_d4(f,r,x)    f( 3,r,x); f( 2,r,x); f( 1,r,x); f( 0,r,x) 
+#define rep2_d16(f,r,x)   f(15,r,x); f(14,r,x); f(13,r,x); f(12,r,x); \
+                          f(11,r,x); f(10,r,x); f( 9,r,x); f( 8,r,x); \
+                          f( 7,r,x); f( 6,r,x); f( 5,r,x); f( 4,r,x); \
+                          f( 3,r,x); f( 2,r,x); f( 1,r,x); f( 0,r,x)
+
+#define rep3_u2(f,r,x,y,c)  f( 0,r,x,y,c); f( 1,r,x,y,c) 
+#define rep3_u4(f,r,x,y,c)  f( 0,r,x,y,c); f( 1,r,x,y,c); f( 2,r,x,y,c); f( 3,r,x,y,c) 
+#define rep3_u16(f,r,x,y,c) f( 0,r,x,y,c); f( 1,r,x,y,c); f( 2,r,x,y,c); f( 3,r,x,y,c); \
+                            f( 4,r,x,y,c); f( 5,r,x,y,c); f( 6,r,x,y,c); f( 7,r,x,y,c); \
+                            f( 8,r,x,y,c); f( 9,r,x,y,c); f(10,r,x,y,c); f(11,r,x,y,c); \
+                            f(12,r,x,y,c); f(13,r,x,y,c); f(14,r,x,y,c); f(15,r,x,y,c)
+
+#define rep3_d2(f,r,x,y,c)  f( 1,r,x,y,c); f( 0,r,x,y,c) 
+#define rep3_d4(f,r,x,y,c)  f( 3,r,x,y,c); f( 2,r,x,y,c); f( 1,r,x,y,c); f( 0,r,x,y,c) 
+#define rep3_d16(f,r,x,y,c) f(15,r,x,y,c); f(14,r,x,y,c); f(13,r,x,y,c); f(12,r,x,y,c); \
+                            f(11,r,x,y,c); f(10,r,x,y,c); f( 9,r,x,y,c); f( 8,r,x,y,c); \
+                            f( 7,r,x,y,c); f( 6,r,x,y,c); f( 5,r,x,y,c); f( 4,r,x,y,c); \
+                            f( 3,r,x,y,c); f( 2,r,x,y,c); f( 1,r,x,y,c); f( 0,r,x,y,c)
+
+/* function pointers might be used for fast XOR operations; this type matches
+   the signatures of xor_block and xor_block_aligned defined in this header */
+
+typedef void (*xor_function)(void* r, const void* p, const void* q);
+
+/* left and right rotates on 32 and 64 bit variables */
+
+#if !defined( rotl32 )  /* NOTE: the rotate count is reduced modulo 32 */
+/* Rotate x left by n bits.  Both shift counts are masked to 0..31 so that the
+   boundary counts n == 0 and n == 32 (each of which returns x) never shift by
+   the full type width, which is undefined in C. */
+mh_decl uint32_t rotl32(uint32_t x, int n)
+{
+    const unsigned int s = (unsigned int)n & 31u;
+
+    return (uint32_t)((x << s) | (x >> ((32u - s) & 31u)));
+}
+#endif
+
+#if !defined( rotr32 )  /* NOTE: the rotate count is reduced modulo 32 */
+/* Rotate x right by n bits.  Both shift counts are masked to 0..31 so that
+   the boundary counts n == 0 and n == 32 (each of which returns x) never
+   shift by the full type width, which is undefined in C. */
+mh_decl uint32_t rotr32(uint32_t x, int n)
+{
+    const unsigned int s = (unsigned int)n & 31u;
+
+    return (uint32_t)((x >> s) | (x << ((32u - s) & 31u)));
+}
+#endif
+
+#if ( UNIT_BITS == 64 ) && !defined( rotl64 )  /* NOTE: the rotate count is reduced modulo 64 */
+/* Rotate x left by n bits.  Both shift counts are masked to 0..63 so that the
+   boundary counts n == 0 and n == 64 (each of which returns x) never shift by
+   the full type width, which is undefined in C. */
+mh_decl uint64_t rotl64(uint64_t x, int n)
+{
+    const unsigned int s = (unsigned int)n & 63u;
+
+    return (x << s) | (x >> ((64u - s) & 63u));
+}
+#endif
+
+#if ( UNIT_BITS == 64 ) && !defined( rotr64 )  /* NOTE: the rotate count is reduced modulo 64 */
+/* Rotate x right by n bits.  Both shift counts are masked to 0..63 so that
+   the boundary counts n == 0 and n == 64 (each of which returns x) never
+   shift by the full type width, which is undefined in C. */
+mh_decl uint64_t rotr64(uint64_t x, int n)
+{
+    const unsigned int s = (unsigned int)n & 63u;
+
+    return (x >> s) | (x << ((64u - s) & 63u));
+}
+#endif
+
+/* byte order inversions for 16, 32 and 64 bit variables */
+
+#if !defined(bswap_16)
+/* exchange the two bytes of a 16-bit value */
+mh_decl uint16_t bswap_16(uint16_t x)
+{
+    const unsigned int wide = x;
+
+    /* the cast discards everything above bit 15 of the combined value */
+    return (uint16_t)((wide << 8) | (wide >> 8));
+}
+#endif
+
+#if !defined(bswap_32)
+/* reverse the order of the four bytes of a 32-bit value */
+mh_decl uint32_t bswap_32(uint32_t x)
+{
+    /* a 24-bit right rotate puts two of the bytes in their final places,
+       an 8-bit right rotate does the same for the other two */
+    const uint32_t low_pair  = rotr32((x), 24) & 0x00ff00ff;
+    const uint32_t high_pair = rotr32((x), 8) & 0xff00ff00;
+
+    return (low_pair | high_pair);
+}
+#endif
+
+#if ( UNIT_BITS == 64 ) && !defined(bswap_64)
+/* reverse the order of the eight bytes of a 64-bit value: each 32-bit half
+   is byte-reversed and the two halves change places */
+mh_decl uint64_t bswap_64(uint64_t x)
+{
+    const uint32_t upper = (uint32_t)(x >> 32);
+    const uint32_t lower = (uint32_t)x;
+
+    return ((uint64_t)bswap_32(lower) << 32) | bswap_32(upper);
+}
+#endif
+
+/* support for fast aligned buffer move, xor and byte swap operations - 
+   source and destination buffers for move and xor operations must not 
+   overlap, those for byte order reversal must either not overlap or
+   must be identical
+*/
+/* per-index steps for the rep2_ and rep3_ macros: f_copy copies element n of
+   q to p; f_xor stores c(p[n] ^ q[n]) in r[n], where c is a macro applied to
+   the XOR result (the callers below pass UI8_VAL or UNIT_VAL) */
+#define f_copy(n,p,q)     p[n] = q[n]
+#define f_xor(n,r,p,q,c)  r[n] = c(p[n] ^ q[n])
+
+/* copy one 16 byte block from q to p; no alignment is required */
+mh_decl void copy_block(void* p, const void* q)
+{
+    enum { BLOCK_BYTES = 16 };
+
+    memcpy(p, q, BLOCK_BYTES);
+}
+
+/* copy one 16 byte block from q to p a unit at a time; both buffers are
+   accessed through UNIT_PTR, so they need to be suitably aligned for the
+   unit type */
+mh_decl void copy_block_aligned(void *p, const void *q)
+{
+#if UNIT_BITS == 8
+    memcpy(p, q, 16);
+#elif UNIT_BITS == 32
+    UNIT_PTR(p)[0] = UNIT_PTR(q)[0];
+    UNIT_PTR(p)[1] = UNIT_PTR(q)[1];
+    UNIT_PTR(p)[2] = UNIT_PTR(q)[2];
+    UNIT_PTR(p)[3] = UNIT_PTR(q)[3];
+#else
+    UNIT_PTR(p)[0] = UNIT_PTR(q)[0];
+    UNIT_PTR(p)[1] = UNIT_PTR(q)[1];
+#endif
+}
+
+/* r = p XOR q over one 16 byte block, one byte at a time from index 0 up */
+mh_decl void xor_block(void *r, const void* p, const void* q)
+{
+    int i;
+
+    for(i = 0; i < 16; ++i)
+        UI8_PTR(r)[i] = UI8_VAL(UI8_PTR(p)[i] ^ UI8_PTR(q)[i]);
+}
+
+/* r = p XOR q over one 16 byte block, one unit at a time from index 0 up;
+   all three buffers are accessed through UNIT_PTR, so they need to be
+   suitably aligned for the unit type */
+mh_decl void xor_block_aligned(void *r, const void *p, const void *q)
+{
+#if UNIT_BITS == 8
+    enum { UNITS_PER_BLOCK = 16 };
+#elif UNIT_BITS == 32
+    enum { UNITS_PER_BLOCK = 4 };
+#else
+    enum { UNITS_PER_BLOCK = 2 };
+#endif
+    int i;
+
+    for(i = 0; i < UNITS_PER_BLOCK; ++i)
+        UNIT_PTR(r)[i] = UNIT_VAL(UNIT_PTR(p)[i] ^ UNIT_PTR(q)[i]);
+}
+
+/* byte swap within 32-bit words in a 16 byte block; don't move 32-bit words.
+   d receives the result and s is the source; every step reads its inputs
+   before writing, so d and s may also be the same buffer.  The byte-unit
+   branch previously stored two bytes at the wrong index (12 instead of 11 and
+   3 instead of 13); it now loops over the four words so that every index is
+   derived from the word base. */
+mh_decl void bswap32_block(void *d, const void* s)
+{
+#if UNIT_BITS == 8
+    uint8_t t;
+    int w;
+    for(w = 0; w < 16; w += 4)
+    {
+        /* outer pair of the word, then inner pair */
+        t = UNIT_PTR(s)[w];     UNIT_PTR(d)[w]     = UNIT_PTR(s)[w + 3]; UNIT_PTR(d)[w + 3] = t;
+        t = UNIT_PTR(s)[w + 1]; UNIT_PTR(d)[w + 1] = UNIT_PTR(s)[w + 2]; UNIT_PTR(d)[w + 2] = t;
+    }
+#elif UNIT_BITS == 32
+    UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[1] = bswap_32(UNIT_PTR(s)[1]);
+    UNIT_PTR(d)[2] = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[3] = bswap_32(UNIT_PTR(s)[3]);
+#else
+    UI32_PTR(d)[0] = bswap_32(UI32_PTR(s)[0]); UI32_PTR(d)[1] = bswap_32(UI32_PTR(s)[1]);
+    UI32_PTR(d)[2] = bswap_32(UI32_PTR(s)[2]); UI32_PTR(d)[3] = bswap_32(UI32_PTR(s)[3]);
+#endif
+}
+
+/* byte swap within 64-bit words in a 16 byte block; don't move 64-bit words.
+   d receives the result and s is the source; every step reads its inputs
+   before writing, so d and s may also be the same buffer.  Two defects are
+   corrected: the byte-unit branch exchanged byte 3 with itself instead of
+   with byte 4, and the 32-bit branch read unit 2 twice instead of units 2
+   and 3. */
+mh_decl void bswap64_block(void *d, const void* s)
+{
+#if UNIT_BITS == 8
+    uint8_t t;
+    int base, k;
+    for(base = 0; base < 16; base += 8)
+        for(k = 0; k < 4; ++k)
+        {
+            /* exchange byte k of the word with its mirror image 7 - k */
+            t = UNIT_PTR(s)[base + k];
+            UNIT_PTR(d)[base + k] = UNIT_PTR(s)[base + 7 - k];
+            UNIT_PTR(d)[base + 7 - k] = t;
+        }
+#elif UNIT_BITS == 32
+    uint32_t t;
+    /* reverse the bytes of each 32-bit half and exchange the two halves */
+    t = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[1]); UNIT_PTR(d)[1] = t;
+    t = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[2] = bswap_32(UNIT_PTR(s)[3]); UNIT_PTR(d)[3] = t;
+#else
+    UNIT_PTR(d)[0] = bswap_64(UNIT_PTR(s)[0]);  UNIT_PTR(d)[1] = bswap_64(UNIT_PTR(s)[1]);
+#endif
+}
+
+/* reverse the order of all 16 bytes of a block: d receives the result, s is
+   the source; each exchange reads both inputs before writing, so d and s may
+   be the same buffer */
+mh_decl void bswap128_block(void *d, const void* s)
+{
+#if UNIT_BITS == 8
+    uint8_t held;
+    int k;
+    for(k = 0; k < 8; ++k)
+    {
+        held = UNIT_PTR(s)[k];
+        UNIT_PTR(d)[k] = UNIT_PTR(s)[15 - k];
+        UNIT_PTR(d)[15 - k] = held;
+    }
+#elif UNIT_BITS == 32
+    uint32_t held;
+    /* outer pair of units, then inner pair */
+    held = bswap_32(UNIT_PTR(s)[0]);
+    UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[3]);
+    UNIT_PTR(d)[3] = held;
+    held = bswap_32(UNIT_PTR(s)[1]);
+    UNIT_PTR(d)[1] = bswap_32(UNIT_PTR(s)[2]);
+    UNIT_PTR(d)[2] = held;
+#else
+    uint64_t held;
+    held = bswap_64(UNIT_PTR(s)[0]);
+    UNIT_PTR(d)[0] = bswap_64(UNIT_PTR(s)[1]);
+    UNIT_PTR(d)[1] = held;
+#endif
+}
+
+/* platform byte order to big or little endian order for 16, 32 and 64 bit variables.
+   Each macro either expands to an in-place assignment '(x) = bswap_N((x))',
+   so x must be a modifiable lvalue and is evaluated twice, or expands to
+   nothing when the platform order already matches the requested order. */
+
+#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+
+#  define uint16_t_to_le(x) (x) = bswap_16((x))
+#  define uint32_t_to_le(x) (x) = bswap_32((x))
+#  define uint64_t_to_le(x) (x) = bswap_64((x))
+#  define uint16_t_to_be(x)
+#  define uint32_t_to_be(x)
+#  define uint64_t_to_be(x)
+
+#else
+
+#  define uint16_t_to_le(x)
+#  define uint32_t_to_le(x)
+#  define uint64_t_to_le(x)
+#  define uint16_t_to_be(x) (x) = bswap_16((x))
+#  define uint32_t_to_be(x) (x) = bswap_32((x))
+#  define uint64_t_to_be(x) (x) = bswap_64((x))
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/tools/style.c.exclude
+++ b/tools/style.c.exclude
@ -1,5 +1,5 @@
 ^\./core/embed/bootloader/protob/
-^\./crypto/aes/aes\(\|crypt\|key\|_modes\|opt\|tab\|tst\)\.
+^\./crypto/aes/
 ^\./crypto/chacha20poly1305/
 ^\./crypto/ed25519-donna/
 ^\./crypto/gui/