feat(crypto): Add Brian Gladman's implementation of GCM.

2025-07-02 04:42:33 +00:00 · 2024-03-15 11:10:47 +01:00 · 2024-03-15 11:10:47 +01:00 · 6e207215e3
commit 6e207215e3
parent 89147ef493
9 changed files with 2599 additions and 2 deletions
--- a/2
+++ b/2
@ -65,7 +65,7 @@ yaml_check: ## check yaml formatting
 	yamllint .
 editor_check: ## check editorconfig formatting
-	editorconfig-checker -exclude '.*\.(so|dat|toif|der)'
+	editorconfig-checker -exclude '.*\.(so|dat|toif|der)|^crypto/aes/'
 cstyle_check: ## run code style check on low-level C code
 	clang-format --version
--- a/crypto/aes/aesgcm.c
+++ b/crypto/aes/aesgcm.c
@ -0,0 +1,547 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 30/03/2011
 My thanks to:
   Colin Sinclair for finding an error and suggesting a number of
   improvements to this code. 
   John Viega and David McGrew for their support in the development 
   of this code and to David for testing it on a big-endIAN system.
   Mark Rodenkirch and Jason Papadopoulos for their help in finding
   a bug in the fast buffer operations on big endian systems.
 */
 /* The public GCM interface (gcm_ctx and the gcm_* prototypes) is shipped in
    this tree as aesgcm.h, so that is the header that has to be included. */
 #include "aesgcm.h"
 #include "mode_hdr.h"
 /*  This GCM implementation needs a Galois Field multiplier for GF(2^128)
    which operates on field elements using a polynomial field representation
    x^127 + x^126 + ... + x^2 + x + 1 using the bits in a bit sequence that
    will be numbered by the power of x that they represent. GCM uses the
    polynomial x^128 + x^7 + x^2 + x + 1 as its basis for representation.
    The obvious way of representing this in a computer system is to map GF
    'x' to the binary integer '2' - but this was way too obvious for any
    cryptographer to adopt!
    Here bytes are numbered in memory order and  bits within bytes according
    to their integer numeric significance. The term 'little endian' is then
    used to describe mappings in which numeric (power of 2) or field (power
    of x) significance increase with increasing bit or byte numbers with
    'big endian' being used to describe the inverse situation.
    GCM uses little endian byte ordering and big endian bit ordering, a
    representation that will be described as LB. Hence the low end of the
    field polynomial is in byte[0], which has the value 0xe1 rather than
    0x87 in the more obvious mappings.
    The related field multiplier can use this mapping but if you want to
    use an alternative (e.g. hardware) multiplier that uses a different
    polynomial field representation, you can do so by changing the form
    used for the field elements when this alternative multiplier is used.
    If GF_REPRESENTATION is defined as one of:
        REVERSE_BITS                      // change to LL
        REVERSE_BYTES | REVERSE_BITS      // change to BL
        REVERSE_NONE                      // no change
        REVERSE_BYTES                     // change to BB
    then an appropriate change of representation will occur before and
    after calls to your revised field multiplier. To use this you need
    to add gf_convert.c to your application.  
 */
 #if defined(__cplusplus)
 extern "C"
 {
 #endif
 /*  Select the field element representation handed to the GF(2^128)
     multiplier.  At most one branch should be enabled; the default branch
     leaves GF_REPRESENTATION undefined, in which case none of the
     convert_representation() calls in this file are compiled.  The four
     alternatives correspond, in order, to the list in the comment above.
 */
 #if 1
 #  undef GF_REPRESENTATION
 #elif 0
 #  define GF_REPRESENTATION REVERSE_BITS
 #elif 0
 #  define GF_REPRESENTATION (REVERSE_BYTES | REVERSE_BITS)
 #elif 0
 #  define GF_REPRESENTATION REVERSE_NONE
 #elif 0
 #  define GF_REPRESENTATION REVERSE_BYTES
 #endif
 #define BLOCK_SIZE      GCM_BLOCK_SIZE      /* block length                 */
 #define BLK_ADR_MASK    (BLOCK_SIZE - 1)    /* mask for 'in block' address  */
 #define CTR_POS         12                  /* first byte of the counter    */
 /*  Increment the counter stored in bytes CTR_POS .. BLOCK_SIZE - 1 of the
     block x, with byte BLOCK_SIZE - 1 least significant.  The carry stops at
     byte CTR_POS, so the bytes below CTR_POS are never modified.  The macro
     is wrapped in do { } while(0) so that it behaves as a single statement
     (also in front of an 'else'), and its loop index has a private name so
     that it cannot capture an identifier used in the argument.
 */
 #define inc_ctr(x)  \
    do { int inc_ctr_i_ = BLOCK_SIZE; \
         while(inc_ctr_i_-- > CTR_POS && !++(UI8_PTR(x)[inc_ctr_i_])) ; \
    } while(0)
 /*  Key a GCM context.  The AES encryption key is installed in ctx->aes and
     the hash key is produced by encrypting an all-zero block in place in
     ctx->ghash_h.  If one of the TABLES_* options is defined, the matching
     multiplication table in the context is initialised from that value.
     The results of aes_encrypt_key() and aes_encrypt() are not examined, so
     this function always returns RETURN_GOOD.
     NOTE(review): key_len is passed straight to aes_encrypt_key(); which
     lengths it accepts is decided there, not here.
 */
 ret_type gcm_init_and_key(                  /* initialise mode and set key  */
            const unsigned char key[],      /* the key value                */
            unsigned long key_len,          /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {
    /* start from an all-zero block so the encryption below yields E(0) */
    memset(ctx->ghash_h, 0, sizeof(ctx->ghash_h));
    /* set the AES key                          */
    aes_encrypt_key(key, key_len, ctx->aes);
    /* compute E(0) (for the hash function)     */
    aes_encrypt(UI8_PTR(ctx->ghash_h), UI8_PTR(ctx->ghash_h), ctx->aes);
 #if defined( GF_REPRESENTATION )
    /* hand the hash key to the table set-up in the alternative form */
    convert_representation(ctx->ghash_h, ctx->ghash_h, GF_REPRESENTATION);
 #endif
    /* build whichever lookup table the build options ask for (if any) */
 #if defined( TABLES_64K )
    init_64k_table(ctx->ghash_h, ctx->gf_t64k);
 #elif defined( TABLES_8K )
    init_8k_table(ctx->ghash_h, ctx->gf_t8k);
 #elif defined( TABLES_4K )
    init_4k_table(ctx->ghash_h, ctx->gf_t4k);
 #elif defined( TABLES_256 )
    init_256_table(ctx->ghash_h, ctx->gf_t256);
 #endif
 #if defined(  GF_REPRESENTATION )
    /* second conversion of the stored hash key; presumably this restores
       the original form - confirm against convert_representation()       */
    convert_representation(ctx->ghash_h, ctx->ghash_h, GF_REPRESENTATION);
 #endif
    return RETURN_GOOD;
 }
 /*  Multiply the field element 'a' by the hash key held in the context.
     With a TABLES_* option the product is formed by the matching table
     routine using ctx's precomputed table and the scratch element 'scr';
     otherwise the generic gf_mul() is called with ctx->ghash_h.  When
     GF_REPRESENTATION is defined, 'a' is converted before and after the
     multiplication.  The result is expected back in 'a' (gf_mul() writes its
     first argument; the table routines are defined elsewhere - confirm).
 */
 void gf_mul_hh(gf_t a, gcm_ctx ctx[1])
 {
 #if defined( GF_REPRESENTATION ) || !defined( NO_TABLES )
    gf_t    scr;                /* scratch element for the routines below */
 #endif
 #if defined(  GF_REPRESENTATION )
    convert_representation(a, a, GF_REPRESENTATION);
 #endif
 #if defined( TABLES_64K )
    gf_mul_64k(a, ctx->gf_t64k, scr);
 #elif defined( TABLES_8K )
    gf_mul_8k(a, ctx->gf_t8k, scr);
 #elif defined( TABLES_4K )
    gf_mul_4k(a, ctx->gf_t4k, scr);
 #elif defined( TABLES_256 )
    gf_mul_256(a, ctx->gf_t256, scr);
 #else
 # if defined( GF_REPRESENTATION )
    /* no table: convert a copy of the hash key and multiply by that copy */
    convert_representation(scr, ctx->ghash_h, GF_REPRESENTATION);
    gf_mul(a, scr);
 # else
    gf_mul(a, ctx->ghash_h);
 # endif
 #endif
 #if defined(  GF_REPRESENTATION )
    convert_representation(a, a, GF_REPRESENTATION);
 #endif
 }
 /*  Start a new message under the key already held in ctx.
     The initial counter block is built in ctx->ctr_val:
       - a 12 byte IV is copied into bytes 0..11 and byte 15 is set to 1;
       - any other IV length is hashed: the IV is XORed into the block
         16 bytes at a time (the last, partial block is implicitly zero
         padded), each block being followed by a multiplication by the hash
         key, and finally the IV length in bits is XORed into the tail of
         the block and one more multiplication is applied.
     The last four bytes of the counter block are saved in ctx->y0_val for
     gcm_compute_tag(), both hash accumulators are cleared and all byte
     counters are reset.  Always returns RETURN_GOOD.
     The IV is a caller supplied byte buffer with no alignment guarantee, so
     it is combined with xor_block() and not with xor_block_aligned(); the
     other routines in this file only use the aligned form after checking
     the buffer address.  The residual length is kept in an unsigned long so
     that iv_len is not narrowed to 32 bits before it is hashed.
 */
 ret_type gcm_init_message(                  /* initialise a new message     */
            const unsigned char iv[],       /* the initialisation vector    */
            unsigned long iv_len,           /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   uint32_t i;
    unsigned long n_pos = 0;
    uint8_t *p;
    memset(ctx->ctr_val, 0, BLOCK_SIZE);
    if(iv_len == CTR_POS)
    {
        /* 12 byte IV: counter block is IV || 00 00 00 01 */
        memcpy(ctx->ctr_val, iv, CTR_POS); UI8_PTR(ctx->ctr_val)[15] = 0x01;
    }
    else
    {   n_pos = iv_len;
        /* whole IV blocks */
        while(n_pos >= BLOCK_SIZE)
        {
            xor_block(ctx->ctr_val, ctx->ctr_val, iv);
            n_pos -= BLOCK_SIZE;
            iv += BLOCK_SIZE;
            gf_mul_hh((gf_t*)ctx->ctr_val, ctx);
        }
        /* trailing partial IV block, if any */
        if(n_pos)
        {
            p = UI8_PTR(ctx->ctr_val);
            while(n_pos-- > 0)
                *p++ ^= *iv++;
            gf_mul_hh((gf_t*)ctx->ctr_val, ctx);
        }
        /* IV length in bits, least significant byte in the last position */
        n_pos = (iv_len << 3);
        for(i = BLOCK_SIZE - 1; n_pos; --i, n_pos >>= 8)
            UI8_PTR(ctx->ctr_val)[i] ^= (unsigned char)n_pos;
        gf_mul_hh((gf_t*)ctx->ctr_val, ctx);
    }
    /* remember the initial counter word; it is restored for the tag */
    ctx->y0_val = *UI32_PTR(UI8_PTR(ctx->ctr_val) + CTR_POS);
    memset(ctx->hdr_ghv, 0, BLOCK_SIZE);
    memset(ctx->txt_ghv, 0, BLOCK_SIZE);
    ctx->hdr_cnt = 0;
    ctx->txt_ccnt = ctx->txt_acnt = 0;
    return RETURN_GOOD;
 }
 /*  Absorb hdr_len bytes of header data into the header hash accumulator
     ctx->hdr_ghv.  The routine may be called repeatedly; ctx->hdr_cnt holds
     the number of header bytes absorbed so far and its low bits give the
     position inside the current 16 byte block.  A block is multiplied by
     the hash key only when more data arrives after it has been filled, so
     the final multiplication is left to gcm_compute_tag().
     Always returns RETURN_GOOD.
 */
 ret_type gcm_auth_header(                   /* authenticate the header      */
            const unsigned char hdr[],      /* the header buffer            */
            unsigned long hdr_len,          /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   uint32_t used = 0;                                      /* input bytes consumed */
    uint32_t fill = (uint32_t)ctx->hdr_cnt & BLK_ADR_MASK;  /* offset in the block  */
    if(hdr_len == 0)
        return RETURN_GOOD;
    /* an earlier call ended exactly on a block boundary: fold that block */
    if(fill == 0 && ctx->hdr_cnt != 0)
        gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
    if(((hdr - (UI8_PTR(ctx->hdr_ghv) + fill)) & BUF_ADRMASK) == 0)
    {
        /* input and accumulator share their alignment: bytes up to a unit
           boundary, then whole units, then whole blocks                   */
        for( ; used < hdr_len && (fill & BUF_ADRMASK) != 0; ++used, ++fill)
            UI8_PTR(ctx->hdr_ghv)[fill] ^= hdr[used];
        for( ; used + BUF_INC <= hdr_len && fill <= BLOCK_SIZE - BUF_INC;
                                        used += BUF_INC, fill += BUF_INC)
            *UNIT_PTR(UI8_PTR(ctx->hdr_ghv) + fill) ^= *UNIT_PTR(hdr + used);
        for( ; used + BLOCK_SIZE <= hdr_len; used += BLOCK_SIZE)
        {
            gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
            xor_block_aligned(ctx->hdr_ghv, ctx->hdr_ghv, hdr + used);
        }
    }
    else
    {
        /* differing alignment: complete the block bytewise, then use the
           alignment independent block XOR                                 */
        for( ; used < hdr_len && fill < BLOCK_SIZE; ++used, ++fill)
            UI8_PTR(ctx->hdr_ghv)[fill] ^= hdr[used];
        for( ; used + BLOCK_SIZE <= hdr_len; used += BLOCK_SIZE)
        {
            gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
            xor_block(ctx->hdr_ghv, ctx->hdr_ghv, hdr + used);
        }
    }
    /* whatever is left is shorter than one block */
    for( ; used < hdr_len; ++used, ++fill)
    {
        if(fill == BLOCK_SIZE)
        {
            gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
            fill = 0;
        }
        UI8_PTR(ctx->hdr_ghv)[fill] ^= hdr[used];
    }
    ctx->hdr_cnt += used;
    return RETURN_GOOD;
 }
 /*  Absorb data_len bytes of ciphertext into the text hash accumulator
     ctx->txt_ghv.  The routine may be called repeatedly; ctx->txt_acnt holds
     the number of bytes authenticated so far and its low bits give the
     position inside the current 16 byte block.  A filled block is only
     multiplied by the hash key once further data follows, the last
     multiplication being done by gcm_compute_tag().
     Always returns RETURN_GOOD.
 */
 ret_type gcm_auth_data(                     /* authenticate ciphertext data */
            const unsigned char data[],     /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   uint32_t used = 0;                                       /* input bytes consumed */
    uint32_t fill = (uint32_t)ctx->txt_acnt & BLK_ADR_MASK;  /* offset in the block  */
    if(data_len == 0)
        return RETURN_GOOD;
    /* the previous call stopped on a block boundary: fold that block now */
    if(fill == 0 && ctx->txt_acnt != 0)
        gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
    if(((data - (UI8_PTR(ctx->txt_ghv) + fill)) & BUF_ADRMASK) == 0)
    {
        /* matching alignment: bytes to a unit boundary, units, then blocks */
        for( ; used < data_len && (fill & BUF_ADRMASK) != 0; ++used, ++fill)
            UI8_PTR(ctx->txt_ghv)[fill] ^= data[used];
        for( ; used + BUF_INC <= data_len && fill <= BLOCK_SIZE - BUF_INC;
                                        used += BUF_INC, fill += BUF_INC)
            *UNIT_PTR(UI8_PTR(ctx->txt_ghv) + fill) ^= *UNIT_PTR(data + used);
        for( ; used + BLOCK_SIZE <= data_len; used += BLOCK_SIZE)
        {
            gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
            xor_block_aligned(ctx->txt_ghv, ctx->txt_ghv, data + used);
        }
    }
    else
    {
        /* differing alignment: bytewise to the end of the block, then the
           alignment independent block XOR                                 */
        for( ; used < data_len && fill < BLOCK_SIZE; ++used, ++fill)
            UI8_PTR(ctx->txt_ghv)[fill] ^= data[used];
        for( ; used + BLOCK_SIZE <= data_len; used += BLOCK_SIZE)
        {
            gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
            xor_block(ctx->txt_ghv, ctx->txt_ghv, data + used);
        }
    }
    /* the remainder is shorter than one block */
    for( ; used < data_len; ++used, ++fill)
    {
        if(fill == BLOCK_SIZE)
        {
            gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
            fill = 0;
        }
        UI8_PTR(ctx->txt_ghv)[fill] ^= data[used];
    }
    ctx->txt_acnt += used;
    return RETURN_GOOD;
 }
 /*  Encrypt or decrypt data_len bytes in place by XORing them with the
     counter mode key stream.  ctx->enc_ctr holds the current key stream
     block and the low bits of ctx->txt_ccnt give the position of the next
     unused key stream byte, so the routine can be called repeatedly with
     arbitrary lengths.  A new key stream block is produced by incrementing
     ctx->ctr_val and encrypting it.  Always returns RETURN_GOOD.
 */
 ret_type gcm_crypt_data(                    /* encrypt or decrypt data      */
            unsigned char data[],           /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   uint32_t used = 0;                                         /* bytes processed      */
    uint32_t ks_pos = (uint32_t)ctx->txt_ccnt & BLK_ADR_MASK;  /* key stream offset    */
    if(data_len == 0)
        return RETURN_GOOD;
    if(((data - (UI8_PTR(ctx->enc_ctr) + ks_pos)) & BUF_ADRMASK) == 0)
    {
        /* data and key stream share their alignment */
        if(ks_pos != 0)
        {
            /* use up the rest of the current key stream block: bytes up
               to a unit boundary first, then whole units                 */
            for( ; used < data_len && (ks_pos & BUF_ADRMASK) != 0; ++used, ++ks_pos)
                data[used] ^= UI8_PTR(ctx->enc_ctr)[ks_pos];
            for( ; used + BUF_INC <= data_len && ks_pos <= BLOCK_SIZE - BUF_INC;
                                        used += BUF_INC, ks_pos += BUF_INC)
                *UNIT_PTR(data + used) ^= *UNIT_PTR(UI8_PTR(ctx->enc_ctr) + ks_pos);
        }
        for( ; used + BLOCK_SIZE <= data_len; used += BLOCK_SIZE)
        {
            inc_ctr(ctx->ctr_val);
            aes_encrypt(UI8_PTR(ctx->ctr_val), UI8_PTR(ctx->enc_ctr), ctx->aes);
            xor_block_aligned(data + used, data + used, ctx->enc_ctr);
        }
    }
    else
    {
        /* differing alignment: bytewise for the current key stream block,
           then the alignment independent block XOR                        */
        if(ks_pos != 0)
        {
            for( ; used < data_len && ks_pos < BLOCK_SIZE; ++used, ++ks_pos)
                data[used] ^= UI8_PTR(ctx->enc_ctr)[ks_pos];
        }
        for( ; used + BLOCK_SIZE <= data_len; used += BLOCK_SIZE)
        {
            inc_ctr(ctx->ctr_val);
            aes_encrypt(UI8_PTR(ctx->ctr_val), UI8_PTR(ctx->enc_ctr), ctx->aes);
            xor_block(data + used, data + used, ctx->enc_ctr);
        }
    }
    /* trailing bytes; a fresh key stream block is needed when the current
       one is exhausted or none has been started at this position          */
    for( ; used < data_len; ++used, ++ks_pos)
    {
        if(ks_pos == 0 || ks_pos == BLOCK_SIZE)
        {
            inc_ctr(ctx->ctr_val);
            aes_encrypt(UI8_PTR(ctx->ctr_val), UI8_PTR(ctx->enc_ctr), ctx->aes);
            ks_pos = 0;
        }
        data[used] ^= UI8_PTR(ctx->enc_ctr)[ks_pos];
    }
    ctx->txt_ccnt += used;
    return RETURN_GOOD;
 }
 /*  Complete the hash computation and write the first tag_len bytes of the
     authentication tag to tag[].
     The header and text accumulators are each given their pending final
     multiplication by the hash key.  If header data was supplied, the
     header accumulator is then multiplied by the hash key raised to the
     number of text blocks (square and multiply on a copy of the key) so
     that the two separately computed hashes can be combined.  The text
     accumulator and the length block (header length in bits in bytes 0..7,
     text length in bits in bytes 8..15, most significant byte first) are
     XORed in, one last multiplication is applied, and the result is XORed
     with the encryption of the initial counter block.
     Returns:
       RETURN_ERROR  tag_len exceeds BLOCK_SIZE (only BLOCK_SIZE tag bytes
                     exist, a longer request would read beyond the hash and
                     key stream buffers), or data was both encrypted and
                     authenticated but the two byte counts differ;
       RETURN_WARN   data was authenticated but nothing was en/decrypted;
       RETURN_GOOD   otherwise.
     The accumulators in ctx are modified, so the tag can be computed only
     once per message.
 */
 ret_type gcm_compute_tag(                   /* compute authentication tag   */
            unsigned char tag[],            /* the buffer for the tag       */
            unsigned long tag_len,          /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   uint32_t i, ln;
    gf_t tbuf;
    if(tag_len > BLOCK_SIZE)
        return RETURN_ERROR;
    if(ctx->txt_acnt != ctx->txt_ccnt && ctx->txt_ccnt > 0)
        return RETURN_ERROR;
    /* the multiplications deferred by the authentication routines */
    gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
    gf_mul_hh((gf_t*)ctx->txt_ghv, ctx);
    if(ctx->hdr_cnt)
    {
        /* number of (possibly partial) text blocks = exponent of H */
        ln = (uint32_t)((ctx->txt_acnt + BLOCK_SIZE - 1) / BLOCK_SIZE);
        if(ln)
        {
 #if 1       /* alternative versions of the exponentiation operation */
            memcpy(tbuf, ctx->ghash_h, BLOCK_SIZE);
 #       if defined(  GF_REPRESENTATION )
            convert_representation(tbuf, tbuf, GF_REPRESENTATION);
            convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION);
 #       endif
            /* right to left square and multiply: tbuf runs through
               H, H^2, H^4, ... and is multiplied in for each set bit */
            for( ; ; )
            {
                if(ln & 1)
                {
                    gf_mul((void*)ctx->hdr_ghv, tbuf);
                }
                if(!(ln >>= 1))
                    break;
                gf_mul(tbuf, tbuf);
            }
 #else       /* this one seems slower on x86 and x86_64 :-( */
            i = ln | ln >> 1; i |= i >> 2; i |= i >> 4;
            i |= i >> 8; i |= i >> 16; i &= ~(i >> 1);
            memset(tbuf, 0, BLOCK_SIZE);
            UI8_PTR(tbuf)[0] = 0x80;
            while(i)
            {
 #           if defined(  GF_REPRESENTATION )
                convert_representation(tbuf, tbuf, GF_REPRESENTATION);
 #           endif
                gf_mul(tbuf, tbuf);
 #           if defined(  GF_REPRESENTATION )
                convert_representation(tbuf, tbuf, GF_REPRESENTATION);
 #           endif
                if(i & ln)
                    gf_mul_hh((gf_t*)tbuf, ctx);
                i >>= 1;
            }
 #           if defined(  GF_REPRESENTATION )
            convert_representation(tbuf, tbuf, GF_REPRESENTATION);
            convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION);
 #           endif
            gf_mul((void*)ctx->hdr_ghv, tbuf);
 #endif
 #if         defined(  GF_REPRESENTATION )
            convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION);
 #           endif
        }
    }
    /* combine the two hashes and the length block, last byte first */
    i = BLOCK_SIZE; 
 #ifdef BRG_UI64
    {   uint64_t tm = ((uint64_t)ctx->txt_acnt) << 3;
        while(i-- > 0)
        {
            UI8_PTR(ctx->hdr_ghv)[i] ^= UI8_PTR(ctx->txt_ghv)[i] ^ (unsigned char)tm;
            /* after byte 8 switch from the text to the header bit length */
            tm = (i == 8 ? (((uint64_t)ctx->hdr_cnt) << 3) : tm >> 8);
        }
    }
 #else   
    {   uint32_t tm = ctx->txt_acnt << 3;
        while(i-- > 0)
        {
            UI8_PTR(ctx->hdr_ghv)[i] ^= UI8_PTR(ctx->txt_ghv)[i] ^ (unsigned char)tm;
            /* tm is reloaded after each group of four bytes with the 32 bit
               word needed for the NEXT four bytes (lower addresses):
                 bytes 15..12  low  word of the text bit length
                 bytes 11.. 8  high word of the text bit length
                 bytes  7.. 4  low  word of the header bit length
                 bytes  3.. 0  high word of the header bit length          */
            if(i & 3)
                tm >>= 8;
            else if(i == 12)
                tm = ctx->txt_acnt >> 29;
            else if(i == 8)
                tm = ctx->hdr_cnt << 3;
            else
                tm = ctx->hdr_cnt >> 29;
        }
    }
 #endif
    gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx);
    /* rebuild the initial counter block and encrypt it to mask the hash */
    memcpy(ctx->enc_ctr, ctx->ctr_val, BLOCK_SIZE);
    *UI32_PTR(UI8_PTR(ctx->enc_ctr) + CTR_POS) = ctx->y0_val;
    aes_encrypt(UI8_PTR(ctx->enc_ctr), UI8_PTR(ctx->enc_ctr), ctx->aes);
    for(i = 0; i < (unsigned int)tag_len; ++i)
        tag[i] = (unsigned char)(UI8_PTR(ctx->hdr_ghv)[i] ^ UI8_PTR(ctx->enc_ctr)[i]);
    return (ctx->txt_ccnt == ctx->txt_acnt ? RETURN_GOOD : RETURN_WARN);
 }
 /*  Wipe the whole context, which holds the AES key schedule, the hash key
     and the per message state.  The bytes are cleared through a volatile
     pointer so that the compiler cannot discard the stores as dead writes
     when the context is not used again after this call.
     Always returns RETURN_GOOD.
 */
 ret_type gcm_end(                           /* clean up and end operation   */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   volatile unsigned char *wipe = (volatile unsigned char*)ctx;
    unsigned long left = (unsigned long)sizeof(gcm_ctx);
    while(left-- > 0)
        *wipe++ = 0;
    return RETURN_GOOD;
 }
 /*  Encrypt data_len bytes in place and add the resulting ciphertext to the
     text hash.  The order matters: it is the ciphertext that is
     authenticated, so the data is encrypted first.  The results of the two
     calls are not examined; RETURN_GOOD is always returned.
 */
 ret_type gcm_encrypt(                       /* encrypt & authenticate data  */
            unsigned char data[],           /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {
    (void)gcm_crypt_data(data, data_len, ctx);  /* plaintext -> ciphertext  */
    (void)gcm_auth_data(data, data_len, ctx);   /* hash the ciphertext      */
    return RETURN_GOOD;
 }
 /*  Add data_len bytes of ciphertext to the text hash and then decrypt them
     in place.  The order matters: the buffer must be hashed while it still
     holds ciphertext.  The results of the two calls are not examined;
     RETURN_GOOD is always returned.
 */
 ret_type gcm_decrypt(                       /* authenticate & decrypt data  */
            unsigned char data[],           /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {
    (void)gcm_auth_data(data, data_len, ctx);   /* hash the ciphertext      */
    (void)gcm_crypt_data(data, data_len, ctx);  /* ciphertext -> plaintext  */
    return RETURN_GOOD;
 }
 /*  One call encryption of a complete message held in memory: start the
     message with the IV, authenticate the header, encrypt and authenticate
     msg in place and write tag_len bytes of tag.  Any non-zero result from
     gcm_compute_tag() (including a warning) is reported as RETURN_ERROR.
 */
 ret_type gcm_encrypt_message(               /* encrypt an entire message    */
            const unsigned char iv[],       /* the initialisation vector    */
            unsigned long iv_len,           /* and its length in bytes      */
            const unsigned char hdr[],      /* the header buffer            */
            unsigned long hdr_len,          /* and its length in bytes      */
            unsigned char msg[],            /* the message buffer           */
            unsigned long msg_len,          /* and its length in bytes      */
            unsigned char tag[],            /* the buffer for the tag       */
            unsigned long tag_len,          /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   ret_type tag_rc;
    gcm_init_message(iv, iv_len, ctx);
    gcm_auth_header(hdr, hdr_len, ctx);
    gcm_encrypt(msg, msg_len, ctx);
    tag_rc = gcm_compute_tag(tag, tag_len, ctx);
    if(tag_rc)
        return RETURN_ERROR;
    return RETURN_GOOD;
 }
 /*  One call decryption of a complete message held in memory: start the
     message with the IV, authenticate the header, authenticate and decrypt
     msg in place, recompute the tag and compare its first tag_len bytes
     with the supplied tag.
     Returns RETURN_GOOD only if the tag computation succeeded and the tags
     match, RETURN_ERROR otherwise.  tag_len values above BLOCK_SIZE are
     rejected before anything is processed, because the recomputed tag lives
     in a BLOCK_SIZE byte local buffer.  The tags are compared without an
     early exit so that the time taken does not reveal the position of the
     first differing byte.
     NOTE: msg has already been decrypted in place when the tags are
     compared; on RETURN_ERROR its contents are unauthenticated and must be
     discarded by the caller.
 */
 ret_type gcm_decrypt_message(               /* decrypt an entire message    */
            const unsigned char iv[],       /* the initialisation vector    */
            unsigned long iv_len,           /* and its length in bytes      */
            const unsigned char hdr[],      /* the header buffer            */
            unsigned long hdr_len,          /* and its length in bytes      */
            unsigned char msg[],            /* the message buffer           */
            unsigned long msg_len,          /* and its length in bytes      */
            const unsigned char tag[],      /* the buffer for the tag       */
            unsigned long tag_len,          /* and its length in bytes      */
            gcm_ctx ctx[1])                 /* the mode context             */
 {   uint8_t local_tag[BLOCK_SIZE];
    unsigned char diff = 0;
    unsigned long k;
    ret_type rr;
    if(tag_len > BLOCK_SIZE)
        return RETURN_ERROR;
    /* gcm_compute_tag() can return early without writing the tag, so make
       sure the comparison below never reads indeterminate bytes           */
    memset(local_tag, 0, sizeof(local_tag));
    gcm_init_message(iv, iv_len, ctx);
    gcm_auth_header(hdr, hdr_len, ctx);
    gcm_decrypt(msg, msg_len, ctx);
    rr = gcm_compute_tag(local_tag, tag_len, ctx);
    /* accumulate all differences instead of stopping at the first one */
    for(k = 0; k < tag_len; ++k)
        diff |= (unsigned char)(tag[k] ^ local_tag[k]);
    return (rr != RETURN_GOOD || diff != 0) ? RETURN_ERROR : RETURN_GOOD;
 }
 #if defined(__cplusplus)
 }
 #endif
--- a/crypto/aes/aesgcm.h
+++ b/crypto/aes/aesgcm.h
@ -0,0 +1,233 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 11/01/2011
 I am grateful for the work done by Mark Rodenkirch and Jason Papadopoulos
 in helping to remove a bug in the operation of this code on big endian
 systems when fast buffer operations are enabled.
 ---------------------------------------------------------------------------
 */
 #ifndef _GCM_H
 #define _GCM_H
 #include "aes.h"
 #include "gf128mul.h"
 /*  USER DEFINABLE OPTIONS (Further options need to be set in gf128mul.h) */
 /*  UNIT_BITS sets the size of variables used to process 16 byte buffers
    when the buffer alignment allows this.  When buffers are processed
    in bytes, 16 individual operations are involved.  But if, say, such 
    a buffer is divided into 4 32 bit variables, it can then be processed
    in 4 operations, making the code typically much faster. In general
    it will pay to use the longest natively supported size, which will
    probably be 32 or 64 bits in 32 and 64 bit systems respectively.
 */
 /* Any UNIT_BITS value defined before this point is discarded, so the
    selection below is always the one that takes effect. */
 #if defined( UNIT_BITS )
 # undef UNIT_BITS
 #endif
 /* Always true after the #undef above.  Big endian builds use the single
    enabled alternative (64); otherwise 64 is used when _WIN64 is defined
    and 32 in all other cases. */
 #if !defined( UNIT_BITS )
 #  if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
 #    if 0
 #      define UNIT_BITS   8
 #    elif 0
 #      define UNIT_BITS  32
 #    elif 1
 #      define UNIT_BITS  64
 #    endif
 #  elif defined( _WIN64 )
 #    define UNIT_BITS 64
 #  else
 #    define UNIT_BITS 32
 #  endif
 #endif
 /* 64 bit units need a 64 bit integer type; NEED_UINT_64T requests it
    (presumably acted on by a header included elsewhere - confirm). */
 #if UNIT_BITS == 64 && !defined( NEED_UINT_64T )
 #  define NEED_UINT_64T
 #endif
 /* END OF USER DEFINABLE OPTIONS */
 /*  After encryption or decryption operations the return value of
    'compute tag' will be one of the values RETURN_GOOD, RETURN_WARN
    or RETURN_ERROR, the latter indicating an error. A return value
    RETURN_GOOD indicates that both encryption and authentication
    have taken place and resulted in the returned tag value. If
    the returned value is RETURN_WARN, the tag value is the result
    of authentication alone without encryption (CCM) or decryption
    (GCM and EAX).
 */
 /* Result codes; defined here only if RETURN_GOOD is not already defined.
    RETURN_GOOD is zero, so a result can be tested for 'not good' directly. */
 #ifndef RETURN_GOOD
 # define RETURN_WARN      1
 # define RETURN_GOOD      0
 # define RETURN_ERROR    -1
 #endif
 #if defined(__cplusplus)
 extern "C"
 {
 #endif
 /* Result type of all gcm_* calls, unless RET_TYPE_DEFINED says that it has
    already been provided. */
 #ifndef RET_TYPE_DEFINED
  typedef int  ret_type;
 #endif
 /* gcm_unit_t and gcm_buf_t are produced by the UNIT_TYPEDEF / BUFR_TYPEDEF
    macros (defined elsewhere) from UNIT_BITS and the AES block size;
    gcm_buf_t is the type of the 16 byte working buffers in gcm_ctx. */
 UNIT_TYPEDEF(gcm_unit_t, UNIT_BITS);
 BUFR_TYPEDEF(gcm_buf_t, UNIT_BITS, AES_BLOCK_SIZE);
 /* GCM works on blocks of the AES block size */
 #define GCM_BLOCK_SIZE  AES_BLOCK_SIZE
 /* The GCM-AES  context  */
 /* Complete state of one GCM instance: the key dependent data set up by
    gcm_init_and_key() and the per message state reset by
    gcm_init_message().  It contains key material and should be cleared
    with gcm_end() when no longer needed.  The byte counters are 32 bit
    values, so they wrap once a count reaches 2^32 bytes. */
 typedef struct
 {
    /* optional multiplication table for the hash key; which one is present
       (if any) depends on the TABLES_* build option */
 #if defined( TABLES_64K )
    gf_t64k_a       gf_t64k;
 #endif
 #if defined( TABLES_8K )
    gf_t8k_a        gf_t8k;
 #endif
 #if defined( TABLES_4K )
    gf_t4k_a        gf_t4k;
 #endif
 #if defined( TABLES_256 )
    gf_t256_a       gf_t256;
 #endif
    gcm_buf_t       ctr_val;                /* CTR counter value            */
    gcm_buf_t       enc_ctr;                /* encrypted CTR block          */
    gcm_buf_t       hdr_ghv;                /* ghash buffer (header)        */
    gcm_buf_t       txt_ghv;                /* ghash buffer (ciphertext)    */
    gf_t            ghash_h;                /* ghash H value                */
    aes_encrypt_ctx aes[1];                 /* AES encryption context       */
    uint32_t        y0_val;                 /* initial counter value        */
    uint32_t        hdr_cnt;                /* header bytes so far          */
    uint32_t        txt_ccnt;               /* text bytes so far (encrypt)  */
    uint32_t        txt_acnt;               /* text bytes so far (auth)     */
 } gcm_ctx;
 /* The following calls handle mode initialisation, keying and completion    */
 /* Install the AES key in the context and derive the hash key from it.      */
 /* Must be called before any message is processed with the context.         */
 ret_type gcm_init_and_key(                  /* initialise mode and set key  */
            const unsigned char key[],      /* the key value                */
            unsigned long key_len,          /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* Clear the whole context (key material and message state).                */
 ret_type gcm_end(                           /* clean up and end operation   */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* The following calls handle complete messages in memory as one operation  */
 /* msg is encrypted in place and tag_len bytes of tag are written to tag[]; */
 /* RETURN_GOOD is returned on success and RETURN_ERROR otherwise            */
 ret_type gcm_encrypt_message(               /* encrypt an entire message    */
            const unsigned char iv[],       /* the initialisation vector    */
            unsigned long iv_len,           /* and its length in bytes      */
            const unsigned char hdr[],      /* the header buffer            */
            unsigned long hdr_len,          /* and its length in bytes      */
            unsigned char msg[],            /* the message buffer           */
            unsigned long msg_len,          /* and its length in bytes      */
            unsigned char tag[],            /* the buffer for the tag       */
            unsigned long tag_len,          /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
                                /* RETURN_GOOD is returned if the input tag */
                                /* matches that for the decrypted message   */
                                /* and RETURN_ERROR otherwise; msg is       */
                                /* decrypted in place before the tags are   */
                                /* compared, so it must be discarded when   */
                                /* RETURN_ERROR is returned                 */
 ret_type gcm_decrypt_message(               /* decrypt an entire message    */
            const unsigned char iv[],       /* the initialisation vector    */
            unsigned long iv_len,           /* and its length in bytes      */
            const unsigned char hdr[],      /* the header buffer            */
            unsigned long hdr_len,          /* and its length in bytes      */
            unsigned char msg[],            /* the message buffer           */
            unsigned long msg_len,          /* and its length in bytes      */
            const unsigned char tag[],      /* the buffer for the tag       */
            unsigned long tag_len,          /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* The following calls handle messages in a sequence of operations followed */
 /* by tag computation after the sequence has been completed. In these calls */
 /* the user is responsible for verifying the computed tag on decryption     */
 /* Start a new message: set up the counter block from the IV and reset the  */
 /* hash accumulators and byte counters                                      */
 ret_type gcm_init_message(                  /* initialise a new message     */
            const unsigned char iv[],       /* the initialisation vector    */
            unsigned long iv_len,           /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* Add header data to the header hash; may be called repeatedly             */
 ret_type gcm_auth_header(                   /* authenticate the header      */
            const unsigned char hdr[],      /* the header buffer            */
            unsigned long hdr_len,          /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* Encrypt data in place, then add the ciphertext to the text hash          */
 ret_type gcm_encrypt(                       /* encrypt & authenticate data  */
            unsigned char data[],           /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* Add the ciphertext to the text hash, then decrypt it in place            */
 ret_type gcm_decrypt(                       /* authenticate & decrypt data  */
            unsigned char data[],           /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* Finish the hash and write tag_len bytes of tag; see the description of   */
 /* the RETURN_GOOD / RETURN_WARN / RETURN_ERROR results above               */
 ret_type gcm_compute_tag(                   /* compute authentication tag   */
            unsigned char tag[],            /* the buffer for the tag       */
            unsigned long tag_len,          /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /*  The use of the following calls should be avoided if possible because 
    their use requires a very good understanding of the way this encryption 
    mode works and the way in which this code implements it in order to use 
    them correctly.
    The gcm_auth_data routine is used to authenticate encrypted message data.
    In message encryption gcm_crypt_data must be called before gcm_auth_data
    is called since it is encrypted data that is authenticated.  In message
    decryption authentication must occur before decryption and data can be
    authenticated without being decrypted if necessary.
    If these calls are used it is up to the user to ensure that these routines
    are called in the correct order and that the correct data is passed to 
    them.
    When gcm_compute_tag is called it is assumed that an error in use has
    occurred if both encryption (or decryption) and authentication have taken
    place but the total lengths of the message data respectively authenticated
    and encrypted are not the same. If authentication has taken place but 
    there has been no corresponding encryption or decryption operations (none
    at all) only a warning is issued. This should be treated as an error if it 
    occurs during encryption but it is only signalled as a warning as it might 
    be intentional when decryption operations are involved (this avoids having
    different compute tag functions for encryption and decryption). Decryption
    operations can be undertaken freely after authentication but if the tag is
    computed after such operations an error will be signalled if the lengths
    of the data authenticated and decrypted don't match.
 */
 /* Add ciphertext to the text hash without changing the data                */
 ret_type gcm_auth_data(                     /* authenticate ciphertext data */
            const unsigned char data[],     /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 /* XOR the data in place with the counter mode key stream; the same call    */
 /* serves for encryption and decryption and does no authentication          */
 ret_type gcm_crypt_data(                    /* encrypt or decrypt data      */
            unsigned char data[],           /* the data buffer              */
            unsigned long data_len,         /* and its length in bytes      */
            gcm_ctx ctx[1]);                /* the mode context             */
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/crypto/aes/brg_endian.h
+++ b/crypto/aes/brg_endian.h
@ -0,0 +1,29 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 10/09/2018
 */
 #ifndef _BRG_ENDIAN_H
 #define _BRG_ENDIAN_H
 /* Symbolic values that PLATFORM_BYTE_ORDER is compared against */
 #define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
 #define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
 /* The byte order is fixed to little endian here; nothing in this header
    detects it from the compiler or platform, so a big endian build has to
    change this definition. */
 #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 #endif
--- a/crypto/aes/gf128mul.c
+++ b/crypto/aes/gf128mul.c
@ -0,0 +1,471 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 20/12/2007
 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes (see gf128mul.h).
 */
 /*  Speed critical loops can be unrolled to gain speed but consume more memory */
 /*  UNROLL_LOOPS selects, for each gf_mul_* routine in this file, the
     variant with sixteen explicit steps; change the condition below to 0
     to build the variants that use a for loop instead.
 */
 #if 1
 #  define UNROLL_LOOPS
 #endif
 /* The order of these includes matters */
 #include "mode_hdr.h"
 #include "gf128mul.h"
 #include "gf_mul_lo.h"
 /*  'mode' is the function name suffix for the field representation that
     was selected in gf128mul.h.  gf_mulx1(mode), gf_mulx4(mode) and
     gf_mulx8(mode) paste it onto the primitive's name, giving for example
     gf_mulx8_lb when GF_MODE_LB is defined.
 */
 #if defined( GF_MODE_LL )
 #  define mode   _ll
 #elif defined( GF_MODE_BL )
 #  define mode   _bl
 #elif defined( GF_MODE_LB )
 #  define mode   _lb
 #elif defined( GF_MODE_BB )
 #  define mode   _bb
 #else
 #  error mode is not defined
 #endif
 /*  GF_INDEX(i) converts a byte number counted in order of field
     significance into an offset in the 16 byte buffer: the identity for
     the modes with little endian byte order (LL, LB) and 15 - i for the
     modes with big endian byte order (BL, BB).
 */
 #if defined( GF_MODE_LL) || defined( GF_MODE_LB )
 #  define GF_INDEX(i)  (i)
 #else
 #  define GF_INDEX(i)  (15 - (i))
 #endif
 /*  A slow field multiplier: a = a * b, computed by shift and add with
     no key dependent tables.  a and b may be the same buffer.
 */
 void gf_mul(gf_t a, const gf_t b)
 {
     /* mask for the x^k term of a byte, indexed by k */
     static const uint8_t term_bit[8] =
         { X_0, X_1, X_2, X_3, X_4, X_5, X_6, X_7 };
     gf_t xpow[8];           /* xpow[k] holds (input a) * x^k       */
     const uint8_t *mult;    /* the bytes of the multiplier         */
     int k, pos;
     copy_block_aligned(xpow[0], a);
     for(k = 1; k < 8; ++k)
         gf_mulx1(mode)(xpow[k], xpow[k - 1]);
     /* a is cleared next, so when a and b are the same buffer the
        multiplier has to be read from the saved copy of a          */
     if(a == b)
         mult = (const uint8_t*)xpow[0];
     else
         mult = (const uint8_t*)b;
     memset(a, 0, GF_BYTE_LEN);
     /* work from the most significant multiplier byte downwards */
     for(pos = 15; pos >= 0; --pos)
     {
         const uint8_t byte = mult[GF_INDEX(pos)];
         for(k = 0; k < 8; ++k)
             if(byte & term_bit[k])
                 xor_block_aligned(a, a, xpow[k]);
         /* make room for the next byte, except after the last one */
         if(pos > 0)
             gf_mulx8(mode)(a);
     }
 }
 #if defined( TABLES_64K )
 /*  This version uses 64k bytes of table space on the stack.
    An input variable field value in a[] has to be multiplied
    by a key value in g[] that changes far less frequently.
    To do this a[] is split up into 16 smaller field values,
    each one byte in length. For the 256 values of each of
    these smaller values, we can precompute the result of
    multiplying g by this field value. We can then combine
    these values to provide the full multiply. So for each
    of 16 bytes we have a table of 256 field values each of
    16 bytes - 64k bytes in total.
 */
 /*  Build the 16 x 256 entry table for the key value g: t[p][v] is g
     multiplied by the field value that has byte v at byte position p
     and zero elsewhere.
 */
 void init_64k_table(const gf_t g, gf_t64k_t t)
 {
     int pos, hi, lo;
     /*  Bits within bytes run high to low (0xe1 style) or low to high
         (0x87 style) depending on the representation, so the single
         bit entries g, g*x .. g*x^7 of table 0 are placed either at
         indexes 1, 2 .. 128 or at indexes 128, 64 .. 1.
     */
     /* the entry for the zero field element */
     memset(t[0][0], 0, GF_BYTE_LEN);
 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
     /* g goes to t[0][1]; each doubling of the index multiplies by x */
     memcpy(t[0][1], g, GF_BYTE_LEN);
     for(hi = 2; hi <= 128; hi += hi)
         gf_mulx1(mode)(t[0][hi], t[0][hi >> 1]);
 #else
     /* g goes to t[0][128]; each halving of the index multiplies by x */
     memcpy(t[0][128], g, GF_BYTE_LEN);
     for(hi = 128; hi > 1; hi >>= 1)
         gf_mulx1(mode)(t[0][hi >> 1], t[0][hi]);
 #endif
     for(pos = 0; pos < GF_BYTE_LEN; ++pos)
     {
         if(pos > 0)
         {
             /*  the eight single bit entries for this byte position
                 are those of the position below multiplied by x^8
             */
             memset(t[pos][0], 0, GF_BYTE_LEN);
             for(hi = 128; hi > 0; hi >>= 1)
             {
                 memcpy(t[pos][hi], t[pos - 1][hi], GF_BYTE_LEN);
                 gf_mulx8(mode)(t[pos][hi]);
             }
         }
         /*  every other entry is the xor of entries already present:
                 g * {3} = g * {2} ^ g * {1}
                 g * {5} = g * {4} ^ g * {1}
                 g * {6} = g * {4} ^ g * {2}
                 g * {7} = g * {4} ^ g * {3}   and so on,
             where {n} is the field value represented by the integer n
         */
         for(hi = 2; hi < 256; hi += hi)
             for(lo = 1; lo < hi; ++lo)
                 xor_block_aligned(t[pos][hi + lo], t[pos][hi], t[pos][lo]);
     }
 }
 /* xor into r the table entry chosen by byte i (in field order) of ap */
 #define xor_64k(i,ap,t,r) xor_block_aligned(r, r, t[i][ap[GF_INDEX(i)]])
 /*  a = a * (the key value t was built from).  The product is built in
     r, which is used as working space, and is then copied to a.
 */
 #if defined( UNROLL_LOOPS )
 void gf_mul_64k(gf_t a, const  gf_t64k_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     memset(r, 0, GF_BYTE_LEN);
     xor_64k(15, bytes, t, r);
     xor_64k(14, bytes, t, r);
     xor_64k(13, bytes, t, r);
     xor_64k(12, bytes, t, r);
     xor_64k(11, bytes, t, r);
     xor_64k(10, bytes, t, r);
     xor_64k( 9, bytes, t, r);
     xor_64k( 8, bytes, t, r);
     xor_64k( 7, bytes, t, r);
     xor_64k( 6, bytes, t, r);
     xor_64k( 5, bytes, t, r);
     xor_64k( 4, bytes, t, r);
     xor_64k( 3, bytes, t, r);
     xor_64k( 2, bytes, t, r);
     xor_64k( 1, bytes, t, r);
     xor_64k( 0, bytes, t, r);
     copy_block_aligned(a, r);
 }
 #else
 void gf_mul_64k(gf_t a, const  gf_t64k_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     int idx = GF_BYTE_LEN;
     memset(r, 0, GF_BYTE_LEN);
     /* idx takes the values 15 down to 0 inside the loop body */
     while(idx-- > 0)
     {
         xor_64k(idx, bytes, t, r);
     }
     copy_block_aligned(a, r);
 }
 #endif
 #endif
 #if defined( TABLES_8K )
 /*  This version uses 8k bytes of table space on the stack.
    An input field value in a[] has to be multiplied by a
    key value in g[]. To do this a[] is split up into 32
    smaller field values each 4-bits in length. For the
    16 values of each of these smaller field values we can
    precompute the result of multiplying g[] by the field
    value in question. So for each of 32 nibbles we have a
    table of 16 field values, each of 16 bytes - 8k bytes
    in total.
 */
 /*  Build the 32 x 16 entry table for the key value g.  Tables 2i and
     2i + 1 are indexed by the low four bits and the high four bits of
     byte i respectively (see xor_8k).
 */
 void init_8k_table(const gf_t g, gf_t8k_t t)
 {
     int tbl, hi, lo;
     /* the entries for a zero multiplier in the first two tables */
     memset(t[0][0], 0, GF_BYTE_LEN);
     memset(t[1][0], 0, GF_BYTE_LEN);
 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
     /* t[0][1] = g, then t[0][2], t[0][4], t[0][8] by repeated * x */
     memcpy(t[0][1], g, GF_BYTE_LEN);
     for(hi = 2; hi <= 8; hi += hi)
         gf_mulx1(mode)(t[0][hi], t[0][hi >> 1]);
     /* t[1][1] = t[0][8] * x, then t[1][2], t[1][4], t[1][8] */
     gf_mulx1(mode)(t[1][1], t[0][8]);
     for(hi = 2; hi <= 8; hi += hi)
         gf_mulx1(mode)(t[1][hi], t[1][hi >> 1]);
 #else
     /* t[1][8] = g, then t[1][4], t[1][2], t[1][1] by repeated * x */
     memcpy(t[1][8], g, GF_BYTE_LEN);
     for(hi = 8; hi > 1; hi >>= 1)
         gf_mulx1(mode)(t[1][hi >> 1], t[1][hi]);
     /* t[0][8] = t[1][1] * x, then t[0][4], t[0][2], t[0][1] */
     gf_mulx1(mode)(t[0][8], t[1][1]);
     for(hi = 8; hi > 1; hi >>= 1)
         gf_mulx1(mode)(t[0][hi >> 1], t[0][hi]);
 #endif
     for(tbl = 0; tbl < 2 * GF_BYTE_LEN; ++tbl)
     {
         if(tbl > 1)
         {
             /*  tables 0 and 1 were seeded above; each later table is
                 seeded from the table two below, multiplied by x^8
             */
             memset(t[tbl][0], 0, GF_BYTE_LEN);
             for(hi = 8; hi > 0; hi >>= 1)
             {
                 memcpy(t[tbl][hi], t[tbl - 2][hi], GF_BYTE_LEN);
                 gf_mulx8(mode)(t[tbl][hi]);
             }
         }
         /* the remaining entries are xors of entries already present */
         for(hi = 2; hi < 16; hi += hi)
             for(lo = 1; lo < hi; ++lo)
                 xor_block_aligned(t[tbl][hi + lo], t[tbl][hi], t[tbl][lo]);
     }
 }
 /*  xor into r the two table entries chosen by the low and the high four
     bits of byte i (in field order) of ap.  This expands to two
     statements, so it must not be used as an unbraced loop or if body.
 */
 #define xor_8k(i,ap,t,r)   \
    xor_block_aligned(r, r, t[i + i][ap[GF_INDEX(i)] & 15]); \
    xor_block_aligned(r, r, t[i + i + 1][ap[GF_INDEX(i)] >> 4])
 /*  a = a * (the key value t was built from).  The product is built in
     r, which is used as working space, and is then copied to a.
 */
 #if defined( UNROLL_LOOPS )
 void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     memset(r, 0, GF_BYTE_LEN);
     xor_8k(15, bytes, t, r);
     xor_8k(14, bytes, t, r);
     xor_8k(13, bytes, t, r);
     xor_8k(12, bytes, t, r);
     xor_8k(11, bytes, t, r);
     xor_8k(10, bytes, t, r);
     xor_8k( 9, bytes, t, r);
     xor_8k( 8, bytes, t, r);
     xor_8k( 7, bytes, t, r);
     xor_8k( 6, bytes, t, r);
     xor_8k( 5, bytes, t, r);
     xor_8k( 4, bytes, t, r);
     xor_8k( 3, bytes, t, r);
     xor_8k( 2, bytes, t, r);
     xor_8k( 1, bytes, t, r);
     xor_8k( 0, bytes, t, r);
     copy_block_aligned(a, r);
 }
 #else
 void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     int idx = GF_BYTE_LEN;
     memset(r, 0, GF_BYTE_LEN);
     /* idx takes the values 15 down to 0 inside the loop body */
     while(idx-- > 0)
     {
         xor_8k(idx, bytes, t, r);
     }
     memcpy(a, r, GF_BYTE_LEN);
 }
 #endif
 #endif
 #if defined( TABLES_4K )
 /*  This version uses 4k bytes of table space on the stack.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(128).  If we consider a GF(128) value in a
    single byte, we can construct a table of the 256 16
    byte values that result from multiplying g by the 256
    values of this byte.  This requires 4096 bytes.
    If we take the highest byte in the buffer and use this
    table to multiply it by g, we then have to multiply it
    by x^120 to get the final value. For the next highest
    byte the result has to be multiplied by x^112 and so on.
    But we can do this by accumulating the result in an
    accumulator starting with the result for the top byte.
    We repeatedly multiply the accumulator value by x^8 and
    then add in (i.e. xor) the 16 bytes of the next lower
    byte in the buffer, stopping when we reach the lowest
    byte. This requires a 4096 byte table.
 */
 /*  Build the 256 entry table for the key value g: t[v] is g multiplied
     by the field value held in the single byte v.
 */
 void init_4k_table(const gf_t g, gf_t4k_t t)
 {
     int hi, lo;
     /* the entry for the zero field element */
     memset(t[0], 0, GF_BYTE_LEN);
 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
     /* g goes to t[1]; each doubling of the index multiplies by x */
     memcpy(t[1], g, GF_BYTE_LEN);
     for(hi = 2; hi <= 128; hi += hi)
         gf_mulx1(mode)(t[hi], t[hi >> 1]);
 #else
     /* g goes to t[128]; each halving of the index multiplies by x */
     memcpy(t[128], g, GF_BYTE_LEN);
     for(hi = 128; hi > 1; hi >>= 1)
         gf_mulx1(mode)(t[hi >> 1], t[hi]);
 #endif
     /* the remaining entries are xors of entries already present */
     for(hi = 2; hi < 256; hi += hi)
         for(lo = 1; lo < hi; ++lo)
             xor_block_aligned(t[hi + lo], t[hi], t[lo]);
 }
 /*  Multiply the accumulator r by x^8 and then xor in the table entry
     chosen by byte i (in field order) of ap.  This expands to two
     statements, so it must not be used as an unbraced loop or if body.
 */
 #define xor_4k(i,ap,t,r) gf_mulx8(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)]])
 /*  a = a * (the key value t was built from).  The product is built in
     r, which is used as working space, and is then copied to a.
 */
 #if defined( UNROLL_LOOPS )
 void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     memset(r, 0, GF_BYTE_LEN);
     xor_4k(15, bytes, t, r);
     xor_4k(14, bytes, t, r);
     xor_4k(13, bytes, t, r);
     xor_4k(12, bytes, t, r);
     xor_4k(11, bytes, t, r);
     xor_4k(10, bytes, t, r);
     xor_4k( 9, bytes, t, r);
     xor_4k( 8, bytes, t, r);
     xor_4k( 7, bytes, t, r);
     xor_4k( 6, bytes, t, r);
     xor_4k( 5, bytes, t, r);
     xor_4k( 4, bytes, t, r);
     xor_4k( 3, bytes, t, r);
     xor_4k( 2, bytes, t, r);
     xor_4k( 1, bytes, t, r);
     xor_4k( 0, bytes, t, r);
     copy_block_aligned(a, r);
 }
 #else
 void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     int idx = GF_BYTE_LEN;
     memset(r, 0, GF_BYTE_LEN);
     /* idx takes the values 15 down to 0 inside the loop body */
     while(idx-- > 0)
     {
         xor_4k(idx, bytes, t, r);
     }
     copy_block_aligned(a, r);
 }
 #endif
 #endif
 #if defined( TABLES_256 )
 /*  This version uses 256 bytes of table space on the stack.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(128).  If we consider a GF(128) value in a
    single 4-bit nibble, we can construct a table of the 16
    16 byte  values that result from the 16 values of this
    byte.  This requires 256 bytes. If we take the highest
    4-bit nibble in the buffer and use this table to get the
    result, we then have to multiply by x^124 to get the
    final value. For the next highest nibble the result has to
    be multiplied by x^120 and so on. But we can do this by
    accumulating the result in an accumulator starting with
    the result for the top nibble.  We repeatedly multiply
    the accumulator value by x^4 and then add in (i.e. xor)
    the 16 bytes of the next lower nibble in the buffer,
    stopping when we reach the lowest nibble. This uses a
    256 byte table.
 */
 /*  Build the 16 entry table for the key value g: t[v] is g multiplied
     by the field value held in the single 4-bit nibble v.
 */
 void init_256_table(const gf_t g, gf_t256_t t)
 {
     int hi, lo;
     /* the entry for the zero field element */
     memset(t[0], 0, GF_BYTE_LEN);
 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
     /* g goes to t[1]; each doubling of the index multiplies by x */
     memcpy(t[1], g, GF_BYTE_LEN);
     for(hi = 2; hi <= 8; hi += hi)
         gf_mulx1(mode)(t[hi], t[hi >> 1]);
 #else
     /* g goes to t[8]; each halving of the index multiplies by x */
     memcpy(t[8], g, GF_BYTE_LEN);
     for(hi = 8; hi > 1; hi >>= 1)
         gf_mulx1(mode)(t[hi >> 1], t[hi]);
 #endif
     /* the remaining entries are xors of entries already present */
     for(hi = 2; hi < 16; hi += hi)
         for(lo = 1; lo < hi; ++lo)
             xor_block_aligned(t[hi + lo], t[hi], t[lo]);
 }
 /*  x_lo and x_hi multiply the accumulator r by x^4 and then xor in the
     table entry chosen by the low or the high four bits of byte i (in
     field order) of ap.  xor_256 applies both to one byte, taking first
     the nibble that holds the higher powers of x in the representation
     in use.  All three expand to more than one statement, so they must
     not be used as an unbraced loop or if body.
 */
 #define x_lo(i,ap,t,r) gf_mulx4(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)] & 0x0f])
 #define x_hi(i,ap,t,r) gf_mulx4(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)] >> 4])
 #if defined( GF_MODE_LL ) || defined( GF_MODE_BL )
 #define xor_256(a,b,c,d)    x_hi(a,b,c,d);  x_lo(a,b,c,d)
 #else
 #define xor_256(a,b,c,d)    x_lo(a,b,c,d);  x_hi(a,b,c,d)
 #endif
 /*  a = a * (the key value t was built from).  The product is built in
     r, which is used as working space, and is then copied to a.
 */
 #if defined( UNROLL_LOOPS )
 void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     memset(r, 0, GF_BYTE_LEN);
     xor_256(15, bytes, t, r);
     xor_256(14, bytes, t, r);
     xor_256(13, bytes, t, r);
     xor_256(12, bytes, t, r);
     xor_256(11, bytes, t, r);
     xor_256(10, bytes, t, r);
     xor_256( 9, bytes, t, r);
     xor_256( 8, bytes, t, r);
     xor_256( 7, bytes, t, r);
     xor_256( 6, bytes, t, r);
     xor_256( 5, bytes, t, r);
     xor_256( 4, bytes, t, r);
     xor_256( 3, bytes, t, r);
     xor_256( 2, bytes, t, r);
     xor_256( 1, bytes, t, r);
     xor_256( 0, bytes, t, r);
     copy_block_aligned(a, r);
 }
 #else
 void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r)
 {
     uint8_t *bytes = (uint8_t*)a;
     int idx = GF_BYTE_LEN;
     memset(r, 0, GF_BYTE_LEN);
     /* idx takes the values 15 down to 0 inside the loop body */
     while(idx-- > 0)
     {
         xor_256(idx, bytes, t, r);
     }
     copy_block_aligned(a, r);
 }
 #endif
 #endif
--- a/crypto/aes/gf128mul.h
+++ b/crypto/aes/gf128mul.h
@ -0,0 +1,215 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 11/01/2011
 I am grateful for the work done by Mark Rodenkirch and Jason Papadopoulos
 in helping to remove a bug in the operation of this code on big endian
 systems when fast buffer operations are enabled.
 ---------------------------------------------------------------------------
 An implementation of field multiplication in the Galois Field GF(2^128)
 A polynomial representation is used for the field with the coefficients
 held in bit sequences in which the bit numbers are the powers of x that
 a bit represents. The field polynomial used is (x^128+x^7+x^2+x+1).
 The obvious way of representing field elements in a computer system is 
 to map 'x' in the field to the binary integer '2'. But this was way too
 obvious for cryptographers!
 Here bytes are numbered in their memory order and bits within bytes are
 numbered according to their integer numeric significance (that is as is 
 now normal with bit 0 representing unity). The term 'little endian' 
 will then be used to describe mappings where numeric (power of 2) or field 
 (power of x) significance increases with increasing bit or byte numbers 
 with 'big endian' being used to describe the inverse situation.  
 The GF bit sequence can then be mapped onto 8-bit bytes in computer 
 memory in one of four simple ways:
     A mapping in which x maps to the integer 2 in little endian 
     form for both bytes and bits within bytes:
         LL: bit for x^n ==> bit for 2^(n % 8) in byte[n / 8]
     A mapping in which x maps to the integer 2 in big endian form 
     for both bytes and bits within bytes:
         BL: bit for x^n ==> bit for 2^(n % 8) in byte[15 - n / 8]
     A little endian mapping for bytes but with the bits within 
     bytes in reverse order (big endian bits):
         LB: bit for x^n ==> bit for 2^(7 - n % 8) in byte[n / 8]
     A big endian mapping for bytes but with the bits within 
     bytes in reverse order (big endian bits):
         BB: bit for x^n ==> bit for 2^(7 - n % 8) in byte[15 - n / 8]
 128-bit field elements are represented by 16 byte buffers but for
 processing efficiency reasons it is often desirable to process arrays 
 of bytes using longer types such as, for example, unsigned long values. 
 The type used for representing these buffers will be called a 'gf_unit' 
 and the buffer itself will be referred to as a 'gf_t' type.
 The field multiplier is based on the assumption that one of the two
 field elements involved in multiplication will change only relatively
 infrequently, making it worthwhile to precompute tables to speed up
 multiplication by this value. 
 */
 #ifndef _GF128MUL_H
 #define _GF128MUL_H
 #include <stdlib.h>
 #include <string.h>
 #include "brg_endian.h"
 /* USER DEFINABLE OPTIONS */
 /*  UNIT_BITS sets the size of variables used to process 16 byte buffers
    when the buffer alignment allows this.  When buffers are processed
    in bytes, 16 individual operations are involved.  But if, say, such 
    a buffer is divided into 4 32 bit variables, it can then be processed 
    in 4 operations, making the code typically much faster. In general
    it will pay to use the longest natively supported size, which will
    probably be 32 or 64 bits in 32 and 64 bit systems respectively.
 */
 /*  Any UNIT_BITS value defined before this point is discarded, so the
     selection made below always applies: 64 on big endian builds (the
     '#if 0 / #elif 0 / #elif 1' ladder chooses between 8, 32 and 64),
     64 when _WIN64 is defined, and 32 otherwise.
 */
 #if defined( UNIT_BITS )
 # undef UNIT_BITS
 #endif
 #if !defined( UNIT_BITS )
 #  if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
 #    if 0
 #      define UNIT_BITS   8
 #    elif 0
 #      define UNIT_BITS  32
 #    elif 1
 #      define UNIT_BITS  64
 #    endif
 #  elif defined( _WIN64 )
 #    define UNIT_BITS 64
 #  else
 #    define UNIT_BITS 32
 #  endif
 #endif
 /*  64-bit units need a 64-bit integer type; NEED_UINT_64T is presumably
     the request for one that brg_types.h acts on (confirm there).
 */
 #if UNIT_BITS == 64 && !defined( NEED_UINT_64T )
 #  define NEED_UINT_64T
 #endif
 #include "brg_types.h"
 /* Choose the Galois Field representation to use (see above) */
 /*  Exactly one branch of this ladder should be enabled with '1'; the
     first enabled branch wins and the #error fires if none is enabled.
 */
 #if 0
 #  define GF_MODE_LL
 #elif 0
 #  define GF_MODE_BL
 #elif 1
 #  define GF_MODE_LB    /* the representation used by GCM */
 #elif 0
 #  define GF_MODE_BB
 #else
 #  error mode is not defined
 #endif
 /*  Table sizes for GF(128) Multiply.  Normally larger tables give 
    higher speed but cache loading might change this. Normally only 
    one table size (or none at all) will be specified here
 */
 /*  Each switch below enables the init_*_table / gf_mul_* pair of that
     size in gf128mul.c; as configured here only the 4k table is built.
 */
 #if 0
 #  define TABLES_64K
 #endif
 #if 0
 #  define TABLES_8K
 #endif
 #if 1
 #  define TABLES_4K
 #endif
 #if 0
 #  define TABLES_256
 #endif
 /* END OF USER DEFINABLE OPTIONS */
 /* NO_TABLES records that none of the table sizes was selected */
 #if !(defined( TABLES_64K ) || defined( TABLES_8K ) \
    || defined( TABLES_4K ) || defined( TABLES_256 ))
 #  define NO_TABLES
 #endif
 #if defined(__cplusplus)
 extern "C"
 {
 #endif
 /* the size of a field element in bytes and in units of UNIT_BITS bits */
 #define GF_BYTE_LEN 16
 #define GF_UNIT_LEN (GF_BYTE_LEN / (UNIT_BITS >> 3))
 /*  gf_unit_t is the unit type and gf_t the 16 byte field element buffer
     built from it (see the description at the top of this file); both
     come from the typedef macros, presumably supplied by brg_types.h.
 */
 UNIT_TYPEDEF(gf_unit_t, UNIT_BITS);
 BUFR_TYPEDEF(gf_t, UNIT_BITS, GF_BYTE_LEN);
 /*  Code for conversion between the four different galois field representations 
    is optionally available using gf_convert.c
 */
 typedef enum { REVERSE_NONE = 0, REVERSE_BITS = 1, REVERSE_BYTES = 2 } transform;
 void convert_representation(gf_t dest, const gf_t source, transform rev);
 /* a = a * b without any precomputed table */
 void gf_mul(gf_t a, const gf_t b);      /* slow field multiply  */
 /*  For each table size below:
       init_*_table(g, t)  fills the table t for the key value g;
       gf_mul_*(a, t, r)   replaces a by a * g using that table, with
                           r as a caller supplied 16 byte working
                           buffer that also ends up holding the product.
     The *_a types are the table arrays themselves; the *_t types are
     the pointer forms taken by the functions.
 */
 /* types and calls for 64k table driven field multiplier        */
 typedef gf_t    gf_t64k_a[16][256];
 typedef gf_t    (*gf_t64k_t)[256];
 void init_64k_table(const gf_t g, gf_t64k_t t);
 /* r is declared as gf_t so that this prototype matches the definition */
 void gf_mul_64k(gf_t a, const gf_t64k_t t, gf_t r);
 /* types and calls for 8k table driven field multiplier         */
 typedef gf_t    gf_t8k_a[32][16];
 typedef gf_t    (*gf_t8k_t)[16];
 void init_8k_table(const gf_t g, gf_t8k_t t);
 void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r);
 /* types and calls for 4k table driven field multiplier         */
 typedef gf_t    gf_t4k_a[256];
 typedef gf_t    (*gf_t4k_t);
 void init_4k_table(const gf_t g, gf_t4k_t t);
 void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r);
 /* types and calls for 256 byte table driven field multiplier   */
 typedef gf_t    gf_t256_a[16];
 typedef gf_t    (*gf_t256_t);
 void init_256_table(const gf_t g, gf_t256_t t);
 void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r);
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/crypto/aes/gf_mul_lo.h
+++ b/crypto/aes/gf_mul_lo.h
@ -0,0 +1,773 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 18/02/2014
 This file provides the low level primitives needed for Galois Field 
 operations in GF(2^128) for the four most likely field representations.
 */
 #ifndef _GF_MUL_LO_H
 #define _GF_MUL_LO_H
 /*  gf_decl is the declaration prefix for every primitive defined in this
     header.  With USE_INLINING it asks for inlining in the way the
     compiler in use supports.  Without USE_INLINING it previously stayed
     undefined, which made every 'gf_decl void ...' definition below a
     syntax error; it now falls back to plain 'static' unless the
     including file has already supplied its own definition.
 */
 #if defined( USE_INLINING )
 #  if defined( _MSC_VER )
 #    define gf_decl __inline
 #  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
 #    define gf_decl static inline
 #  else
 #    define gf_decl static
 #  endif
 #elif !defined( gf_decl )
 #  define gf_decl static
 #endif
 /*  Disabled test helpers: when enabled, t1(n) expands to a call of
     bswap<n>_block on x, and t2(n) to the same call on x followed by
     one on r.  The two-level definition makes n expand (for example
     from UNIT_BITS) before it is pasted into the name.
 */
 #if 0   /* used for testing only: t1(UNIT_BITS), t2(UNIT_BITS)  */
 #  define _t1(n) bswap ## n ## _block(x, x)
 #  define  t1(n) _t1(n)
 #  define _t2(n) bswap ## n ## _block(x, x); bswap ## n ## _block(r, r) 
 #  define  t2(n) _t2(n)
 #endif
 /*  gf_mulx1(s), gf_mulx4(s) and gf_mulx8(s) name the primitive that
     multiplies by x, x^4 or x^8 in the representation with suffix s;
     for example gf_mulx8(_ll) is gf_mulx8_ll.  Going through gf_m lets
     a macro argument such as 'mode' expand before the pasting is done.
 */
 #define gf_m(n,x)    gf_mulx ## n ## x
 #define gf_mulx1(x)  gf_m(1,x)
 #define gf_mulx4(x)  gf_m(4,x)
 #define gf_mulx8(x)  gf_m(8,x)
 /*  MASK(x) repeats the byte value x in every byte of a unit, assuming
     UNIT_CAST(-1,UNIT_BITS) gives a unit with all bits set (to confirm
     against brg_types.h).
 */
 #define MASK(x) ((x) * (UNIT_CAST(-1,UNIT_BITS) / 0xff))
 /*  DATA_256(q) expands to a braced initialiser list of 256 items: the
     macro q applied to each byte value from 0x00 to 0xff in ascending
     order.
 */
 #define DATA_256(q) {\
    q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
    q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
    q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
    q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
    q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
    q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
    q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
    q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
    q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
    q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
    q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
    q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
    q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
    q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
    q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
    q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
    q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
    q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
    q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
    q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
    q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
    q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
    q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
    q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
    q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
    q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
    q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
    q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
    q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
    q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
    q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
    q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) }
 /*  Within the 16 bytes of the field element the top and bottom field bits
    are within bytes as follows (bit numbers in bytes 0 from ls up) for
    each of the four field representations supported (see gf128mul.txt):
    GF_BIT   127 126 125 124 123 122 121 120     .....  7 6 5 4 3 2 1 0
                                                  0x87  1 0 0 0 0 1 1 1
    BL x[ 0]   7   6   5   4   3   2   1   0     x[15]  7 6 5 4 3 2 1 0
    LL x[15]   7   6   5   4   3   2   1   0     x[ 0]  7 6 5 4 3 2 1 0
    GF_BIT   120 121 122 123 124 125 126 127     .....  0 1 2 3 4 5 6 7
                                                  0xe1  1 1 1 0 0 0 0 1
    BB x[ 0]   7   6   5   4   3   2   1   0     x[15]  7 6 5 4 3 2 1 0
    LB x[15]   7   6   5   4   3   2   1   0     x[ 0]  7 6 5 4 3 2 1 0
    When the field element is multiplied by x^n, the high bits overflow
    and are used to form an overflow byte. For the BL and LL modes this
    byte has the lowest overflow bit in bit 0 whereas for the BB and LB
    modes this bit is in bit 7.  So we have for this byte:
    bit (bit n = 2^n)    7   6   5   4   3   2   1   0
    BL and LL          x^7 x^6 x^5 x^4 x^3 x^2 x^1 x^0  
    BB and LB          x^0 x^1 x^2 x^3 x^4 x^5 x^6 x^7  
    This byte then has to be multiplied by the low bits of the field
    polynomial, which produces a value of 16 bits to be xored into the 
    left shifted field value. For the BL and LL modes bit 0 gives the
    word value 0x0087, bit 1 gives 0x010e (0x87 left shifted 1), 0x021c
    (0x87 left shifted 2), ... For the BB and LB modes, bit 7 gives the
    value 0x00e1, bit 6 gives 0x8070, bit 5 gives 0x4038, ... Each bit
    in the overflow byte is expanded in this way and is xored into the
    overall result, so each of the 256 byte values will produce a
    corresponding word value that is computed by the gf_uint16_xor(i)
    macros below.
    These word values have to be xored into the low 16 bits of the 
    field value. If the byte endianess of the mode matches that of
    the architecture xoring the word value will be correct. But if
    the mode has the opposite endianess, the word value has to be
    xored in byte reversed order. This is done by the ord() macro.
 */
 /*  ord(hi, lo) builds a 16-bit hexadecimal constant from two byte
     values written as two hex digits each.  When the byte order of the
     field representation matches that of the platform the result is
     0x<hi><lo>; otherwise the two bytes are swapped so that the value
     is still correct once it is xored in as part of a wider unit.
 */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN \
      && (defined( GF_MODE_LB ) || defined( GF_MODE_LL )) || \
    PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN \
      && (defined( GF_MODE_BB ) || defined( GF_MODE_BL ))
 #  define ord(hi, lo)   0x##hi##lo
 #else 
 #  define ord(hi, lo)   0x##lo##hi
 #endif
 /*  gf_uint16_xor(i) is the 16-bit reduction value for the overflow byte
     i: the xor, over the bits set in i, of the low field polynomial
     bits shifted to that bit's position.  The x_bit constants X_0 ..
     X_7 are the masks, within a byte, of the bits for x^0 .. x^7 in the
     representation in use.
 */
 #if defined( GF_MODE_BL ) || defined( GF_MODE_LL )
 /* field and numeric bit significance correspond */
 /* each constant is 0x87 shifted left by the number of the bit tested */
 #define gf_uint16_xor(i) ( \
    (i & 0x01 ? ord(00,87) : 0) ^ (i & 0x02 ? ord(01,0e) : 0) ^ \
    (i & 0x04 ? ord(02,1c) : 0) ^ (i & 0x08 ? ord(04,38) : 0) ^ \
    (i & 0x10 ? ord(08,70) : 0) ^ (i & 0x20 ? ord(10,e0) : 0) ^ \
    (i & 0x40 ? ord(21,c0) : 0) ^ (i & 0x80 ? ord(43,80) : 0) )
 enum x_bit 
 { 
    X_0 = 0x01, X_1 = 0x02, X_2 = 0x04, X_3 = 0x08, 
    X_4 = 0x10, X_5 = 0x20, X_6 = 0x40, X_7 = 0x80
 };
 #elif defined( GF_MODE_BB ) || defined( GF_MODE_LB )
 /* field and numeric bit significance are in reverse */
 /* bit 7 gives 0xe1; each lower bit gives the previous value moved on by
    one further bit position in the bit-reversed byte layout             */
 #define gf_uint16_xor(i) ( \
    (i & 0x80 ? ord(00,e1) : 0) ^ (i & 0x40 ? ord(80,70) : 0) ^ \
    (i & 0x20 ? ord(40,38) : 0) ^ (i & 0x10 ? ord(20,1c) : 0) ^ \
    (i & 0x08 ? ord(10,0e) : 0) ^ (i & 0x04 ? ord(08,07) : 0) ^ \
    (i & 0x02 ? ord(84,03) : 0) ^ (i & 0x01 ? ord(c2,01) : 0) )
 enum x_bit 
 { 
    X_0 = 0x80, X_1 = 0x40, X_2 = 0x20, X_3 = 0x10, 
    X_4 = 0x08, X_5 = 0x04, X_6 = 0x02, X_7 = 0x01
 };
 #else
 #error Galois Field representation has not been set
 #endif
 /*  Reduction table: gf_tab[b] is gf_uint16_xor(b), the 16-bit value to
     xor into the low end of a field element for the overflow byte b.
     NOTE(review): this is a definition with external linkage in a header,
     so including the header from more than one translation unit would
     define the table more than once -- confirm it is included only once.
 */
 const uint16_t gf_tab[256] = DATA_256(gf_uint16_xor);
 /* LL Mode Galois Field operations 
  x[0]     x[1]     x[2]     x[3]     x[4]     x[5]     x[6]    x[7]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 10000111 ........ ........ ........ ........ ........ ........ ........
 07....00 15....08 23....16 31....24 39....32 47....40 55....48 63....56
  x[8]    x[9]   x[10]   x[11]   x[12]   x[13]   x[14]  x[15]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 ........ ........ ........ ........ ........ ........ ........ M.......
 71....64 79....72 87....80 95....88 103...96 111..104 119..112 127..120
 */
 #if UNIT_BITS == 64
 /*  f1_ll, f4_ll and f8_ll set unit n of r to unit n of x moved up by 1,
     4 or 8 bit positions in field order, taking the bits that move in
     from unit n - 1 (nothing moves in for n == 0).  On little endian
     platforms this is a plain shift of the 64-bit unit.  On big endian
     platforms the bytes of a unit are in the opposite order, so the 1
     and 4 bit cases shift within each byte (using MASK to separate the
     parts) and move the carries to the adjacent byte, while the 8 bit
     case moves whole bytes.
 */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_ll(n,r,x)   r[n] = (x[n] << 1) | (n ? x[n-1] >> 63 : 0)
 #define f4_ll(n,r,x)   r[n] = (x[n] << 4) | (n ? x[n-1] >> 60 : 0)
 #define f8_ll(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 56 : 0)
 #else
 #define f1_ll(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
                            | (n ? x[n-1] << 49 : 0)) & MASK(0x01))
 #define f4_ll(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
                            | (n ? x[n-1] << 52 : 0)) & MASK(0x0f))
 #define f8_ll(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 56 : 0)
 #endif
 /*  r = x * x^1 in the LL representation, using 64-bit units.  The top
     bit of byte 15 is the one bit that overflows; it selects the
     reduction value that is folded into the low end of the result.
 */
 gf_decl void gf_mulx1_ll(gf_t r, const gf_t x)
 {
     /* fetch the reduction value before anything is written to r */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[1] >> 63) & 0x01];
 #else
     const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] >> 7) & 0x01])) << 48;
 #endif
     rep2_d2(f1_ll, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[0] ^= fold;
 }
 /*  x = x * x^4, in place, in the LL representation using 64-bit units.
     The top four bits of byte 15 overflow and select the reduction
     value that is folded into the low end of the result.
 */
 gf_decl void gf_mulx4_ll(gf_t x)
 {
     /* the overflow bits must be read before x is shifted in place */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[1] >> 60) & 0x0f];
 #else
     const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] >> 4) & 0x0f])) << 48;
 #endif
     rep2_d2(f4_ll, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= fold;
 }
 /*  x = x * x^8, in place, in the LL representation using 64-bit units.
     Byte 15 overflows as a whole and selects the reduction value that
     is folded into the low end of the result.
 */
 gf_decl void gf_mulx8_ll(gf_t x)
 {
     /* the overflow byte must be read before x is shifted in place */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t fold = gf_tab[UNIT_PTR(x)[1] >> 56];
 #else
     const gf_unit_t fold = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[1] & 0xff])) << 48;
 #endif
     rep2_d2(f8_ll, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= fold;
 }
 #elif UNIT_BITS == 32
 /*  The 32-bit unit forms of f1_ll, f4_ll and f8_ll: unit n of r becomes
     unit n of x moved up by 1, 4 or 8 bit positions in field order,
     with the bits that move in taken from unit n - 1 (nothing moves in
     for n == 0).  As in the 64-bit forms, big endian platforms shift
     within each byte for the 1 and 4 bit cases and move whole bytes for
     the 8 bit case.
 */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_ll(n,r,x)   r[n] = (x[n] << 1) | (n ? x[n-1] >> 31 : 0)
 #define f4_ll(n,r,x)   r[n] = (x[n] << 4) | (n ? x[n-1] >> 28 : 0)
 #define f8_ll(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 24 : 0)
 #else
 #define f1_ll(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
                            | (n ? x[n-1] << 17 : 0)) & MASK(0x01))
 #define f4_ll(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
                            | (n ? x[n-1] << 20 : 0)) & MASK(0x0f))
 #define f8_ll(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 24 : 0)
 #endif
 /*  r = x * x^1 in the LL representation, using 32-bit units.  The top
     bit of byte 15 is the one bit that overflows; it selects the
     reduction value that is folded into the low end of the result.
 */
 gf_decl void gf_mulx1_ll(gf_t r, const gf_t x)
 {
     /* fetch the reduction value before anything is written to r */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t fold = gf_tab[(UNIT_PTR(x)[3] >> 31) & 0x01];
 #else
     const gf_unit_t fold = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] >> 7) & 0x01])) << 16;
 #endif
     rep2_d4(f1_ll, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[0] ^= fold;
 }
 /* LL mode, 32-bit units, in place: the gf_tab index is a 4-bit field of x[3]
    read before the units are rewritten by f4_ll; the resulting term is
    XORed into x[0]. */
 gf_decl void gf_mulx4_ll(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[3] >> 28) & 0x0f];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] >> 4) & 0x0f])) << 16;
 #endif
     rep2_d4(f4_ll, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= term;
 }
 /* LL mode, 32-bit units, in place: the gf_tab index is an 8-bit field of
    x[3] read before the units are rewritten by f8_ll; the resulting term is
    XORed into x[0]. */
 gf_decl void gf_mulx8_ll(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[UNIT_PTR(x)[3] >> 24];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[3] & 0xff])) << 16;
 #endif
     rep2_d4(f8_ll, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= term;
 }
 #else
 /* LL mode, byte units: fK_ll(n,r,x) sets r[n] to x[n] moved left by K bits
    with the top K bits of x[n-1] ORed in (nothing is ORed in for n == 0).
    No f8 form is defined for byte units; gf_mulx8_ll below moves whole bytes
    with memmove instead. */
 #define f1_ll(n,r,x)   r[n] = (x[n] << 1) | (n ? x[n-1] >> 7 : 0)
 #define f4_ll(n,r,x)   r[n] = (x[n] << 4) | (n ? x[n-1] >> 4 : 0)
 /* LL mode, byte units: applies f1_ll to bytes 15 down to 0 of x, storing into
    r, then XORs one byte of a 16-bit gf_tab entry into r[0].  The entry is
    selected by the top bit of x[15] and fetched before any byte is written;
    which half of the entry is used depends on the platform byte order. */
 gf_decl void gf_mulx1_ll(gf_t r, const gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[15] >> 7) & 0x01];
     rep2_d16(f1_ll, UNIT_PTR(r), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(r)[0] ^= term & 0xff;
 #else
     UNIT_PTR(r)[0] ^= term >> 8;
 #endif
 }
 /* LL mode, byte units, in place: the 16-bit gf_tab entry indexed by the upper
    nibble of x[15] is fetched before the bytes are rewritten by f4_ll, and
    its two bytes are then folded into x[0] and x[1]; which table byte lands
    in which position depends on the platform byte order.
    Both bytes are XORed in, never assigned: after f4_ll, x[0] holds its own
    shifted data bits, which an assignment would discard. */
 gf_decl void gf_mulx4_ll(gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[15] >> 4) & 0x0f];
     rep2_d16(f4_ll, UNIT_PTR(x), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[1] ^= term >> 8;
     UNIT_PTR(x)[0] ^= term & 0xff;
 #else
     UNIT_PTR(x)[1] ^= term & 0xff;
     UNIT_PTR(x)[0] ^= term >> 8;
 #endif
 }
 /* LL mode, byte units, in place: bytes 0..14 are moved up one position with
    memmove and the 16-bit gf_tab entry indexed by the old x[15] is folded in.
    x[0] keeps its stale value after the move, so it is overwritten with one
    table byte while the other table byte is XORed into x[1]. */
 gf_decl void gf_mulx8_ll(gf_t x)
 {
     const uint16_t term = gf_tab[UNIT_PTR(x)[15]];
     memmove(UNIT_PTR(x) + 1, UNIT_PTR(x), 15);
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[1] ^= term >> 8;
     UNIT_PTR(x)[0]  = term & 0xff;
 #else
     UNIT_PTR(x)[1] ^= term & 0xff;
     UNIT_PTR(x)[0]  = term >> 8;
 #endif
 }
 #endif
 /* BL Mode Galois Field operations 
  x[0]     x[1]     x[2]     x[3]     x[4]     x[5]     x[6]     x[7]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 M....... ........ ........ ........ ........ ........ ........ ........
 127..120 119..112 111..104 103...96 95....88 87....80 79....72 71....64
  x[8]     x[9]    x[10]    x[11]    x[12]    x[13]    x[14]    x[15]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 ........ ........ ........ ........ ........ ........ ........ 10000111
 63....56 55....48 47....40 39....32 31....24 23....16 15....08 07....00
 */
 #if UNIT_BITS == 64
 /* BL mode, 64-bit units.  fK_bl(n,r,x) computes r[n] from x[n] and, for
    n == 0 only, from x[1]; unit 1 takes no bits from a neighbouring unit.
    Little-endian forms recombine the bits per byte with MASK(), which is
    defined outside this section (presumably a byte pattern replicated across
    the unit - confirm against its definition). */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_bl(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
                            | (!n ? x[n+1] << 49 : 0)) & MASK(0x01))
 #define f4_bl(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
                            | (!n ? x[n+1] << 52 : 0)) & MASK(0x0f))
 #define f8_bl(n,r,x)   r[n] = (x[n] >> 8) | (!n ? x[n+1] << 56 : 0)
 #else
 /* Big-endian forms: x[n] moves left by K bits and the top K bits of x[n+1]
    are ORed into the bottom. */
 #define f1_bl(n,r,x)   r[n] = (x[n] << 1) | (!n ? x[n+1] >> 63 : 0)
 #define f4_bl(n,r,x)   r[n] = (x[n] << 4) | (!n ? x[n+1] >> 60 : 0)
 #define f8_bl(n,r,x)   r[n] = (x[n] << 8) | (!n ? x[n+1] >> 56 : 0)
 #endif
 /* BL mode, 64-bit units: applies f1_bl to both units of x (unit 0 first),
    storing into r, then XORs a gf_tab-derived term into r[1].  The term is
    selected by a single bit of x[0] and is fetched before any unit is
    written. */
 gf_decl void gf_mulx1_bl(gf_t r, const gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01])) << 48;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 63) & 0x01];
 #endif
     rep2_u2(f1_bl, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[1] ^= term;
 }
 /* BL mode, 64-bit units, in place: the gf_tab index is a 4-bit field of x[0]
    read before the units are rewritten by f4_bl; the resulting term is
    XORed into x[1]. */
 gf_decl void gf_mulx4_bl(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f])) << 48;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 60) & 0x0f];
 #endif
     rep2_u2(f4_bl, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[1] ^= term;
 }
 /* BL mode, 64-bit units, in place: the gf_tab index is an 8-bit field of
    x[0] read before the units are rewritten by f8_bl; the resulting term is
    XORed into x[1]. */
 gf_decl void gf_mulx8_bl(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 48;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 56) & 0xff];
 #endif
     rep2_u2(f8_bl, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[1] ^= term;
 }
 #elif UNIT_BITS == 32
 /* BL mode, 32-bit units.  fK_bl(n,r,x) computes r[n] from x[n] and, when
    n < 3, from x[n+1]; unit 3 takes no bits from a neighbouring unit.
    Little-endian forms recombine the bits per byte with MASK(), which is
    defined outside this section (presumably a byte pattern replicated across
    the unit - confirm against its definition). */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_bl(n,r,x)   r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \
                            | (n < 3 ? x[n+1] << 17 : 0)) & MASK(0x01))
 #define f4_bl(n,r,x)   r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \
                            | (n < 3 ? x[n+1] << 20 : 0)) & MASK(0x0f))
 #define f8_bl(n,r,x)   r[n] = (x[n] >> 8) | (n < 3 ? x[n+1] << 24 : 0)
 #else
 /* Big-endian forms: x[n] moves left by K bits and the top K bits of x[n+1]
    are ORed into the bottom. */
 #define f1_bl(n,r,x)   r[n] = (x[n] << 1) | (n < 3 ? x[n+1] >> 31 : 0)
 #define f4_bl(n,r,x)   r[n] = (x[n] << 4) | (n < 3 ? x[n+1] >> 28 : 0)
 #define f8_bl(n,r,x)   r[n] = (x[n] << 8) | (n < 3 ? x[n+1] >> 24 : 0)
 #endif
 /* BL mode, 32-bit units: applies f1_bl to the four units of x (unit 0 up to
    unit 3), storing into r, then XORs a gf_tab-derived term into r[3].  The
    term is selected by a single bit of x[0] and is fetched before any unit
    is written. */
 gf_decl void gf_mulx1_bl(gf_t r, const gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01])) << 16;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 31) & 0x01];
 #endif
     rep2_u4(f1_bl, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[3] ^= term;
 }
 /* BL mode, 32-bit units, in place: the gf_tab index is a 4-bit field of x[0]
    read before the units are rewritten by f4_bl; the resulting term is
    XORed into x[3]. */
 gf_decl void gf_mulx4_bl(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f])) << 16;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 28) & 0x0f];
 #endif
     rep2_u4(f4_bl, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[3] ^= term;
 }
 /* BL mode, 32-bit units, in place: the gf_tab index is an 8-bit field of
    x[0] read before the units are rewritten by f8_bl; the resulting term is
    XORed into x[3]. */
 gf_decl void gf_mulx8_bl(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 16;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 24) & 0xff];
 #endif
     rep2_u4(f8_bl, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[3] ^= term;
 }
 #else
 /* BL mode, byte units: fK_bl(n,r,x) sets r[n] to x[n] moved left by K bits
    with the top K bits of x[n+1] ORed in (nothing is ORed in for n == 15).
    No f8 form is defined for byte units; gf_mulx8_bl below moves whole bytes
    with memmove instead. */
 #define f1_bl(n,r,x)   r[n] = (x[n] << 1) | (n < 15 ? x[n+1] >> 7 : 0)
 #define f4_bl(n,r,x)   r[n] = (x[n] << 4) | (n < 15 ? x[n+1] >> 4 : 0)
 /* BL mode, byte units: applies f1_bl to bytes 0 up to 15 of x, storing into
    r, then XORs one byte of a 16-bit gf_tab entry into r[15].  The entry is
    selected by the top bit of x[0] and fetched before any byte is written;
    which half of the entry is used depends on the platform byte order. */
 gf_decl void gf_mulx1_bl(gf_t r, const gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01];
     rep2_u16(f1_bl, UNIT_PTR(r), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(r)[15] ^= term >> 8;
 #else
     UNIT_PTR(r)[15] ^= term & 0xff;
 #endif
 }
 /* BL mode, byte units, in place: the 16-bit gf_tab entry indexed by the upper
    nibble of x[0] is fetched before the bytes are rewritten by f4_bl, and
    its two bytes are then folded into x[14] and x[15]; which table byte lands
    in which position depends on the platform byte order.
    Both bytes are XORed in, never assigned: after f4_bl, x[15] holds its own
    shifted data bits, which an assignment would discard. */
 gf_decl void gf_mulx4_bl(gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f];
     rep2_u16(f4_bl, UNIT_PTR(x), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[14] ^= term & 0xff;
     UNIT_PTR(x)[15] ^= term >> 8;
 #else
     UNIT_PTR(x)[14] ^= term >> 8;
     UNIT_PTR(x)[15] ^= term & 0xff;
 #endif
 }
 /* BL mode, byte units, in place: bytes 1..15 are moved down one position with
    memmove and the 16-bit gf_tab entry indexed by the old x[0] is folded in.
    x[15] keeps its stale value after the move, so it is overwritten with one
    table byte while the other table byte is XORed into x[14]. */
 gf_decl void gf_mulx8_bl(gf_t x)
 {
     const uint16_t term = gf_tab[UNIT_PTR(x)[0]];
     memmove(UNIT_PTR(x), UNIT_PTR(x) + 1, 15);
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[14] ^= term & 0xff;
     UNIT_PTR(x)[15]  = term >> 8;
 #else
     UNIT_PTR(x)[14] ^= term >> 8;
     UNIT_PTR(x)[15]  = term & 0xff;
 #endif
 }
 #endif
 /* LB Mode Galois Field operations 
   x[0]    x[1]     x[2]     x[3]     x[4]     x[5]     x[6]     x[7]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 11100001 ........ ........ ........ ........ ........ ........ ........
 00....07 08....15 16....23 24....31 32....39 40....47 48....55 56....63
   x[8]    x[9]    x[10]    x[11]    x[12]    x[13]    x[14]    x[15]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 ........ ........ ........ ........ ........ ........ ........ .......M
 64....71 72....79 80....87 88....95 96...103 104..111 112..119 120..127
 */
 #if UNIT_BITS == 64
 /* LB mode, 64-bit units.  fK_lb(n,r,x) computes r[n] from x[n] and, for
    n == 1 only, from x[0]; unit 0 takes no bits from a neighbouring unit.
    Every form stores into the destination array r, so the macros behave the
    same whether or not r and x name the same array.
    Little-endian forms recombine the bits per byte with MASK(), which is
    defined outside this section (presumably a byte pattern replicated across
    the unit - confirm against its definition). */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_lb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
                            | (n ? x[n-1] >> 49 : 0)) & MASK(0x80))
 #define f4_lb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
                            | (n ? x[n-1] >> 52 : 0)) & MASK(0xf0))
 #define f8_lb(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 56 : 0)
 #else
 /* Big-endian forms: x[n] moves right by K bits and the low K bits of x[n-1]
    are ORed into the top. */
 #define f1_lb(n,r,x)   r[n] = (x[n] >> 1) | (n ? x[n-1] << 63 : 0)
 #define f4_lb(n,r,x)   r[n] = (x[n] >> 4) | (n ? x[n-1] << 60 : 0)
 #define f8_lb(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 56 : 0)
 #endif
 /* LB mode, 64-bit units: applies f1_lb to both units of x (unit 1 first),
    storing into r, then XORs a gf_tab-derived term into r[0].  The table
    index is derived from x[1] and the entry is fetched before any unit is
    written. */
 gf_decl void gf_mulx1_lb(gf_t r, const gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[1] >> 49) & MASK(0x80)];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] << 7) & 0xff])) << 48;
 #endif
     rep2_d2(f1_lb, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[0] ^= term;
 }
 /* LB mode, 64-bit units, in place: the gf_tab index is derived from x[1]
    before the units are rewritten by f4_lb; the resulting term is XORed
    into x[0]. */
 gf_decl void gf_mulx4_lb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[1] >> 52) & MASK(0xf0)];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] << 4) & 0xff])) << 48;
 #endif
     rep2_d2(f4_lb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= term;
 }
 /* LB mode, 64-bit units, in place: the gf_tab index is an 8-bit field of
    x[1] read before the units are rewritten by f8_lb; the resulting term is
    XORed into x[0]. */
 gf_decl void gf_mulx8_lb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[UNIT_PTR(x)[1] >> 56];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[1] & 0xff])) << 48;
 #endif
     rep2_d2(f8_lb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= term;
 }
 #elif UNIT_BITS == 32
 /* LB mode, 32-bit units.  fK_lb(n,r,x) computes r[n] from x[n] and, when
    n > 0, from x[n-1]; unit 0 takes no bits from a neighbouring unit.
    Little-endian forms recombine the bits per byte with MASK(), which is
    defined outside this section (presumably a byte pattern replicated across
    the unit - confirm against its definition). */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_lb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
                            | (n ? x[n-1] >> 17 : 0)) & MASK(0x80))
 #define f4_lb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
                            | (n ? x[n-1] >> 20 : 0)) & MASK(0xf0))
 #define f8_lb(n,r,x)   r[n] = (x[n] << 8) | (n ? x[n-1] >> 24 : 0)
 #else
 /* Big-endian forms: x[n] moves right by K bits and the low K bits of x[n-1]
    are ORed into the top. */
 #define f1_lb(n,r,x)   r[n] = (x[n] >> 1) | (n ? x[n-1] << 31 : 0)
 #define f4_lb(n,r,x)   r[n] = (x[n] >> 4) | (n ? x[n-1] << 28 : 0)
 #define f8_lb(n,r,x)   r[n] = (x[n] >> 8) | (n ? x[n-1] << 24 : 0)
 #endif
 /* LB mode, 32-bit units: applies f1_lb to the four units of x (unit 3 down
    to unit 0), storing into r, then XORs a gf_tab-derived term into r[0].
    The table index is derived from x[3] and the entry is fetched before any
    unit is written. */
 gf_decl void gf_mulx1_lb(gf_t r, const gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[3] >> 17) & MASK(0x80)];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] << 7) & 0xff])) << 16;
 #endif
     rep2_d4(f1_lb, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[0] ^= term;
 }
 /* LB mode, 32-bit units, in place: the gf_tab index is derived from x[3]
    before the units are rewritten by f4_lb; the resulting term is XORed
    into x[0]. */
 gf_decl void gf_mulx4_lb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[3] >> 20) & MASK(0xf0)];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] << 4) & 0xff])) << 16;
 #endif
     rep2_d4(f4_lb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= term;
 }
 /* LB mode, 32-bit units, in place: the gf_tab index is an 8-bit field of
    x[3] read before the units are rewritten by f8_lb; the resulting term is
    XORed into x[0]. */
 gf_decl void gf_mulx8_lb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = gf_tab[UNIT_PTR(x)[3] >> 24];
 #else
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[3] & 0xff])) << 16;
 #endif
     rep2_d4(f8_lb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[0] ^= term;
 }
 #else
 /* LB mode, byte units: fK_lb(n,r,x) sets r[n] to x[n] moved right by K bits
    with the low K bits of x[n-1] ORed into the top (nothing is ORed in for
    n == 0).  No f8 form is defined for byte units; gf_mulx8_lb below moves
    whole bytes with memmove instead. */
 #define f1_lb(n,r,x)   r[n] = (x[n] >> 1) | (n ? x[n-1] << 7 : 0)
 #define f4_lb(n,r,x)   r[n] = (x[n] >> 4) | (n ? x[n-1] << 4 : 0)
 /* LB mode, byte units: applies f1_lb to bytes 15 down to 0 of x, storing into
    r, then XORs part of a 16-bit gf_tab entry into r[0].  The entry is
    selected by the low bit of x[15] (index 0x00 or 0x80) and fetched before
    any byte is written. */
 gf_decl void gf_mulx1_lb(gf_t r, const gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[15] << 7) & 0x80];
     rep2_d16(f1_lb, UNIT_PTR(r), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(r)[0] ^= term;
 #else
     UNIT_PTR(r)[0] ^= term >> 8;
 #endif
 }
 /* LB mode, byte units, in place: the 16-bit gf_tab entry indexed by the low
    nibble of x[15] (moved to the high nibble of the index) is fetched before
    the bytes are rewritten by f4_lb; its two bytes are then XORed into x[0]
    and x[1], with the assignment of bytes depending on platform byte order. */
 gf_decl void gf_mulx4_lb(gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[15] << 4) & 0xf0];
     rep2_d16(f4_lb, UNIT_PTR(x), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[1] ^= term >> 8;
     UNIT_PTR(x)[0] ^= term & 0xff;
 #else
     UNIT_PTR(x)[1] ^= term & 0xff;
     UNIT_PTR(x)[0] ^= term >> 8;
 #endif
 }
 /* LB mode, byte units, in place: bytes 0..14 are moved up one position with
    memmove and the 16-bit gf_tab entry indexed by the old x[15] is folded in.
    x[0] keeps its stale value after the move, so it is overwritten with one
    table byte while the other table byte is XORed into x[1]. */
 gf_decl void gf_mulx8_lb(gf_t x)
 {
     const uint16_t term = gf_tab[UNIT_PTR(x)[15]];
     memmove(UNIT_PTR(x) + 1, UNIT_PTR(x), 15);
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[1] ^= term >> 8;
     UNIT_PTR(x)[0]  = term & 0xff;
 #else
     UNIT_PTR(x)[1] ^= term & 0xff;
     UNIT_PTR(x)[0]  = term >> 8;
 #endif
 }
 #endif
 /* BB Mode Galois Field operations 
  x[0]     x[1]     x[2]     x[3]     x[4]     x[5]     x[6]     x[7]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 .......M ........ ........ ........ ........ ........ ........ ........
 120..127 112..119 104..111 96...103 88....95 80....87 72....79 64....71
  x[8]     x[9]     x[10]    x[11]    x[12]    x[13]    x[14]   x[15]
 ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls ms    ls
 ........ ........ ........ ........ ........ ........ ........ 11100001
 56....63 48....55 40....47 32....39 24....31 16....23 08....15 00....07
 */
 #if UNIT_BITS == 64
 /* BB mode, 64-bit units.  fK_bb(n,r,x) computes r[n] from x[n] and, for
    n == 0 only, from x[1]; unit 1 takes no bits from a neighbouring unit.
    Little-endian forms: x[n] moves right by K bits and the low K bits of
    x[n+1] are ORed into the top. */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_bb(n,r,x)   r[n] = (x[n] >> 1) | (!n ? x[n+1] << 63 : 0)
 #define f4_bb(n,r,x)   r[n] = (x[n] >> 4) | (!n ? x[n+1] << 60 : 0)
 #define f8_bb(n,r,x)   r[n] = (x[n] >> 8) | (!n ? x[n+1] << 56 : 0)
 #else
 /* Big-endian forms recombine the bits per byte with MASK(), which is
    defined outside this section (presumably a byte pattern replicated across
    the unit - confirm against its definition). */
 #define f1_bb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
                            | (!n ? x[n+1] >> 49 : 0)) & MASK(0x80))
 #define f4_bb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
                            | (!n ? x[n+1] >> 52 : 0)) & MASK(0xf0))
 #define f8_bb(n,r,x)   r[n] = (x[n] << 8) | (!n ? x[n+1] >> 56 : 0)
 #endif
 /* BB mode, 64-bit units: applies f1_bb to both units of x (unit 0 first),
    storing into r, then XORs a gf_tab-derived term into r[1].  The table
    index (0x00 or 0x80) is derived from x[0] and the entry is fetched before
    any unit is written. */
 gf_decl void gf_mulx1_bb(gf_t r, const gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80])) << 48;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 49) & 0x80];
 #endif
     rep2_u2(f1_bb, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[1] ^= term;
 }
 /* BB mode, 64-bit units, in place: the gf_tab index (a multiple of 0x10) is
    derived from x[0] before the units are rewritten by f4_bb; the resulting
    term is XORed into x[1]. */
 gf_decl void gf_mulx4_bb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0])) << 48;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 52) & 0xf0];
 #endif
     rep2_u2(f4_bb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[1] ^= term;
 }
 /* BB mode, 64-bit units, in place: the gf_tab index is an 8-bit field of
    x[0] read before the units are rewritten by f8_bb; the resulting term is
    XORed into x[1]. */
 gf_decl void gf_mulx8_bb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 48;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 56) & 0xff];
 #endif
     rep2_u2(f8_bb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[1] ^= term;
 }
 #elif UNIT_BITS == 32
 /* BB mode, 32-bit units.  fK_bb(n,r,x) computes r[n] from x[n] and, when
    n < 3, from x[n+1]; unit 3 takes no bits from a neighbouring unit.
    Little-endian forms: x[n] moves right by K bits and the low K bits of
    x[n+1] are ORed into the top. */
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
 #define f1_bb(n,r,x)   r[n] = (x[n] >> 1) | (n < 3 ? x[n+1] << 31 : 0)
 #define f4_bb(n,r,x)   r[n] = (x[n] >> 4) | (n < 3 ? x[n+1] << 28 : 0)
 #define f8_bb(n,r,x)   r[n] = (x[n] >> 8) | (n < 3 ? x[n+1] << 24 : 0)
 #else
 /* Big-endian forms recombine the bits per byte with MASK(), which is
    defined outside this section (presumably a byte pattern replicated across
    the unit - confirm against its definition). */
 #define f1_bb(n,r,x)   r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \
                            | (n < 3 ? x[n+1] >> 17 : 0)) & MASK(0x80))
 #define f4_bb(n,r,x)   r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \
                            | (n < 3 ? x[n+1] >> 20 : 0)) & MASK(0xf0))
 #define f8_bb(n,r,x)   r[n] = (x[n] << 8) | (n < 3 ? x[n+1] >> 24 : 0)
 #endif
 /* BB mode, 32-bit units: applies f1_bb to the four units of x (unit 0 up to
    unit 3), storing into r, then XORs a gf_tab-derived term into r[3].  The
    table index (0x00 or 0x80) is derived from x[0] and the entry is fetched
    before any unit is written. */
 gf_decl void gf_mulx1_bb(gf_t r, const gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80])) << 16;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 17) & 0x80];
 #endif
     rep2_u4(f1_bb, UNIT_PTR(r), UNIT_PTR(x));
     UNIT_PTR(r)[3] ^= term;
 }
 /* BB mode, 32-bit units, in place: the gf_tab index (a multiple of 0x10) is
    derived from x[0] before the units are rewritten by f4_bb; the resulting
    term is XORed into x[3]. */
 gf_decl void gf_mulx4_bb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0])) << 16;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 20) & 0xf0];
 #endif
     rep2_u4(f4_bb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[3] ^= term;
 }
 /* BB mode, 32-bit units, in place: the gf_tab index is an 8-bit field of
    x[0] read before the units are rewritten by f8_bb; the resulting term is
    XORed into x[3]. */
 gf_decl void gf_mulx8_bb(gf_t x)
 {
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     const gf_unit_t term = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 16;
 #else
     const gf_unit_t term = gf_tab[(UNIT_PTR(x)[0] >> 24) & 0xff];
 #endif
     rep2_u4(f8_bb, UNIT_PTR(x), UNIT_PTR(x));
     UNIT_PTR(x)[3] ^= term;
 }
 #else
 /* BB mode, byte units: fK_bb(n,r,x) sets r[n] to x[n] moved right by K bits
    with the low K bits of x[n+1] ORed into the top (nothing is ORed in for
    n == 15).  No f8 form is defined for byte units; gf_mulx8_bb below moves
    whole bytes with memmove instead. */
 #define f1_bb(n,r,x)   r[n] = (x[n] >> 1) | (n < 15 ? x[n+1] << 7 : 0)
 #define f4_bb(n,r,x)   r[n] = (x[n] >> 4) | (n < 15 ? x[n+1] << 4 : 0)
 /* BB mode, byte units: applies f1_bb to bytes 0 up to 15 of x, storing into
    r, then XORs part of a 16-bit gf_tab entry into r[15].  The entry is
    selected by the low bit of x[0] (index 0x00 or 0x80) and fetched before
    any byte is written. */
 gf_decl void gf_mulx1_bb(gf_t r, const gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80];
     rep2_u16(f1_bb, UNIT_PTR(r), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(r)[15] ^= term >> 8;
 #else
     UNIT_PTR(r)[15] ^= term;
 #endif
 }
 /* BB mode, byte units, in place: the 16-bit gf_tab entry indexed by the low
    nibble of x[0] (moved to the high nibble of the index) is fetched before
    the bytes are rewritten by f4_bb; its two bytes are then XORed into x[14]
    and x[15], with the assignment of bytes depending on platform byte order. */
 gf_decl void gf_mulx4_bb(gf_t x)
 {
     const uint16_t term = gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0];
     rep2_u16(f4_bb, UNIT_PTR(x), UNIT_PTR(x));
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[14] ^= term & 0xff;
     UNIT_PTR(x)[15] ^= term >> 8;
 #else
     UNIT_PTR(x)[14] ^= term >> 8;
     UNIT_PTR(x)[15] ^= term & 0xff;
 #endif
 }
 /* BB mode, byte units, in place: bytes 1..15 are moved down one position with
    memmove and the 16-bit gf_tab entry indexed by the old x[0] is folded in.
    x[15] keeps its stale value after the move, so it is overwritten with one
    table byte while the other table byte is XORed into x[14]. */
 gf_decl void gf_mulx8_bb(gf_t x)
 {
     const uint16_t term = gf_tab[UNIT_PTR(x)[0]];
     memmove(UNIT_PTR(x), UNIT_PTR(x) + 1, 15);
 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
     UNIT_PTR(x)[14] ^= term & 0xff;
     UNIT_PTR(x)[15]  = term >> 8;
 #else
     UNIT_PTR(x)[14] ^= term >> 8;
     UNIT_PTR(x)[15]  = term & 0xff;
 #endif
 }
 #endif
 #endif
--- a/crypto/aes/mode_hdr.h
+++ b/crypto/aes/mode_hdr.h
@ -0,0 +1,329 @@
 /*
 ---------------------------------------------------------------------------
 Copyright (c) 1998-2014, Brian Gladman, Worcester, UK. All rights reserved.
 The redistribution and use of this software (with or without changes)
 is allowed without the payment of fees or royalties provided that:
  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;
  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.
 This software is provided 'as is' with no explicit or implied warranties
 in respect of its operation, including, but not limited to, correctness
 and fitness for purpose.
 ---------------------------------------------------------------------------
 Issue Date: 18/02/2014
 This header file is an INTERNAL file which supports mode implementation
 */
 #ifndef _MODE_HDR_H
 #define _MODE_HDR_H
 #include <string.h>
 #include <limits.h>
 #include "brg_endian.h"
 /*  UNIT_BITS sets the width, in bits, of the units in which buffers are
    accessed when possible.  Processing buffers in 32 or 64 bit chunks
    rather than in bytes can provide significant speed gains.
    Unless UNIT_BITS is already defined by the build:
      - big-endian platforms get 64 (the '#if 0' arm that would select 32
        is disabled by hand);
      - other platforms get 64 when _WIN64 is defined and 32 otherwise.
 */
 #if !defined( UNIT_BITS )
 #  if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
 #    if 0
 #      define UNIT_BITS  32
 #    elif 1
 #      define UNIT_BITS  64
 #    endif
 #  elif defined( _WIN64 )
 #    define UNIT_BITS 64
 #  else
 #    define UNIT_BITS 32
 #  endif
 #endif
 /* 64-bit units: define NEED_UINT_64T (presumably consumed by brg_types.h,
    which is included next - confirm there) */
 #if UNIT_BITS == 64 && !defined( NEED_UINT_64T )
 #  define NEED_UINT_64T
 #endif
 #include "brg_types.h"
 /*  Use of inlines is preferred but code blocks can also be expanded inline
    using 'defines'.  But the latter approach will typically generate a LOT
    of code and is not recommended.
    The leading '1 &&' below is a manual switch: change it to 0 to stop this
    header from defining USE_INLINING itself.
 */
 #if 1 && !defined( USE_INLINING )
 #  define USE_INLINING
 #endif
 /* MSVC: map the rotate and byte-swap helpers onto compiler intrinsics.  Every
    name defined here as a macro suppresses the matching portable fallback
    further down, which is guarded by '#if !defined( name )'.  Compilers older
    than _MSC_VER 1400 only get the 32-bit rotates. */
 #if defined( _MSC_VER )
 #  if _MSC_VER >= 1400
 #    include <stdlib.h>
 #    include <intrin.h>
 #    pragma intrinsic(memset)
 #    pragma intrinsic(memcpy)
 #    define rotl32        _rotl
 #    define rotr32        _rotr
 #    define rotl64        _rotl64
 #    define rotr64        _rotr64   /* right rotate needs the right-rotate intrinsic */
 #    define bswap_16(x)   _byteswap_ushort(x)
 #    define bswap_32(x)   _byteswap_ulong(x)
 #    define bswap_64(x)   _byteswap_uint64(x)
 #  else
 #    define rotl32 _lrotl
 #    define rotr32 _lrotr
 #  endif
 #endif
 /* mh_decl is the declaration prefix used by the helper functions in this
    header: __inline under MSVC, 'static inline' for GNU toolchains and plain
    'static' for anything else.
    NOTE(review): nothing in this block defines mh_decl when USE_INLINING is
    not defined. */
 #if defined( USE_INLINING )
 #  if defined( _MSC_VER )
 #    define mh_decl __inline
 #  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
 #    define mh_decl static inline
 #  else
 #    define mh_decl static
 #  endif
 #endif
 #if defined(__cplusplus)
 extern "C" {
 #endif
 /* Pointer and value helpers for fixed widths (8/16/32/64 bits) and for the
    configured unit width (UNIT_BITS).  They forward to UPTR_CAST / UNIT_CAST,
    which are not defined in this header (presumably in brg_types.h - confirm
    there). */
 #define  UI8_PTR(x)     UPTR_CAST(x,  8)
 #define UI16_PTR(x)     UPTR_CAST(x, 16)
 #define UI32_PTR(x)     UPTR_CAST(x, 32)
 #define UI64_PTR(x)     UPTR_CAST(x, 64)
 #define UNIT_PTR(x)     UPTR_CAST(x, UNIT_BITS)
 #define  UI8_VAL(x)     UNIT_CAST(x,  8)
 #define UI16_VAL(x)     UNIT_CAST(x, 16)
 #define UI32_VAL(x)     UNIT_CAST(x, 32)
 #define UI64_VAL(x)     UNIT_CAST(x, 64)
 #define UNIT_VAL(x)     UNIT_CAST(x, UNIT_BITS)
 /* BUF_INC is the unit size in bytes; BUF_ADRMASK is that size minus one */
 #define BUF_INC          (UNIT_BITS >> 3)
 #define BUF_ADRMASK     ((UNIT_BITS >> 3) - 1)
 /* Unrolled repetition helpers.
    rep2_uN(f,r,x) expands to f(i,r,x) for i = 0 .. N-1 in ascending order;
    rep2_dN(f,r,x) does the same in descending order (N-1 down to 0).
    rep3_uN / rep3_dN are the equivalents for a five-argument f(i,r,x,y,c).
    Each expansion is a bare sequence of statements separated by ';' with no
    enclosing braces, so it must not be used as the unbraced body of an
    if/for/while. */
 #define rep2_u2(f,r,x)    f( 0,r,x); f( 1,r,x) 
 #define rep2_u4(f,r,x)    f( 0,r,x); f( 1,r,x); f( 2,r,x); f( 3,r,x) 
 #define rep2_u16(f,r,x)   f( 0,r,x); f( 1,r,x); f( 2,r,x); f( 3,r,x); \
                          f( 4,r,x); f( 5,r,x); f( 6,r,x); f( 7,r,x); \
                          f( 8,r,x); f( 9,r,x); f(10,r,x); f(11,r,x); \
                          f(12,r,x); f(13,r,x); f(14,r,x); f(15,r,x)
 #define rep2_d2(f,r,x)    f( 1,r,x); f( 0,r,x) 
 #define rep2_d4(f,r,x)    f( 3,r,x); f( 2,r,x); f( 1,r,x); f( 0,r,x) 
 #define rep2_d16(f,r,x)   f(15,r,x); f(14,r,x); f(13,r,x); f(12,r,x); \
                          f(11,r,x); f(10,r,x); f( 9,r,x); f( 8,r,x); \
                          f( 7,r,x); f( 6,r,x); f( 5,r,x); f( 4,r,x); \
                          f( 3,r,x); f( 2,r,x); f( 1,r,x); f( 0,r,x)
 #define rep3_u2(f,r,x,y,c)  f( 0,r,x,y,c); f( 1,r,x,y,c) 
 #define rep3_u4(f,r,x,y,c)  f( 0,r,x,y,c); f( 1,r,x,y,c); f( 2,r,x,y,c); f( 3,r,x,y,c) 
 #define rep3_u16(f,r,x,y,c) f( 0,r,x,y,c); f( 1,r,x,y,c); f( 2,r,x,y,c); f( 3,r,x,y,c); \
                            f( 4,r,x,y,c); f( 5,r,x,y,c); f( 6,r,x,y,c); f( 7,r,x,y,c); \
                            f( 8,r,x,y,c); f( 9,r,x,y,c); f(10,r,x,y,c); f(11,r,x,y,c); \
                            f(12,r,x,y,c); f(13,r,x,y,c); f(14,r,x,y,c); f(15,r,x,y,c)
 #define rep3_d2(f,r,x,y,c)  f( 1,r,x,y,c); f( 0,r,x,y,c) 
 #define rep3_d4(f,r,x,y,c)  f( 3,r,x,y,c); f( 2,r,x,y,c); f( 1,r,x,y,c); f( 0,r,x,y,c) 
 #define rep3_d16(f,r,x,y,c) f(15,r,x,y,c); f(14,r,x,y,c); f(13,r,x,y,c); f(12,r,x,y,c); \
                            f(11,r,x,y,c); f(10,r,x,y,c); f( 9,r,x,y,c); f( 8,r,x,y,c); \
                            f( 7,r,x,y,c); f( 6,r,x,y,c); f( 5,r,x,y,c); f( 4,r,x,y,c); \
                            f( 3,r,x,y,c); f( 2,r,x,y,c); f( 1,r,x,y,c); f( 0,r,x,y,c)
 /* pointer type that might be used to select a fast XOR routine: the routine
    takes one writable pointer (r) and two read-only pointers (p, q) */
 typedef void (*xor_function)(void *r, const void *p, const void *q);
 /* left and right rotates on 32 and 64 bit variables */
 #if !defined( rotl32 )
 /* Rotate x left by n bits.  The count is reduced modulo 32, so n == 0 and
    n == 32 both return x unchanged and neither shift below can reach the
    full width of the type (which would be undefined behaviour). */
 mh_decl uint32_t rotl32(uint32_t x, int n)
 {
     const unsigned int k = (unsigned int)n & 31u;
     return (uint32_t)((x << k) | (x >> ((32u - k) & 31u)));
 }
 #endif
 #if !defined( rotr32 )
 /* Rotate x right by n bits.  The count is reduced modulo 32, so n == 0 and
    n == 32 both return x unchanged and neither shift below can reach the
    full width of the type (which would be undefined behaviour). */
 mh_decl uint32_t rotr32(uint32_t x, int n)
 {
     const unsigned int k = (unsigned int)n & 31u;
     return (uint32_t)((x >> k) | (x << ((32u - k) & 31u)));
 }
 #endif
 #if ( UNIT_BITS == 64 ) && !defined( rotl64 )
 /* Rotate x left by n bits.  The count is reduced modulo 64, so n == 0 and
    n == 64 both return x unchanged and neither shift below can reach the
    full width of the type (which would be undefined behaviour). */
 mh_decl uint64_t rotl64(uint64_t x, int n)
 {
     const unsigned int k = (unsigned int)n & 63u;
     return (x << k) | (x >> ((64u - k) & 63u));
 }
 #endif
 #if ( UNIT_BITS == 64 ) && !defined( rotr64 )
 /* Rotate x right by n bits.  The count is reduced modulo 64, so n == 0 and
    n == 64 both return x unchanged and neither shift below can reach the
    full width of the type (which would be undefined behaviour). */
 mh_decl uint64_t rotr64(uint64_t x, int n)
 {
     const unsigned int k = (unsigned int)n & 63u;
     return (x >> k) | (x << ((64u - k) & 63u));
 }
 #endif
 /* byte order inversions for 16, 32 and 64 bit variables */
 #if !defined(bswap_16)
 /* exchange the two bytes of x */
 mh_decl uint16_t bswap_16(uint16_t x)
 {
     const unsigned int v = x;
     return (uint16_t)((v << 8) | (v >> 8));
 }
 #endif
 #if !defined(bswap_32)
 /* reverse the four bytes of x using two rotates: rotating right by 24 puts
    bytes 0 and 2 of x into their final (swapped) places, rotating right by 8
    does the same for bytes 1 and 3; the masks keep only those bytes */
 mh_decl uint32_t bswap_32(uint32_t x)
 {
     const uint32_t from_rot24 = rotr32((x), 24) & 0x00ff00ff;
     const uint32_t from_rot8  = rotr32((x), 8) & 0xff00ff00;
     return from_rot24 | from_rot8;
 }
 #endif
 #if ( UNIT_BITS == 64 ) && !defined(bswap_64)
 /* reverse the eight bytes of x: each 32-bit half is byte-swapped with
    bswap_32 and the two halves then trade places */
 mh_decl uint64_t bswap_64(uint64_t x)
 {
     const uint32_t upper = (uint32_t)(x >> 32);
     const uint32_t lower = (uint32_t)x;
     return bswap_32(upper) | ((uint64_t)bswap_32(lower) << 32);
 }
 #endif
 /* support for fast aligned buffer move, xor and byte swap operations - 
   source and destination buffers for move and xor operations must not 
   overlap, those for byte order reversal must either not overlap or
   must be identical
 */
 /* per-index bodies for the rep2_* / rep3_* helpers: f_copy copies element n
    of q into p; f_xor stores c(p[n] ^ q[n]) into r[n], where c is the
    cast/conversion macro supplied by the caller */
 #define f_copy(n,p,q)     p[n] = q[n]
 #define f_xor(n,r,p,q,c)  r[n] = c(p[n] ^ q[n])
 /* copy one 16-byte block from q to p with memcpy (no alignment assumed) */
 mh_decl void copy_block(void* p, const void* q)
 {
     const size_t block_len = 16;
     (void)memcpy(p, q, block_len);
 }
 /* copy one 16-byte block from q to p in units of UNIT_BITS bits: four 32-bit
    units, a plain memcpy for byte units, or two units for any other setting
    (64 bits) */
 mh_decl void copy_block_aligned(void *p, const void *q)
 {
 #if UNIT_BITS == 32
     rep2_u4(f_copy,UNIT_PTR(p),UNIT_PTR(q));
 #elif UNIT_BITS == 8
     memcpy(p, q, 16);
 #else
     rep2_u2(f_copy,UNIT_PTR(p),UNIT_PTR(q));
 #endif
 }
 /* r = p ^ q over one 16-byte block, one byte at a time in ascending order */
 mh_decl void xor_block(void *r, const void* p, const void* q)
 {
     int i;
     for (i = 0; i < 16; ++i) {
         f_xor(i, UI8_PTR(r), UI8_PTR(p), UI8_PTR(q), UI8_VAL);
     }
 }
 /* r = p ^ q over one 16-byte block in units of UNIT_BITS bits: four 32-bit
    units, sixteen byte units, or two units for any other setting (64 bits) */
 mh_decl void xor_block_aligned(void *r, const void *p, const void *q)
 {
 #if UNIT_BITS == 32
     rep3_u4(f_xor, UNIT_PTR(r), UNIT_PTR(p), UNIT_PTR(q), UNIT_VAL);
 #elif UNIT_BITS == 8
     rep3_u16(f_xor, UNIT_PTR(r), UNIT_PTR(p), UNIT_PTR(q), UNIT_VAL);
 #else
     rep3_u2(f_xor, UNIT_PTR(r), UNIT_PTR(p), UNIT_PTR(q), UNIT_VAL);
 #endif
 }
 /* byte swap within 32-bit words in a 16 byte block; don't move 32-bit words.
    d receives the result and s is the source; in the byte-unit branch each
    pair is staged through a temporary so that d and s may be the same buffer.
    Byte pairs exchanged per word at offset w (0, 4, 8, 12): w <-> w+3 and
    w+1 <-> w+2. */
 mh_decl void bswap32_block(void *d, const void* s)
 {
 #if UNIT_BITS == 8
    uint8_t t;
    t = UNIT_PTR(s)[ 0]; UNIT_PTR(d)[ 0] = UNIT_PTR(s)[ 3]; UNIT_PTR(d)[ 3] = t;
    t = UNIT_PTR(s)[ 1]; UNIT_PTR(d)[ 1] = UNIT_PTR(s)[ 2]; UNIT_PTR(d)[ 2] = t;
    t = UNIT_PTR(s)[ 4]; UNIT_PTR(d)[ 4] = UNIT_PTR(s)[ 7]; UNIT_PTR(d)[ 7] = t;
    t = UNIT_PTR(s)[ 5]; UNIT_PTR(d)[ 5] = UNIT_PTR(s)[ 6]; UNIT_PTR(d)[ 6] = t;
    t = UNIT_PTR(s)[ 8]; UNIT_PTR(d)[ 8] = UNIT_PTR(s)[11]; UNIT_PTR(d)[11] = t;
    t = UNIT_PTR(s)[ 9]; UNIT_PTR(d)[ 9] = UNIT_PTR(s)[10]; UNIT_PTR(d)[10] = t;
    t = UNIT_PTR(s)[12]; UNIT_PTR(d)[12] = UNIT_PTR(s)[15]; UNIT_PTR(d)[15] = t;
    t = UNIT_PTR(s)[13]; UNIT_PTR(d)[13] = UNIT_PTR(s)[14]; UNIT_PTR(d)[14] = t;
 #elif UNIT_BITS == 32
    UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[1] = bswap_32(UNIT_PTR(s)[1]);
    UNIT_PTR(d)[2] = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[3] = bswap_32(UNIT_PTR(s)[3]);
 #else
    UI32_PTR(d)[0] = bswap_32(UI32_PTR(s)[0]); UI32_PTR(d)[1] = bswap_32(UI32_PTR(s)[1]);
    UI32_PTR(d)[2] = bswap_32(UI32_PTR(s)[2]); UI32_PTR(d)[3] = bswap_32(UI32_PTR(s)[3]);
 #endif
 }
 /* byte swap within 64-bit words in a 16 byte block; don't move 64-bit words.
    d receives the result and s is the source; values that would be clobbered
    are staged through a temporary so that d and s may be the same buffer.
    Byte units: within each 8-byte half at offset h (0, 8), byte h+i is
    exchanged with byte h+7-i for i = 0..3.
    32-bit units: within each half the two 32-bit units are byte-swapped and
    exchanged (0 <-> 1, 2 <-> 3). */
 mh_decl void bswap64_block(void *d, const void* s)
 {
 #if UNIT_BITS == 8
    uint8_t t;
    t = UNIT_PTR(s)[ 0]; UNIT_PTR(d)[ 0] = UNIT_PTR(s)[ 7]; UNIT_PTR(d)[ 7] = t;
    t = UNIT_PTR(s)[ 1]; UNIT_PTR(d)[ 1] = UNIT_PTR(s)[ 6]; UNIT_PTR(d)[ 6] = t;
    t = UNIT_PTR(s)[ 2]; UNIT_PTR(d)[ 2] = UNIT_PTR(s)[ 5]; UNIT_PTR(d)[ 5] = t;
    t = UNIT_PTR(s)[ 3]; UNIT_PTR(d)[ 3] = UNIT_PTR(s)[ 4]; UNIT_PTR(d)[ 4] = t;
    t = UNIT_PTR(s)[ 8]; UNIT_PTR(d)[ 8] = UNIT_PTR(s)[15]; UNIT_PTR(d)[15] = t;
    t = UNIT_PTR(s)[ 9]; UNIT_PTR(d)[ 9] = UNIT_PTR(s)[14]; UNIT_PTR(d)[14] = t;
    t = UNIT_PTR(s)[10]; UNIT_PTR(d)[10] = UNIT_PTR(s)[13]; UNIT_PTR(d)[13] = t;
    t = UNIT_PTR(s)[11]; UNIT_PTR(d)[11] = UNIT_PTR(s)[12]; UNIT_PTR(d)[12] = t;
 #elif UNIT_BITS == 32
    uint32_t t;
    t = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[1]); UNIT_PTR(d)[1] = t;
    t = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[2] = bswap_32(UNIT_PTR(s)[3]); UNIT_PTR(d)[3] = t;
 #else
    UNIT_PTR(d)[0] = bswap_64(UNIT_PTR(s)[0]);  UNIT_PTR(d)[1] = bswap_64(UNIT_PTR(s)[1]);
 #endif
 }
 /* reverse the byte order of the whole 16-byte block s into d.  Each value
    that would be clobbered is staged through a temporary before its slot is
    written, so d and s may be the same buffer. */
 mh_decl void bswap128_block(void *d, const void* s)
 {
 #if UNIT_BITS == 8
     int lo;
     for (lo = 0; lo < 8; ++lo) {
         const uint8_t saved = UNIT_PTR(s)[lo];
         UNIT_PTR(d)[lo] = UNIT_PTR(s)[15 - lo];
         UNIT_PTR(d)[15 - lo] = saved;
     }
 #elif UNIT_BITS == 32
     int lo;
     for (lo = 0; lo < 2; ++lo) {
         const uint32_t saved = bswap_32(UNIT_PTR(s)[lo]);
         UNIT_PTR(d)[lo] = bswap_32(UNIT_PTR(s)[3 - lo]);
         UNIT_PTR(d)[3 - lo] = saved;
     }
 #else
     const uint64_t saved = bswap_64(UNIT_PTR(s)[0]);
     UNIT_PTR(d)[0] = bswap_64(UNIT_PTR(s)[1]);
     UNIT_PTR(d)[1] = saved;
 #endif
 }
 /* platform byte order to big or little endian order for 16, 32 and 64 bit variables.
    Each macro converts its argument IN PLACE (it expands to an assignment to
    x, so x must be a modifiable lvalue); the direction that already matches
    the platform byte order expands to nothing.  The 64-bit forms rely on
    bswap_64, whose portable fallback in this header is only defined when
    UNIT_BITS == 64. */
 #if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
 #  define uint16_t_to_le(x) (x) = bswap_16((x))
 #  define uint32_t_to_le(x) (x) = bswap_32((x))
 #  define uint64_t_to_le(x) (x) = bswap_64((x))
 #  define uint16_t_to_be(x)
 #  define uint32_t_to_be(x)
 #  define uint64_t_to_be(x)
 #else
 #  define uint16_t_to_le(x)
 #  define uint32_t_to_le(x)
 #  define uint64_t_to_le(x)
 #  define uint16_t_to_be(x) (x) = bswap_16((x))
 #  define uint32_t_to_be(x) (x) = bswap_32((x))
 #  define uint64_t_to_be(x) (x) = bswap_64((x))
 #endif
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/tools/style.c.exclude
+++ b/tools/style.c.exclude
@ -1,5 +1,5 @@
 ^\./core/embed/bootloader/protob/
-^\./crypto/aes/aes\(\|crypt\|key\|_modes\|opt\|tab\|tst\)\.
+^\./crypto/aes/
 ^\./crypto/chacha20poly1305/
 ^\./crypto/ed25519-donna/
 ^\./crypto/gui/