From 6e207215e334094c943c96c93bbbf8e9d8795c14 Mon Sep 17 00:00:00 2001 From: Andrew Kozlik Date: Fri, 15 Mar 2024 11:10:47 +0100 Subject: [PATCH] feat(crypto): Add Brian Gladman's implementation of GCM. --- Makefile | 2 +- crypto/aes/aesgcm.c | 547 ++++++++++++++++++++++++++++ crypto/aes/aesgcm.h | 233 ++++++++++++ crypto/aes/brg_endian.h | 29 ++ crypto/aes/gf128mul.c | 471 ++++++++++++++++++++++++ crypto/aes/gf128mul.h | 215 +++++++++++ crypto/aes/gf_mul_lo.h | 773 ++++++++++++++++++++++++++++++++++++++++ crypto/aes/mode_hdr.h | 329 +++++++++++++++++ tools/style.c.exclude | 2 +- 9 files changed, 2599 insertions(+), 2 deletions(-) create mode 100644 crypto/aes/aesgcm.c create mode 100644 crypto/aes/aesgcm.h create mode 100644 crypto/aes/brg_endian.h create mode 100644 crypto/aes/gf128mul.c create mode 100644 crypto/aes/gf128mul.h create mode 100644 crypto/aes/gf_mul_lo.h create mode 100644 crypto/aes/mode_hdr.h diff --git a/Makefile b/Makefile index 1cab33d0f..04503bf02 100644 --- a/Makefile +++ b/Makefile @@ -65,7 +65,7 @@ yaml_check: ## check yaml formatting yamllint . editor_check: ## check editorconfig formatting - editorconfig-checker -exclude '.*\.(so|dat|toif|der)' + editorconfig-checker -exclude '.*\.(so|dat|toif|der)|^crypto/aes/' cstyle_check: ## run code style check on low-level C code clang-format --version diff --git a/crypto/aes/aesgcm.c b/crypto/aes/aesgcm.c new file mode 100644 index 000000000..a6415f97a --- /dev/null +++ b/crypto/aes/aesgcm.c @@ -0,0 +1,547 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 30/03/2011 + + My thanks to: + + Colin Sinclair for finding an error and suggesting a number of + improvements to this code. + + John Viega and David McGrew for their support in the development + of this code and to David for testing it on a big-endIAN system. + + Mark Rodenkirch and Jason Papadopoulos for their help in finding + a bug in the fast buffer operations on big endian systems. +*/ + +#include "gcm.h" +#include "mode_hdr.h" + +/* This GCM implementation needs a Galois Field multiplier for GF(2^128). + which operates on field elements using a polynomial field representation + x^127 + x^126 + ... + x^2 + x + 1 using the bits in a bit sequence that + will be numbered by the power of x that they represent. GCM uses the + polynomial x^128 + x^7 + x^2 + x + 1 as its basis for representation. + + The obvious way of representing this in a computer system is to map GF + 'x' to the binary integer '2' - but this was way too obvious for any + cryptographer to adopt! + + Here bytes are numbered in memory order and bits within bytes according + to their integer numeric significance. 
The term 'little endian' is then + used to describe mappings in which numeric (power of 2) or field (power + of x) significance increase with increasing bit or byte numbers with + 'big endian' being used to describe the inverse situation. + + GCM uses little endian byte ordering and big endian bit ordering, a + representation that will be described as LB. Hence the low end of the + field polynomial is in byte[0], which has the value 0xe1 rather than + 0x87 in the more obvious mappings. + + The related field multipler can use this mapping but if you want to + use an alternative (e.g hardware) multiplier that uses a different + polynomial field representation, you can do so by changing the form + used for the field elements when this alternative multiplier is used. + + If GF_REPRESENTATION is defined as one of: + + REVERSE_BITS // change to LL + REVERSE_BYTES | REVERSE_BITS // change to BL + REVERSE_NONE // no change + REVERSE_BYTES // change to BB + + then an appropriate change of representation will occur before and + after calls to your revised field multiplier. To use this you need + to add gf_convert.c to your application. +*/ + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#if 1 +# undef GF_REPRESENTATION +#elif 0 +# define GF_REPRESENTATION REVERSE_BITS +#elif 0 +# define GF_REPRESENTATION REVERSE_BYTES | REVERSE_BITS +#elif 0 +# define GF_REPRESENTATION REVERSE_NONE +#elif 0 +# define GF_REPRESENTATION REVERSE_BITS +#endif + +#define BLOCK_SIZE GCM_BLOCK_SIZE /* block length */ +#define BLK_ADR_MASK (BLOCK_SIZE - 1) /* mask for 'in block' address */ +#define CTR_POS 12 + +#define inc_ctr(x) \ + { int i = BLOCK_SIZE; while(i-- > CTR_POS && !++(UI8_PTR(x)[i])) ; } + +ret_type gcm_init_and_key( /* initialise mode and set key */ + const unsigned char key[], /* the key value */ + unsigned long key_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ + memset(ctx->ghash_h, 0, sizeof(ctx->ghash_h)); + + /* set the AES key */ + aes_encrypt_key(key, key_len, ctx->aes); + + /* compute E(0) (for the hash function) */ + aes_encrypt(UI8_PTR(ctx->ghash_h), UI8_PTR(ctx->ghash_h), ctx->aes); + +#if defined( GF_REPRESENTATION ) + convert_representation(ctx->ghash_h, ctx->ghash_h, GF_REPRESENTATION); +#endif + +#if defined( TABLES_64K ) + init_64k_table(ctx->ghash_h, ctx->gf_t64k); +#elif defined( TABLES_8K ) + init_8k_table(ctx->ghash_h, ctx->gf_t8k); +#elif defined( TABLES_4K ) + init_4k_table(ctx->ghash_h, ctx->gf_t4k); +#elif defined( TABLES_256 ) + init_256_table(ctx->ghash_h, ctx->gf_t256); +#endif +#if defined( GF_REPRESENTATION ) + convert_representation(ctx->ghash_h, ctx->ghash_h, GF_REPRESENTATION); +#endif + return RETURN_GOOD; +} + +void gf_mul_hh(gf_t a, gcm_ctx ctx[1]) +{ +#if defined( GF_REPRESENTATION ) || !defined( NO_TABLES ) + gf_t scr; +#endif +#if defined( GF_REPRESENTATION ) + convert_representation(a, a, GF_REPRESENTATION); +#endif + +#if defined( TABLES_64K ) + gf_mul_64k(a, ctx->gf_t64k, scr); +#elif defined( TABLES_8K ) + gf_mul_8k(a, ctx->gf_t8k, scr); +#elif defined( TABLES_4K ) + gf_mul_4k(a, ctx->gf_t4k, scr); +#elif defined( TABLES_256 ) + gf_mul_256(a, ctx->gf_t256, scr); +#else +# if defined( GF_REPRESENTATION ) + convert_representation(scr, ctx->ghash_h, GF_REPRESENTATION); + gf_mul(a, scr); +# else + gf_mul(a, ctx->ghash_h); +# endif +#endif + +#if defined( GF_REPRESENTATION ) + convert_representation(a, a, GF_REPRESENTATION); +#endif +} + +ret_type gcm_init_message( /* initialise a new message */ + const unsigned char iv[], /* the 
initialisation vector */ + unsigned long iv_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ uint32_t i, n_pos = 0; + uint8_t *p; + + memset(ctx->ctr_val, 0, BLOCK_SIZE); + if(iv_len == CTR_POS) + { + memcpy(ctx->ctr_val, iv, CTR_POS); UI8_PTR(ctx->ctr_val)[15] = 0x01; + } + else + { n_pos = iv_len; + while(n_pos >= BLOCK_SIZE) + { + xor_block_aligned(ctx->ctr_val, ctx->ctr_val, iv); + n_pos -= BLOCK_SIZE; + iv += BLOCK_SIZE; + gf_mul_hh((gf_t*)ctx->ctr_val, ctx); + } + + if(n_pos) + { + p = UI8_PTR(ctx->ctr_val); + while(n_pos-- > 0) + *p++ ^= *iv++; + gf_mul_hh((gf_t*)ctx->ctr_val, ctx); + } + n_pos = (iv_len << 3); + for(i = BLOCK_SIZE - 1; n_pos; --i, n_pos >>= 8) + UI8_PTR(ctx->ctr_val)[i] ^= (unsigned char)n_pos; + gf_mul_hh((gf_t*)ctx->ctr_val, ctx); + } + + ctx->y0_val = *UI32_PTR(UI8_PTR(ctx->ctr_val) + CTR_POS); + memset(ctx->hdr_ghv, 0, BLOCK_SIZE); + memset(ctx->txt_ghv, 0, BLOCK_SIZE); + ctx->hdr_cnt = 0; + ctx->txt_ccnt = ctx->txt_acnt = 0; + return RETURN_GOOD; +} + +ret_type gcm_auth_header( /* authenticate the header */ + const unsigned char hdr[], /* the header buffer */ + unsigned long hdr_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ uint32_t cnt = 0, b_pos = (uint32_t)ctx->hdr_cnt & BLK_ADR_MASK; + + if(!hdr_len) + return RETURN_GOOD; + + if(ctx->hdr_cnt && b_pos == 0) + gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx); + + if(!((hdr - (UI8_PTR(ctx->hdr_ghv) + b_pos)) & BUF_ADRMASK)) + { + while(cnt < hdr_len && (b_pos & BUF_ADRMASK)) + UI8_PTR(ctx->hdr_ghv)[b_pos++] ^= hdr[cnt++]; + + while(cnt + BUF_INC <= hdr_len && b_pos <= BLOCK_SIZE - BUF_INC) + { + *UNIT_PTR(UI8_PTR(ctx->hdr_ghv) + b_pos) ^= *UNIT_PTR(hdr + cnt); + cnt += BUF_INC; b_pos += BUF_INC; + } + + while(cnt + BLOCK_SIZE <= hdr_len) + { + gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx); + xor_block_aligned(ctx->hdr_ghv, ctx->hdr_ghv, hdr + cnt); + cnt += BLOCK_SIZE; + } + } + else + { + while(cnt < hdr_len && b_pos < BLOCK_SIZE) + UI8_PTR(ctx->hdr_ghv)[b_pos++] ^= hdr[cnt++]; + + while(cnt + BLOCK_SIZE <= hdr_len) + { + gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx); + xor_block(ctx->hdr_ghv, ctx->hdr_ghv, hdr + cnt); + cnt += BLOCK_SIZE; + } + } + + while(cnt < hdr_len) + { + if(b_pos == BLOCK_SIZE) + { + gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx); + b_pos = 0; + } + UI8_PTR(ctx->hdr_ghv)[b_pos++] ^= hdr[cnt++]; + } + + ctx->hdr_cnt += cnt; + return RETURN_GOOD; +} + +ret_type gcm_auth_data( /* authenticate ciphertext data */ + const unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ uint32_t cnt = 0, b_pos = (uint32_t)ctx->txt_acnt & BLK_ADR_MASK; + + if(!data_len) + return RETURN_GOOD; + + if(ctx->txt_acnt && b_pos == 0) + gf_mul_hh((gf_t*)ctx->txt_ghv, ctx); + + if(!((data - (UI8_PTR(ctx->txt_ghv) + b_pos)) & BUF_ADRMASK)) + { + while(cnt < data_len && (b_pos & BUF_ADRMASK)) + UI8_PTR(ctx->txt_ghv)[b_pos++] ^= data[cnt++]; + + while(cnt + BUF_INC <= data_len && b_pos <= BLOCK_SIZE - BUF_INC) + { + *UNIT_PTR(UI8_PTR(ctx->txt_ghv) + b_pos) ^= *UNIT_PTR(data + cnt); + cnt += BUF_INC; b_pos += BUF_INC; + } + + while(cnt + BLOCK_SIZE <= data_len) + { + gf_mul_hh((gf_t*)ctx->txt_ghv, ctx); + xor_block_aligned(ctx->txt_ghv, ctx->txt_ghv, data + cnt); + cnt += BLOCK_SIZE; + } + } + else + { + while(cnt < data_len && b_pos < BLOCK_SIZE) + UI8_PTR(ctx->txt_ghv)[b_pos++] ^= data[cnt++]; + + while(cnt + BLOCK_SIZE <= data_len) + { + gf_mul_hh((gf_t*)ctx->txt_ghv, ctx); + xor_block(ctx->txt_ghv, 
ctx->txt_ghv, data + cnt); + cnt += BLOCK_SIZE; + } + } + + while(cnt < data_len) + { + if(b_pos == BLOCK_SIZE) + { + gf_mul_hh((gf_t*)ctx->txt_ghv, ctx); + b_pos = 0; + } + UI8_PTR(ctx->txt_ghv)[b_pos++] ^= data[cnt++]; + } + + ctx->txt_acnt += cnt; + return RETURN_GOOD; +} + +ret_type gcm_crypt_data( /* encrypt or decrypt data */ + unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ uint32_t cnt = 0, b_pos = (uint32_t)ctx->txt_ccnt & BLK_ADR_MASK; + + if(!data_len) + return RETURN_GOOD; + + if(!((data - (UI8_PTR(ctx->enc_ctr) + b_pos)) & BUF_ADRMASK)) + { + if(b_pos) + { + while(cnt < data_len && (b_pos & BUF_ADRMASK)) + data[cnt++] ^= UI8_PTR(ctx->enc_ctr)[b_pos++]; + + while(cnt + BUF_INC <= data_len && b_pos <= BLOCK_SIZE - BUF_INC) + { + *UNIT_PTR(data + cnt) ^= *UNIT_PTR(UI8_PTR(ctx->enc_ctr) + b_pos); + cnt += BUF_INC; b_pos += BUF_INC; + } + } + + while(cnt + BLOCK_SIZE <= data_len) + { + inc_ctr(ctx->ctr_val); + aes_encrypt(UI8_PTR(ctx->ctr_val), UI8_PTR(ctx->enc_ctr), ctx->aes); + xor_block_aligned(data + cnt, data + cnt, ctx->enc_ctr); + cnt += BLOCK_SIZE; + } + } + else + { + if(b_pos) + while(cnt < data_len && b_pos < BLOCK_SIZE) + data[cnt++] ^= UI8_PTR(ctx->enc_ctr)[b_pos++]; + + while(cnt + BLOCK_SIZE <= data_len) + { + inc_ctr(ctx->ctr_val); + aes_encrypt(UI8_PTR(ctx->ctr_val), UI8_PTR(ctx->enc_ctr), ctx->aes); + xor_block(data + cnt, data + cnt, ctx->enc_ctr); + cnt += BLOCK_SIZE; + } + } + + while(cnt < data_len) + { + if(b_pos == BLOCK_SIZE || !b_pos) + { + inc_ctr(ctx->ctr_val); + aes_encrypt(UI8_PTR(ctx->ctr_val), UI8_PTR(ctx->enc_ctr), ctx->aes); + b_pos = 0; + } + data[cnt++] ^= UI8_PTR(ctx->enc_ctr)[b_pos++]; + } + + ctx->txt_ccnt += cnt; + return RETURN_GOOD; +} + +ret_type gcm_compute_tag( /* compute authentication tag */ + unsigned char tag[], /* the buffer for the tag */ + unsigned long tag_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ uint32_t i, ln; + gf_t tbuf; + + if(ctx->txt_acnt != ctx->txt_ccnt && ctx->txt_ccnt > 0) + return RETURN_ERROR; + + gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx); + gf_mul_hh((gf_t*)ctx->txt_ghv, ctx); + + if(ctx->hdr_cnt) + { + ln = (uint32_t)((ctx->txt_acnt + BLOCK_SIZE - 1) / BLOCK_SIZE); + if(ln) + { +#if 1 /* alternative versions of the exponentiation operation */ + memcpy(tbuf, ctx->ghash_h, BLOCK_SIZE); +# if defined( GF_REPRESENTATION ) + convert_representation(tbuf, tbuf, GF_REPRESENTATION); + convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION); +# endif + for( ; ; ) + { + if(ln & 1) + { + gf_mul((void*)ctx->hdr_ghv, tbuf); + } + if(!(ln >>= 1)) + break; + gf_mul(tbuf, tbuf); + } +#else /* this one seems slower on x86 and x86_64 :-( */ + i = ln | ln >> 1; i |= i >> 2; i |= i >> 4; + i |= i >> 8; i |= i >> 16; i &= ~(i >> 1); + memset(tbuf, 0, BLOCK_SIZE); + UI8_PTR(tbuf)[0] = 0x80; + while(i) + { +# if defined( GF_REPRESENTATION ) + convert_representation(tbuf, tbuf, GF_REPRESENTATION); +# endif + gf_mul(tbuf, tbuf); +# if defined( GF_REPRESENTATION ) + convert_representation(tbuf, tbuf, GF_REPRESENTATION); +# endif + if(i & ln) + gf_mul_hh((gf_t*)tbuf, ctx); + i >>= 1; + } +# if defined( GF_REPRESENTATION ) + convert_representation(tbuf, tbuf, GF_REPRESENTATION); + convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, GF_REPRESENTATION); +# endif + gf_mul((void*)ctx->hdr_ghv, tbuf); +#endif +#if defined( GF_REPRESENTATION ) + convert_representation(ctx->hdr_ghv, ctx->hdr_ghv, 
GF_REPRESENTATION); +# endif + } + } + + i = BLOCK_SIZE; +#ifdef BRG_UI64 + { uint64_t tm = ((uint64_t)ctx->txt_acnt) << 3; + while(i-- > 0) + { + UI8_PTR(ctx->hdr_ghv)[i] ^= UI8_PTR(ctx->txt_ghv)[i] ^ (unsigned char)tm; + tm = (i == 8 ? (((uint64_t)ctx->hdr_cnt) << 3) : tm >> 8); + } + } +#else + { uint32_t tm = ctx->txt_acnt << 3; + + while(i-- > 0) + { + UI8_PTR(ctx->hdr_ghv)[i] ^= UI8_PTR(ctx->txt_ghv)[i] ^ (unsigned char)tm; + if(i & 3) + tm >>= 8; + else if(i == 4) + tm = ctx->txt_acnt >> 29; + else if(i == 8) + tm = ctx->hdr_cnt << 3; + else + tm = ctx->hdr_cnt >> 29; + } + } +#endif + + gf_mul_hh((gf_t*)ctx->hdr_ghv, ctx); + + memcpy(ctx->enc_ctr, ctx->ctr_val, BLOCK_SIZE); + *UI32_PTR(UI8_PTR(ctx->enc_ctr) + CTR_POS) = ctx->y0_val; + aes_encrypt(UI8_PTR(ctx->enc_ctr), UI8_PTR(ctx->enc_ctr), ctx->aes); + for(i = 0; i < (unsigned int)tag_len; ++i) + tag[i] = (unsigned char)(UI8_PTR(ctx->hdr_ghv)[i] ^ UI8_PTR(ctx->enc_ctr)[i]); + + return (ctx->txt_ccnt == ctx->txt_acnt ? RETURN_GOOD : RETURN_WARN); +} + +ret_type gcm_end( /* clean up and end operation */ + gcm_ctx ctx[1]) /* the mode context */ +{ + memset(ctx, 0, sizeof(gcm_ctx)); + return RETURN_GOOD; +} + +ret_type gcm_encrypt( /* encrypt & authenticate data */ + unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ + + gcm_crypt_data(data, data_len, ctx); + gcm_auth_data(data, data_len, ctx); + return RETURN_GOOD; +} + +ret_type gcm_decrypt( /* authenticate & decrypt data */ + unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ + gcm_auth_data(data, data_len, ctx); + gcm_crypt_data(data, data_len, ctx); + return RETURN_GOOD; +} + +ret_type gcm_encrypt_message( /* encrypt an entire message */ + const unsigned char iv[], /* the initialisation vector */ + unsigned long iv_len, /* and its length in bytes */ + const unsigned char hdr[], /* the header buffer */ + unsigned long hdr_len, /* and its length in bytes */ + unsigned char msg[], /* the message buffer */ + unsigned long msg_len, /* and its length in bytes */ + unsigned char tag[], /* the buffer for the tag */ + unsigned long tag_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ + gcm_init_message(iv, iv_len, ctx); + gcm_auth_header(hdr, hdr_len, ctx); + gcm_encrypt(msg, msg_len, ctx); + return gcm_compute_tag(tag, tag_len, ctx) ? RETURN_ERROR : RETURN_GOOD; +} + +ret_type gcm_decrypt_message( /* decrypt an entire message */ + const unsigned char iv[], /* the initialisation vector */ + unsigned long iv_len, /* and its length in bytes */ + const unsigned char hdr[], /* the header buffer */ + unsigned long hdr_len, /* and its length in bytes */ + unsigned char msg[], /* the message buffer */ + unsigned long msg_len, /* and its length in bytes */ + const unsigned char tag[], /* the buffer for the tag */ + unsigned long tag_len, /* and its length in bytes */ + gcm_ctx ctx[1]) /* the mode context */ +{ uint8_t local_tag[BLOCK_SIZE]; + ret_type rr; + + gcm_init_message(iv, iv_len, ctx); + gcm_auth_header(hdr, hdr_len, ctx); + gcm_decrypt(msg, msg_len, ctx); + rr = gcm_compute_tag(local_tag, tag_len, ctx); + return (rr != RETURN_GOOD || memcmp(tag, local_tag, tag_len)) ? 
RETURN_ERROR : RETURN_GOOD; +} + +#if defined(__cplusplus) +} +#endif diff --git a/crypto/aes/aesgcm.h b/crypto/aes/aesgcm.h new file mode 100644 index 000000000..4fd58ea7a --- /dev/null +++ b/crypto/aes/aesgcm.h @@ -0,0 +1,233 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 11/01/2011 + + I am grateful for the work done by Mark Rodenkirch and Jason Papadopoulos + in helping to remove a bug in the operation of this code on big endian + systems when fast buffer operations are enabled. + --------------------------------------------------------------------------- +*/ + +#ifndef _GCM_H +#define _GCM_H + +#include "aes.h" +#include "gf128mul.h" + +/* USER DEFINABLE OPTIONS (Further options need to be set in gf128mul.h) */ + +/* UNIT_BITS sets the size of variables used to process 16 byte buffers + when the buffer alignment allows this. When buffers are processed + in bytes, 16 individual operations are invoolved. But if, say, such + a buffer is divided into 4 32 bit variables, it can then be processed + in 4 operations, making the code typically much faster. In general + it will pay to use the longest natively supported size, which will + probably be 32 or 64 bits in 32 and 64 bit systems respectively. +*/ + +#if defined( UNIT_BITS ) +# undef UNIT_BITS +#endif + +#if !defined( UNIT_BITS ) +# if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN +# if 0 +# define UNIT_BITS 8 +# elif 0 +# define UNIT_BITS 32 +# elif 1 +# define UNIT_BITS 64 +# endif +# elif defined( _WIN64 ) +# define UNIT_BITS 64 +# else +# define UNIT_BITS 32 +# endif +#endif + +#if UNIT_BITS == 64 && !defined( NEED_UINT_64T ) +# define NEED_UINT_64T +#endif + +/* END OF USER DEFINABLE OPTIONS */ + +/* After encryption or decryption operations the return value of + 'compute tag' will be one of the values RETURN_GOOD, RETURN_WARN + or RETURN_ERROR, the latter indicating an error. A return value + RETURN_GOOD indicates that both encryption and authentication + have taken place and resulted in the returned tag value. If + the returned value is RETURN_WARN, the tag value is the result + of authentication alone without encryption (CCM) or decryption + (GCM and EAX). 
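+
+   For example (an illustrative call sequence only, with iv, hdr, ct and
+   tag standing for caller supplied buffers and ctx a context already set
+   up by gcm_init_and_key), authenticating ciphertext without a matching
+   decryption:
+
+      gcm_init_message(iv, iv_len, ctx);
+      gcm_auth_header(hdr, hdr_len, ctx);
+      gcm_auth_data(ct, ct_len, ctx);
+      rr = gcm_compute_tag(tag, 16, ctx);
+
+   leaves rr set to RETURN_WARN, because no gcm_crypt_data (or gcm_decrypt)
+   call matched the authenticated ciphertext.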
+*/ +#ifndef RETURN_GOOD +# define RETURN_WARN 1 +# define RETURN_GOOD 0 +# define RETURN_ERROR -1 +#endif + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#ifndef RET_TYPE_DEFINED + typedef int ret_type; +#endif +UNIT_TYPEDEF(gcm_unit_t, UNIT_BITS); +BUFR_TYPEDEF(gcm_buf_t, UNIT_BITS, AES_BLOCK_SIZE); + +#define GCM_BLOCK_SIZE AES_BLOCK_SIZE + +/* The GCM-AES context */ + +typedef struct +{ +#if defined( TABLES_64K ) + gf_t64k_a gf_t64k; +#endif +#if defined( TABLES_8K ) + gf_t8k_a gf_t8k; +#endif +#if defined( TABLES_4K ) + gf_t4k_a gf_t4k; +#endif +#if defined( TABLES_256 ) + gf_t256_a gf_t256; +#endif + gcm_buf_t ctr_val; /* CTR counter value */ + gcm_buf_t enc_ctr; /* encrypted CTR block */ + gcm_buf_t hdr_ghv; /* ghash buffer (header) */ + gcm_buf_t txt_ghv; /* ghash buffer (ciphertext) */ + gf_t ghash_h; /* ghash H value */ + aes_encrypt_ctx aes[1]; /* AES encryption context */ + uint32_t y0_val; /* initial counter value */ + uint32_t hdr_cnt; /* header bytes so far */ + uint32_t txt_ccnt; /* text bytes so far (encrypt) */ + uint32_t txt_acnt; /* text bytes so far (auth) */ +} gcm_ctx; + +/* The following calls handle mode initialisation, keying and completion */ + +ret_type gcm_init_and_key( /* initialise mode and set key */ + const unsigned char key[], /* the key value */ + unsigned long key_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +ret_type gcm_end( /* clean up and end operation */ + gcm_ctx ctx[1]); /* the mode context */ + +/* The following calls handle complete messages in memory as one operation */ + +ret_type gcm_encrypt_message( /* encrypt an entire message */ + const unsigned char iv[], /* the initialisation vector */ + unsigned long iv_len, /* and its length in bytes */ + const unsigned char hdr[], /* the header buffer */ + unsigned long hdr_len, /* and its length in bytes */ + unsigned char msg[], /* the message buffer */ + unsigned long msg_len, /* and its length in bytes */ + unsigned char tag[], /* the buffer for the tag */ + unsigned long tag_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + + /* RETURN_GOOD is returned if the input tag */ + /* matches that for the decrypted message */ +ret_type gcm_decrypt_message( /* decrypt an entire message */ + const unsigned char iv[], /* the initialisation vector */ + unsigned long iv_len, /* and its length in bytes */ + const unsigned char hdr[], /* the header buffer */ + unsigned long hdr_len, /* and its length in bytes */ + unsigned char msg[], /* the message buffer */ + unsigned long msg_len, /* and its length in bytes */ + const unsigned char tag[], /* the buffer for the tag */ + unsigned long tag_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +/* The following calls handle messages in a sequence of operations followed */ +/* by tag computation after the sequence has been completed. 
In these calls */ +/* the user is responsible for verfiying the computed tag on decryption */ + +ret_type gcm_init_message( /* initialise a new message */ + const unsigned char iv[], /* the initialisation vector */ + unsigned long iv_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +ret_type gcm_auth_header( /* authenticate the header */ + const unsigned char hdr[], /* the header buffer */ + unsigned long hdr_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +ret_type gcm_encrypt( /* encrypt & authenticate data */ + unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +ret_type gcm_decrypt( /* authenticate & decrypt data */ + unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +ret_type gcm_compute_tag( /* compute authentication tag */ + unsigned char tag[], /* the buffer for the tag */ + unsigned long tag_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +/* The use of the following calls should be avoided if possible because + their use requires a very good understanding of the way this encryption + mode works and the way in which this code implements it in order to use + them correctly. + + The gcm_auth_data routine is used to authenticate encrypted message data. + In message encryption gcm_crypt_data must be called before gcm_auth_data + is called since it is encrypted data that is authenticated. In message + decryption authentication must occur before decryption and data can be + authenticated without being decrypted if necessary. + + If these calls are used it is up to the user to ensure that these routines + are called in the correct order and that the correct data is passed to + them. + + When gcm_compute_tag is called it is assumed that an error in use has + occurred if both encryption (or decryption) and authentication have taken + place but the total lengths of the message data respectively authenticated + and encrypted are not the same. If authentication has taken place but + there has been no corresponding encryption or decryption operations (none + at all) only a warning is issued. This should be treated as an error if it + occurs during encryption but it is only signalled as a warning as it might + be intentional when decryption operations are involved (this avoids having + different compute tag functions for encryption and decryption). Decryption + operations can be undertaken freely after authetication but if the tag is + computed after such operations an error will be signalled if the lengths + of the data authenticated and decrypted don't match. 
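+
+   As an illustration only (this is the ordering that gcm_encrypt and
+   gcm_decrypt use internally), encrypting a buffer with these low level
+   calls is done as
+
+      gcm_crypt_data(buf, len, ctx);
+      gcm_auth_data(buf, len, ctx);
+
+   so that it is the ciphertext that is authenticated, whereas decryption
+   reverses the order, authenticating the ciphertext before decrypting it:
+
+      gcm_auth_data(buf, len, ctx);
+      gcm_crypt_data(buf, len, ctx);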
+*/ + +ret_type gcm_auth_data( /* authenticate ciphertext data */ + const unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +ret_type gcm_crypt_data( /* encrypt or decrypt data */ + unsigned char data[], /* the data buffer */ + unsigned long data_len, /* and its length in bytes */ + gcm_ctx ctx[1]); /* the mode context */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/crypto/aes/brg_endian.h b/crypto/aes/brg_endian.h new file mode 100644 index 000000000..1e3cbf1e8 --- /dev/null +++ b/crypto/aes/brg_endian.h @@ -0,0 +1,29 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 10/09/2018 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#endif diff --git a/crypto/aes/gf128mul.c b/crypto/aes/gf128mul.c new file mode 100644 index 000000000..0a4eb3cd6 --- /dev/null +++ b/crypto/aes/gf128mul.c @@ -0,0 +1,471 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 + + This file provides fast multiplication in GF(128) as required by several + cryptographic authentication modes (see gfmul128.h). 
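+
+ As an illustration, GCM's GHASH function folds each 16 byte block X of
+ the data being authenticated into an accumulator Y as
+
+     Y = (Y xor X) * H
+
+ where H is the hash key; it is this multiplication by H that gf_mul and
+ the table driven routines below provide.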
+*/ + +/* Speed critical loops can be unrolled to gain speed but consume more memory */ +#if 1 +# define UNROLL_LOOPS +#endif + +/* The order of these includes matters */ +#include "mode_hdr.h" +#include "gf128mul.h" +#include "gf_mul_lo.h" + +#if defined( GF_MODE_LL ) +# define mode _ll +#elif defined( GF_MODE_BL ) +# define mode _bl +#elif defined( GF_MODE_LB ) +# define mode _lb +#elif defined( GF_MODE_BB ) +# define mode _bb +#else +# error mode is not defined +#endif + +#if defined( GF_MODE_LL) || defined( GF_MODE_LB ) +# define GF_INDEX(i) (i) +#else +# define GF_INDEX(i) (15 - (i)) +#endif + +/* A slow field multiplier */ + +void gf_mul(gf_t a, const gf_t b) +{ gf_t p[8]; + uint8_t *q, ch; + int i; + + copy_block_aligned(p[0], a); + for(i = 0; i < 7; ++i) + gf_mulx1(mode)(p[i + 1], p[i]); + + q = (uint8_t*)(a == b ? p[0] : b); + memset(a, 0, GF_BYTE_LEN); + for(i = 15 ; ; ) + { + ch = q[GF_INDEX(i)]; + if(ch & X_0) + xor_block_aligned(a, a, p[0]); + if(ch & X_1) + xor_block_aligned(a, a, p[1]); + if(ch & X_2) + xor_block_aligned(a, a, p[2]); + if(ch & X_3) + xor_block_aligned(a, a, p[3]); + if(ch & X_4) + xor_block_aligned(a, a, p[4]); + if(ch & X_5) + xor_block_aligned(a, a, p[5]); + if(ch & X_6) + xor_block_aligned(a, a, p[6]); + if(ch & X_7) + xor_block_aligned(a, a, p[7]); + if(!i--) + break; + gf_mulx8(mode)(a); + } +} + +#if defined( TABLES_64K ) + +/* This version uses 64k bytes of table space on the stack. + An input variable field value in a[] has to be multiplied + by a key value in g[] that changes far less frequently. + + To do this a[] is split up into 16 smaller field values, + each one byte in length. For the 256 values of each of + these smaller values, we can precompute the result of + mulltiplying g by this field value. We can then combine + these values to provide the full multiply. So for each + of 16 bytes we have a table of 256 field values each of + 16 bytes - 64k bytes in total. +*/ + +void init_64k_table(const gf_t g, gf_t64k_t t) +{ int i = 0, j, k; + + /* + depending on the representation we have to process bits + within bytes high to low (0xe1 style ) or low to high + (0x87 style). We start by producing the powers x ,x^2 + .. x^7 and put them in t[0][1], t[0][2] .. t[128] or in + t[128], t[64] .. t[1] depending on the bit order in use. + */ + + /* clear the element for the zero field element */ + memset(t[0][0], 0, GF_BYTE_LEN); + +#if defined( GF_MODE_LL ) || defined( GF_MODE_BL ) + + /* g -> t[0][1], generate t[0][2] ... */ + memcpy(t[0][1], g, GF_BYTE_LEN); + for(j = 1; j <= 64; j <<= 1) + gf_mulx1(mode)(t[0][j + j], t[0][j]); +#else + + /* g -> t[0][128], generate t[0][64] ... */ + memcpy(t[0][128], g, GF_BYTE_LEN); + for(j = 64; j >= 1; j >>= 1) + gf_mulx1(mode)(t[0][j], t[0][j + j]); +#endif + + for( ; ; ) + { + /* if { n } stands for the field value represented by + the integer n, we can express higher multiplies in + the table as follows: + + 1. g * { 3} = g * {2} ^ g * {1} + + 2. g * { 5} = g * {4} ^ g * {1} + g * { 6} = g * {4} ^ g * {2} + g * { 7} = g * {4} ^ g * {3} + + 3. g * { 9} = g * {8} ^ g * {1} + g * {10} = g * {8} ^ g * {2} + .... + + and so on. This is what the following loops do. 
+ */ + for(j = 2; j < 256; j += j) + for(k = 1; k < j; ++k) + xor_block_aligned(t[i][j + k], t[i][j], t[i][k]); + + if(++i == GF_BYTE_LEN) /* all 16 byte positions done */ + return; + + /* We now move to the next byte up and set up its eight + starting values by multiplying the values in the + lower table by x^8 + */ + memset(t[i][0], 0, GF_BYTE_LEN); + for(j = 128; j > 0; j >>= 1) + { + memcpy(t[i][j], t[i - 1][j], GF_BYTE_LEN); + gf_mulx8(mode)(t[i][j]); + } + } +} + +#define xor_64k(i,ap,t,r) xor_block_aligned(r, r, t[i][ap[GF_INDEX(i)]]) + +#if defined( UNROLL_LOOPS ) + +void gf_mul_64k(gf_t a, const gf_t64k_t t, gf_t r) +{ uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + xor_64k(15, ap, t, r); xor_64k(14, ap, t, r); + xor_64k(13, ap, t, r); xor_64k(12, ap, t, r); + xor_64k(11, ap, t, r); xor_64k(10, ap, t, r); + xor_64k( 9, ap, t, r); xor_64k( 8, ap, t, r); + xor_64k( 7, ap, t, r); xor_64k( 6, ap, t, r); + xor_64k( 5, ap, t, r); xor_64k( 4, ap, t, r); + xor_64k( 3, ap, t, r); xor_64k( 2, ap, t, r); + xor_64k( 1, ap, t, r); xor_64k( 0, ap, t, r); + copy_block_aligned(a, r); +} + +#else + +void gf_mul_64k(gf_t a, const gf_t64k_t t, gf_t r) +{ int i; + uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + for(i = 15; i >= 0; --i) + { + xor_64k(i,ap,t,r); + } + copy_block_aligned(a, r); +} + +#endif + +#endif + +#if defined( TABLES_8K ) + +/* This version uses 8k bytes of table space on the stack. + An input field value in a[] has to be multiplied by a + key value in g[]. To do this a[] is split up into 32 + smaller field values each 4-bits in length. For the + 16 values of each of these smaller field values we can + precompute the result of mulltiplying g[] by the field + value in question. So for each of 32 nibbles we have a + table of 16 field values, each of 16 bytes - 8k bytes + in total. +*/ +void init_8k_table(const gf_t g, gf_t8k_t t) +{ int i = 0, j, k; + + /* do the low 4-bit nibble first - t[0][16] - and note + that the unit multiplier sits at 0x01 - t[0][1] in + the table. 
Then multiplies by x go at 2, 4, 8 + */ + /* set the table elements for a zero multiplier */ + memset(t[0][0], 0, GF_BYTE_LEN); + memset(t[1][0], 0, GF_BYTE_LEN); + +#if defined( GF_MODE_LL ) || defined( GF_MODE_BL ) + + /* t[0][1] = g, compute t[0][2], t[0][4], t[0][8] */ + memcpy(t[0][1], g, GF_BYTE_LEN); + for(j = 1; j <= 4; j <<= 1) + gf_mulx1(mode)(t[0][j + j], t[0][j]); + /* t[1][1] = t[0][1] * x^4 = t[0][8] * x */ + gf_mulx1(mode)(t[1][1], t[0][8]); + for(j = 1; j <= 4; j <<= 1) + gf_mulx1(mode)(t[1][j + j], t[1][j]); +#else + + /* g -> t[0][8], compute t[0][4], t[0][2], t[0][1] */ + memcpy(t[1][8], g, GF_BYTE_LEN); + for(j = 4; j >= 1; j >>= 1) + gf_mulx1(mode)(t[1][j], t[1][j + j]); + /* t[1][1] = t[0][1] * x^4 = t[0][8] * x */ + gf_mulx1(mode)(t[0][8], t[1][1]); + for(j = 4; j >= 1; j >>= 1) + gf_mulx1(mode)(t[0][j], t[0][j + j]); +#endif + + for( ; ; ) + { + for(j = 2; j < 16; j += j) + for(k = 1; k < j; ++k) + xor_block_aligned(t[i][j + k], t[i][j], t[i][k]); + + if(++i == 2 * GF_BYTE_LEN) + return; + + if(i > 1) + { + memset(t[i][0], 0, GF_BYTE_LEN); + for(j = 8; j > 0; j >>= 1) + { + memcpy(t[i][j], t[i - 2][j], GF_BYTE_LEN); + gf_mulx8(mode)(t[i][j]); + } + } + + } +} + +#define xor_8k(i,ap,t,r) \ + xor_block_aligned(r, r, t[i + i][ap[GF_INDEX(i)] & 15]); \ + xor_block_aligned(r, r, t[i + i + 1][ap[GF_INDEX(i)] >> 4]) + +#if defined( UNROLL_LOOPS ) + +void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r) +{ uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + xor_8k(15, ap, t, r); xor_8k(14, ap, t, r); + xor_8k(13, ap, t, r); xor_8k(12, ap, t, r); + xor_8k(11, ap, t, r); xor_8k(10, ap, t, r); + xor_8k( 9, ap, t, r); xor_8k( 8, ap, t, r); + xor_8k( 7, ap, t, r); xor_8k( 6, ap, t, r); + xor_8k( 5, ap, t, r); xor_8k( 4, ap, t, r); + xor_8k( 3, ap, t, r); xor_8k( 2, ap, t, r); + xor_8k( 1, ap, t, r); xor_8k( 0, ap, t, r); + copy_block_aligned(a, r); +} + +#else + +void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r) +{ int i; + uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + for(i = 15; i >= 0; --i) + { + xor_8k(i,ap,t,r); + } + memcpy(a, r, GF_BYTE_LEN); +} + +#endif + +#endif + +#if defined( TABLES_4K ) + +/* This version uses 4k bytes of table space on the stack. + A 16 byte buffer has to be multiplied by a 16 byte key + value in GF(128). If we consider a GF(128) value in a + single byte, we can construct a table of the 256 16 + byte values that result from multiplying g by the 256 + values of this byte. This requires 4096 bytes. + + If we take the highest byte in the buffer and use this + table to multiply it by g, we then have to multiply it + by x^120 to get the final value. For the next highest + byte the result has to be multiplied by x^112 and so on. + + But we can do this by accumulating the result in an + accumulator starting with the result for the top byte. + We repeatedly multiply the accumulator value by x^8 and + then add in (i.e. xor) the 16 bytes of the next lower + byte in the buffer, stopping when we reach the lowest + byte. This requires a 4096 byte table. 
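+
+   In outline (an illustrative sketch, with r standing for the 16 byte
+   accumulator):
+
+      r = 0
+      for each byte b of the buffer, starting with the byte holding
+      the highest powers of x:
+          r = r * x^8          -- one gf_mulx8 step
+          r = r xor t[b]       -- the precomputed value g * {b}
+
+   which is what the xor_4k macro and gf_mul_4k below implement.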
+*/ + +void init_4k_table(const gf_t g, gf_t4k_t t) +{ int j, k; + + memset(t[0], 0, GF_BYTE_LEN); + +#if defined( GF_MODE_LL ) || defined( GF_MODE_BL ) + + memcpy(t[1], g, GF_BYTE_LEN); + for(j = 1; j <= 64; j <<= 1) + gf_mulx1(mode)(t[j + j], t[j]); +#else + + memcpy(t[128], g, GF_BYTE_LEN); + for(j = 64; j >= 1; j >>= 1) + gf_mulx1(mode)(t[j], t[j + j]); +#endif + + for(j = 2; j < 256; j += j) + for(k = 1; k < j; ++k) + xor_block_aligned(t[j + k], t[j], t[k]); +} + +#define xor_4k(i,ap,t,r) gf_mulx8(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)]]) + +#if defined( UNROLL_LOOPS ) + +void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r) +{ uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + xor_4k(15, ap, t, r); xor_4k(14, ap, t, r); + xor_4k(13, ap, t, r); xor_4k(12, ap, t, r); + xor_4k(11, ap, t, r); xor_4k(10, ap, t, r); + xor_4k( 9, ap, t, r); xor_4k( 8, ap, t, r); + xor_4k( 7, ap, t, r); xor_4k( 6, ap, t, r); + xor_4k( 5, ap, t, r); xor_4k( 4, ap, t, r); + xor_4k( 3, ap, t, r); xor_4k( 2, ap, t, r); + xor_4k( 1, ap, t, r); xor_4k( 0, ap, t, r); + copy_block_aligned(a, r); +} + +#else + +void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r) +{ int i = 15; + uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + for(i = 15; i >=0; --i) + { + xor_4k(i, ap, t, r); + } + copy_block_aligned(a, r); +} + +#endif + +#endif + +#if defined( TABLES_256 ) + +/* This version uses 256 bytes of table space on the stack. + A 16 byte buffer has to be multiplied by a 16 byte key + value in GF(128). If we consider a GF(128) value in a + single 4-bit nibble, we can construct a table of the 16 + 16 byte values that result from the 16 values of this + byte. This requires 256 bytes. If we take the highest + 4-bit nibble in the buffer and use this table to get the + result, we then have to multiply by x^124 to get the + final value. For the next highest byte the result has to + be multiplied by x^120 and so on. But we can do this by + accumulating the result in an accumulator starting with + the result for the top nibble. We repeatedly multiply + the accumulator value by x^4 and then add in (i.e. xor) + the 16 bytes of the next lower nibble in the buffer, + stopping when we reach the lowest nibble. This uses a + 256 byte table. 
+*/ + +void init_256_table(const gf_t g, gf_t256_t t) +{ int j, k; + + memset(t[0], 0, GF_BYTE_LEN); + +#if defined( GF_MODE_LL ) || defined( GF_MODE_BL ) + + memcpy(t[1], g, GF_BYTE_LEN); + for(j = 1; j <= 4; j <<= 1) + gf_mulx1(mode)(t[j + j], t[j]); +#else + + memcpy(t[8], g, GF_BYTE_LEN); + for(j = 4; j >= 1; j >>= 1) + gf_mulx1(mode)(t[j], t[j + j]); +#endif + + for(j = 2; j < 16; j += j) + for(k = 1; k < j; ++k) + xor_block_aligned(t[j + k], t[j], t[k]); +} + +#define x_lo(i,ap,t,r) gf_mulx4(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)] & 0x0f]) +#define x_hi(i,ap,t,r) gf_mulx4(mode)(r); xor_block_aligned(r, r, t[ap[GF_INDEX(i)] >> 4]) + +#if defined( GF_MODE_LL ) || defined( GF_MODE_BL ) +#define xor_256(a,b,c,d) x_hi(a,b,c,d); x_lo(a,b,c,d) +#else +#define xor_256(a,b,c,d) x_lo(a,b,c,d); x_hi(a,b,c,d) +#endif + +#if defined( UNROLL_LOOPS ) + +void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r) +{ uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + xor_256(15, ap, t, r); xor_256(14, ap, t, r); + xor_256(13, ap, t, r); xor_256(12, ap, t, r); + xor_256(11, ap, t, r); xor_256(10, ap, t, r); + xor_256( 9, ap, t, r); xor_256( 8, ap, t, r); + xor_256( 7, ap, t, r); xor_256( 6, ap, t, r); + xor_256( 5, ap, t, r); xor_256( 4, ap, t, r); + xor_256( 3, ap, t, r); xor_256( 2, ap, t, r); + xor_256( 1, ap, t, r); xor_256( 0, ap, t, r); + copy_block_aligned(a, r); +} + +#else + +void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r) +{ int i; + uint8_t *ap = (uint8_t*)a; + memset(r, 0, GF_BYTE_LEN); + for(i = 15; i >= 0; --i) + { + xor_256(i, ap, t, r); + } + copy_block_aligned(a, r); +} + +#endif + +#endif diff --git a/crypto/aes/gf128mul.h b/crypto/aes/gf128mul.h new file mode 100644 index 000000000..9a59c00dc --- /dev/null +++ b/crypto/aes/gf128mul.h @@ -0,0 +1,215 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 11/01/2011 + + I am grateful for the work done by Mark Rodenkirch and Jason Papadopoulos + in helping to remove a bug in the operation of this code on big endian + systems when fast buffer operations are enabled. + --------------------------------------------------------------------------- + + An implementation of field multiplication in the Galois Field GF(2^128) + + A polynomial representation is used for the field with the coefficients + held in bit sequences in which the bit numbers are the powers of x that + a bit represents. The field polynomial used is (x^128+x^7+x^2+x+1). + + The obvious way of representing field elements in a computer system is + to map 'x' in the field to the binary integer '2'. But this was way too + obvious for cryptographers! 
+ + Here bytes are numbered in their memory order and bits within bytes are + numbered according to their integer numeric significance (that is as is + now normal with bit 0 representing unity). The term 'little endian' + will then used to describe mappings where numeric (power of 2) or field + (power of x) significance increases with increasing bit or byte numbers + with 'big endian' being used to describe the inverse situation. + + The GF bit sequence can then be mapped onto 8-bit bytes in computer + memory in one of four simple ways: + + A mapping in which x maps to the integer 2 in little endian + form for both bytes and bits within bytes: + + LL: bit for x^n ==> bit for 2^(n % 8) in byte[n / 8] + + A mapping in which x maps to the integer 2 in big endian form + for both bytes and bits within bytes: + + BL: bit for x^n ==> bit for 2^(n % 8) in byte[15 - n / 8] + + A little endian mapping for bytes but with the bits within + bytes in reverse order (big endian bytes): + + LB: bit for x^n ==> bit for 2^(7 - n % 8) in byte[n / 8] + + A big endian mapping for bytes but with the bits within + bytes in reverse order (big endian bytes): + + BB: bit for x^n ==> bit for 2^(7 - n % 8) in byte[15 - n / 8] + + 128-bit field elements are represented by 16 byte buffers but for + processing efficiency reasons it is often desirable to process arrays + of bytes using longer types such as, for example, unsigned long values. + The type used for representing these buffers will be called a 'gf_unit' + and the buffer itself will be referred to as a 'gf_t' type. + + THe field multiplier is based on the assumption that one of the two + field elements involved in multiplication will change only relatively + infrequently, making it worthwhile to precompute tables to speed up + multiplication by this value. +*/ + +#ifndef _GF128MUL_H +#define _GF128MUL_H + +#include +#include + +#include "brg_endian.h" + +/* USER DEFINABLE OPTIONS */ +/* UNIT_BITS sets the size of variables used to process 16 byte buffers + when the buffer alignment allows this. When buffers are processed + in bytes, 16 individual operations are invoolved. But if, say, such + a buffer is divided into 4 32 bit variables, it can then be processed + in 4 operations, making the code typically much faster. In general + it will pay to use the longest natively supported size, which will + probably be 32 or 64 bits in 32 and 64 bit systems respectively. +*/ + +#if defined( UNIT_BITS ) +# undef UNIT_BITS +#endif + +#if !defined( UNIT_BITS ) +# if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN +# if 0 +# define UNIT_BITS 8 +# elif 0 +# define UNIT_BITS 32 +# elif 1 +# define UNIT_BITS 64 +# endif +# elif defined( _WIN64 ) +# define UNIT_BITS 64 +# else +# define UNIT_BITS 32 +# endif +#endif + +#if UNIT_BITS == 64 && !defined( NEED_UINT_64T ) +# define NEED_UINT_64T +#endif + +#include "brg_types.h" + +/* Choose the Galois Field representation to use (see above) */ +#if 0 +# define GF_MODE_LL +#elif 0 +# define GF_MODE_BL +#elif 1 +# define GF_MODE_LB /* the representation used by GCM */ +#elif 0 +# define GF_MODE_BB +#else +# error mode is not defined +#endif + +/* Table sizes for GF(128) Multiply. Normally larger tables give + higher speed but cache loading might change this. 
Normally only + one table size (or none at all) will be specified here +*/ +#if 0 +# define TABLES_64K +#endif +#if 0 +# define TABLES_8K +#endif +#if 1 +# define TABLES_4K +#endif +#if 0 +# define TABLES_256 +#endif + +/* END OF USER DEFINABLE OPTIONS */ + +#if !(defined( TABLES_64K ) || defined( TABLES_8K ) \ + || defined( TABLES_4K ) || defined( TABLES_256 )) +# define NO_TABLES +#endif + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define GF_BYTE_LEN 16 +#define GF_UNIT_LEN (GF_BYTE_LEN / (UNIT_BITS >> 3)) + +UNIT_TYPEDEF(gf_unit_t, UNIT_BITS); +BUFR_TYPEDEF(gf_t, UNIT_BITS, GF_BYTE_LEN); + +/* Code for conversion between the four different galois field representations + is optionally available using gf_convert.c +*/ + +typedef enum { REVERSE_NONE = 0, REVERSE_BITS = 1, REVERSE_BYTES = 2 } transform; + +void convert_representation(gf_t dest, const gf_t source, transform rev); + +void gf_mul(gf_t a, const gf_t b); /* slow field multiply */ + +/* types and calls for 64k table driven field multiplier */ + +typedef gf_t gf_t64k_a[16][256]; +typedef gf_t (*gf_t64k_t)[256]; + +void init_64k_table(const gf_t g, gf_t64k_t t); +void gf_mul_64k(gf_t a, const gf_t64k_t t, void *r); + +/* types and calls for 8k table driven field multiplier */ + +typedef gf_t gf_t8k_a[32][16]; +typedef gf_t (*gf_t8k_t)[16]; + +void init_8k_table(const gf_t g, gf_t8k_t t); +void gf_mul_8k(gf_t a, const gf_t8k_t t, gf_t r); + +/* types and calls for 8k table driven field multiplier */ + +typedef gf_t gf_t4k_a[256]; +typedef gf_t (*gf_t4k_t); + +void init_4k_table(const gf_t g, gf_t4k_t t); +void gf_mul_4k(gf_t a, const gf_t4k_t t, gf_t r); + +/* types and calls for 8k table driven field multiplier */ + +typedef gf_t gf_t256_a[16]; +typedef gf_t (*gf_t256_t); + +void init_256_table(const gf_t g, gf_t256_t t); +void gf_mul_256(gf_t a, const gf_t256_t t, gf_t r); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/crypto/aes/gf_mul_lo.h b/crypto/aes/gf_mul_lo.h new file mode 100644 index 000000000..2691646f0 --- /dev/null +++ b/crypto/aes/gf_mul_lo.h @@ -0,0 +1,773 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2010, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 18/02/2014 + + This file provides the low level primitives needed for Galois Field + operations in GF(2^128) for the four most likely field representations. 
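+
+ In outline (an illustrative note): for each of the four representations
+ the file defines gf_mulx1_xx(r, a) with r = a * x, and the in place
+ forms gf_mulx4_xx(a) and gf_mulx8_xx(a) giving a = a * x^4 and
+ a = a * x^8, using the gf_tab table below to fold the overflowing high
+ bits back in through the field polynomial.  The gf_mulx1(mode),
+ gf_mulx4(mode) and gf_mulx8(mode) macros select the variant that matches
+ the representation chosen in gf128mul.h, for example
+
+     gf_mulx8(mode)(acc);     -- acc = acc * x^8 in the chosen mapping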
+*/ + +#ifndef _GF_MUL_LO_H +#define _GF_MUL_LO_H + +#if defined( USE_INLINING ) +# if defined( _MSC_VER ) +# define gf_decl __inline +# elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# define gf_decl static inline +# else +# define gf_decl static +# endif +#endif + +#if 0 /* used for testing only: t1(UNIT_BITS), t2(UNIT_BITS) */ +# define _t1(n) bswap ## n ## _block(x, x) +# define t1(n) _t1(n) +# define _t2(n) bswap ## n ## _block(x, x); bswap ## n ## _block(r, r) +# define t2(n) _t2(n) +#endif + +#define gf_m(n,x) gf_mulx ## n ## x +#define gf_mulx1(x) gf_m(1,x) +#define gf_mulx4(x) gf_m(4,x) +#define gf_mulx8(x) gf_m(8,x) + +#define MASK(x) ((x) * (UNIT_CAST(-1,UNIT_BITS) / 0xff)) + +#define DATA_256(q) {\ + q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ + q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ + q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ + q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ + q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ + q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ + q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ + q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ + q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ + q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ + q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ + q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ + q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ + q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ + q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ + q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ + q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\ + q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ + q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ + q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ + q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ + q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ + q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ + q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ + q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ + q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ + q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ + q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ + q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ + q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ + q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ + q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) } + +/* Within the 16 bytes of the field element the top and bottom field bits + are within bytes as follows (bit numbers in bytes 0 from ls up) for + each of the four field representations supported (see gf128mul.txt): + + GF_BIT 127 126 125 124 123 122 121 120 ..... 
7 6 5 4 3 2 1 0 + 0x87 1 0 0 0 0 1 1 1 + BL x[ 0] 7 6 5 4 3 2 1 0 x[15] 7 6 5 4 3 2 1 0 + LL x[15] 7 6 5 4 3 2 1 0 x[ 0] 7 6 5 4 3 2 1 0 + + GF_BIT 120 121 122 123 124 125 126 127 ..... 0 1 2 3 4 5 6 7 + 0xc1 1 1 1 0 0 0 0 1 + BB x[ 0] 7 6 5 4 3 2 1 0 x[15] 7 6 5 4 3 2 1 0 + LB x[15] 7 6 5 4 3 2 1 0 x[ 0] 7 6 5 4 3 2 1 0 + + When the field element is multiplied by x^n, the high bits overflow + and are used to form an overflow byte. For the BL and LL modes this + byte has the lowest overflow bit in bit 0 whereas for the BB and LB + modes this bit is in biit 7. So we have for this byte: + + bit (bit n = 2^n) 7 6 5 4 3 2 1 0 + BL and LL x^7 x^6 x^5 x^4 x^3 x^2 x^1 x^0 + BB and LB x^0 x^1 x^2 x^3 x^4 x^5 x^6 x^7 + + This byte then has to be multiplied by the low bits of the field + polynomial, which produces a value of 16 bits to be xored into the + left shifted field value. For the BL and LL modes bit 0 gives the + word value 0x0087, bit 1 gives 0x010e (0x87 left shifted 1), 0x021c + (0x87 left shifted 2), ... For the BB and LB modes, bit 7 gives the + value 0x00e1, bit 6 gives 0x8070, bit 5 gives 0x4038, ... Each bit + in the overflow byte is expanded in this way and is xored into the + overall result, so eaach of the 256 byte values will produce a + corresponding word value that is computed by the gf_uint16_xor(i) + macros below. + + These word values have to be xored into the low 16 bits of the + field value. If the byte endianess of the mode matches that of + the architecture xoring the word value will be correct. But if + the mode has the opposite endianess, the word value has to be + xored in byte reversed order. This is done by the ord() macro. +*/ + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN \ + && (defined( GF_MODE_LB ) || defined( GF_MODE_LL )) || \ + PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN \ + && (defined( GF_MODE_BB ) || defined( GF_MODE_BL )) +# define ord(hi, lo) 0x##hi##lo +#else +# define ord(hi, lo) 0x##lo##hi +#endif + +#if defined( GF_MODE_BL ) || defined( GF_MODE_LL ) + +/* field and numeric bit significance correspond */ + +#define gf_uint16_xor(i) ( \ + (i & 0x01 ? ord(00,87) : 0) ^ (i & 0x02 ? ord(01,0e) : 0) ^ \ + (i & 0x04 ? ord(02,1c) : 0) ^ (i & 0x08 ? ord(04,38) : 0) ^ \ + (i & 0x10 ? ord(08,70) : 0) ^ (i & 0x20 ? ord(10,e0) : 0) ^ \ + (i & 0x40 ? ord(21,c0) : 0) ^ (i & 0x80 ? ord(43,80) : 0) ) + +enum x_bit +{ + X_0 = 0x01, X_1 = 0x02, X_2 = 0x04, X_3 = 0x08, + X_4 = 0x10, X_5 = 0x20, X_6 = 0x40, X_7 = 0x80 +}; + +#elif defined( GF_MODE_BB ) || defined( GF_MODE_LB ) + +/* field and numeric bit significance are in reverse */ + +#define gf_uint16_xor(i) ( \ + (i & 0x80 ? ord(00,e1) : 0) ^ (i & 0x40 ? ord(80,70) : 0) ^ \ + (i & 0x20 ? ord(40,38) : 0) ^ (i & 0x10 ? ord(20,1c) : 0) ^ \ + (i & 0x08 ? ord(10,0e) : 0) ^ (i & 0x04 ? ord(08,07) : 0) ^ \ + (i & 0x02 ? ord(84,03) : 0) ^ (i & 0x01 ? ord(c2,01) : 0) ) + +enum x_bit +{ + X_0 = 0x80, X_1 = 0x40, X_2 = 0x20, X_3 = 0x10, + X_4 = 0x08, X_5 = 0x04, X_6 = 0x02, X_7 = 0x01 +}; + +#else +#error Galois Field representation has not been set +#endif + +const uint16_t gf_tab[256] = DATA_256(gf_uint16_xor); + +/* LL Mode Galois Field operations + + x[0] x[1] x[2] x[3] x[4] x[5] x[6] x[7] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +10000111 ........ ........ ........ ........ ........ ........ ........ +07....00 15....08 23....16 31....24 39....32 47....40 55....48 63....56 + x[8] x[9] x[10] x[11] x[12] x[13] x[14] x[15] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +........ ........ ........ ........ ........ 
........ ........ M....... +71....64 79....72 87....80 95....88 103...96 111..104 119..112 127..120 +*/ + +#if UNIT_BITS == 64 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_ll(n,r,x) r[n] = (x[n] << 1) | (n ? x[n-1] >> 63 : 0) +#define f4_ll(n,r,x) r[n] = (x[n] << 4) | (n ? x[n-1] >> 60 : 0) +#define f8_ll(n,r,x) r[n] = (x[n] << 8) | (n ? x[n-1] >> 56 : 0) +#else +#define f1_ll(n,r,x) r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \ + | (n ? x[n-1] << 49 : 0)) & MASK(0x01)) +#define f4_ll(n,r,x) r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \ + | (n ? x[n-1] << 52 : 0)) & MASK(0x0f)) +#define f8_ll(n,r,x) r[n] = (x[n] >> 8) | (n ? x[n-1] << 56 : 0) +#endif + +gf_decl void gf_mulx1_ll(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[1] >> 63) & 0x01]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] >> 7) & 0x01])) << 48; +#endif + rep2_d2(f1_ll, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[0] ^= _tt; +} + +gf_decl void gf_mulx4_ll(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[1] >> 60) & 0x0f]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] >> 4) & 0x0f])) << 48; +#endif + rep2_d2(f4_ll, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +gf_decl void gf_mulx8_ll(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[UNIT_PTR(x)[1] >> 56]; +#else + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[1] & 0xff])) << 48; +#endif + rep2_d2(f8_ll, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +#elif UNIT_BITS == 32 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_ll(n,r,x) r[n] = (x[n] << 1) | (n ? x[n-1] >> 31 : 0) +#define f4_ll(n,r,x) r[n] = (x[n] << 4) | (n ? x[n-1] >> 28 : 0) +#define f8_ll(n,r,x) r[n] = (x[n] << 8) | (n ? x[n-1] >> 24 : 0) +#else +#define f1_ll(n,r,x) r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \ + | (n ? x[n-1] << 17 : 0)) & MASK(0x01)) +#define f4_ll(n,r,x) r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \ + | (n ? x[n-1] << 20 : 0)) & MASK(0x0f)) +#define f8_ll(n,r,x) r[n] = (x[n] >> 8) | (n ? x[n-1] << 24 : 0) +#endif + +gf_decl void gf_mulx1_ll(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[3] >> 31) & 0x01]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] >> 7) & 0x01])) << 16; +#endif + rep2_d4(f1_ll, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[0] ^= _tt; +} + +gf_decl void gf_mulx4_ll(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[3] >> 28) & 0x0f]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] >> 4) & 0x0f])) << 16; +#endif + rep2_d4(f4_ll, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +gf_decl void gf_mulx8_ll(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[UNIT_PTR(x)[3] >> 24]; +#else + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[3] & 0xff])) << 16; +#endif + rep2_d4(f8_ll, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +#else + +#define f1_ll(n,r,x) r[n] = (x[n] << 1) | (n ? x[n-1] >> 7 : 0) +#define f4_ll(n,r,x) r[n] = (x[n] << 4) | (n ? 
x[n-1] >> 4 : 0) + +gf_decl void gf_mulx1_ll(gf_t r, const gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[15] >> 7) & 0x01]; + rep2_d16(f1_ll, UNIT_PTR(r), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(r)[0] ^= _tt & 0xff; +#else + UNIT_PTR(r)[0] ^= _tt >> 8; +#endif +} + +gf_decl void gf_mulx4_ll(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[15] >> 4) & 0x0f]; + rep2_d16(f4_ll, UNIT_PTR(x), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[1] ^= _tt >> 8; + UNIT_PTR(x)[0] ^= _tt & 0xff; +#else + UNIT_PTR(x)[1] ^= _tt & 0xff; + UNIT_PTR(x)[0] = _tt >> 8; +#endif +} + +gf_decl void gf_mulx8_ll(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[UNIT_PTR(x)[15]]; + memmove(UNIT_PTR(x) + 1, UNIT_PTR(x), 15); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[1] ^= _tt >> 8; + UNIT_PTR(x)[0] = _tt & 0xff; +#else + UNIT_PTR(x)[1] ^= _tt & 0xff; + UNIT_PTR(x)[0] = _tt >> 8; +#endif +} + +#endif + +/* BL Mode Galois Field operations + + x[0] x[1] x[2] x[3] x[4] x[5] x[6] x[7] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +M....... ........ ........ ........ ........ ........ ........ ........ +127..120 119..112 111..104 103...96 95....88 87....80 79....72 71....64 + x[8] x[9] x[10] x[11] x[12] x[13] x[14] x[15] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +........ ........ ........ ........ ........ ........ ........ 10000111 +63....56 55....48 47....40 39....32 31....24 23....16 15....08 07....00 +*/ + +#if UNIT_BITS == 64 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_bl(n,r,x) r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \ + | (!n ? x[n+1] << 49 : 0)) & MASK(0x01)) +#define f4_bl(n,r,x) r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \ + | (!n ? x[n+1] << 52 : 0)) & MASK(0x0f)) +#define f8_bl(n,r,x) r[n] = (x[n] >> 8) | (!n ? x[n+1] << 56 : 0) +#else +#define f1_bl(n,r,x) r[n] = (x[n] << 1) | (!n ? x[n+1] >> 63 : 0) +#define f4_bl(n,r,x) r[n] = (x[n] << 4) | (!n ? x[n+1] >> 60 : 0) +#define f8_bl(n,r,x) r[n] = (x[n] << 8) | (!n ? x[n+1] >> 56 : 0) +#endif + +gf_decl void gf_mulx1_bl(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01])) << 48; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 63) & 0x01]; +#endif + rep2_u2(f1_bl, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[1] ^= _tt; +} + +gf_decl void gf_mulx4_bl(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f])) << 48; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 60) & 0x0f]; +#endif + rep2_u2(f4_bl, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[1] ^= _tt; +} + +gf_decl void gf_mulx8_bl(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 48; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 56) & 0xff]; +#endif + rep2_u2(f8_bl, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[1] ^= _tt; +} + +#elif UNIT_BITS == 32 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_bl(n,r,x) r[n] = ((x[n] << 1) & ~MASK(0x01)) | (((x[n] >> 15) \ + | (n < 3 ? x[n+1] << 17 : 0)) & MASK(0x01)) +#define f4_bl(n,r,x) r[n] = ((x[n] << 4) & ~MASK(0x0f)) | (((x[n] >> 12) \ + | (n < 3 ? x[n+1] << 20 : 0)) & MASK(0x0f)) +#define f8_bl(n,r,x) r[n] = (x[n] >> 8) | (n < 3 ? x[n+1] << 24 : 0) +#else +#define f1_bl(n,r,x) r[n] = (x[n] << 1) | (n < 3 ? x[n+1] >> 31 : 0) +#define f4_bl(n,r,x) r[n] = (x[n] << 4) | (n < 3 ? 
x[n+1] >> 28 : 0) +#define f8_bl(n,r,x) r[n] = (x[n] << 8) | (n < 3 ? x[n+1] >> 24 : 0) +#endif + +gf_decl void gf_mulx1_bl(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01])) << 16; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 31) & 0x01]; +#endif + rep2_u4(f1_bl, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[3] ^= _tt; +} + +gf_decl void gf_mulx4_bl(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f])) << 16; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 28) & 0x0f]; +#endif + rep2_u4(f4_bl, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[3] ^= _tt; +} + +gf_decl void gf_mulx8_bl(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 16; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 24) & 0xff]; +#endif + rep2_u4(f8_bl, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[3] ^= _tt; +} + +#else + +#define f1_bl(n,r,x) r[n] = (x[n] << 1) | (n < 15 ? x[n+1] >> 7 : 0) +#define f4_bl(n,r,x) r[n] = (x[n] << 4) | (n < 15 ? x[n+1] >> 4 : 0) + +gf_decl void gf_mulx1_bl(gf_t r, const gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[0] >> 7) & 0x01]; + rep2_u16(f1_bl, UNIT_PTR(r), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(r)[15] ^= _tt >> 8; +#else + UNIT_PTR(r)[15] ^= _tt & 0xff; +#endif +} + +gf_decl void gf_mulx4_bl(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[0] >> 4) & 0x0f]; + rep2_u16(f4_bl, UNIT_PTR(x), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[14] ^= _tt & 0xff; + UNIT_PTR(x)[15] ^= _tt >> 8; +#else + UNIT_PTR(x)[14] ^= _tt >> 8; + UNIT_PTR(x)[15] = _tt & 0xff; +#endif +} + +gf_decl void gf_mulx8_bl(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[UNIT_PTR(x)[0]]; + memmove(UNIT_PTR(x), UNIT_PTR(x) + 1, 15); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[14] ^= _tt & 0xff; + UNIT_PTR(x)[15] = _tt >> 8; +#else + UNIT_PTR(x)[14] ^= _tt >> 8; + UNIT_PTR(x)[15] = _tt & 0xff; +#endif +} + +#endif + +/* LB Mode Galois Field operations + + x[0] x[1] x[2] x[3] x[4] x[5] x[6] x[7] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +11100001 ........ ........ ........ ........ ........ ........ ........ +00....07 08....15 16....23 24....31 32....39 40....47 48....55 56....63 + x[8] x[9] x[10] x[11] x[12] x[13] x[14] x[15] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +........ ........ ........ ........ ........ ........ ........ .......M +64....71 72....79 80....87 88....95 96...103 104..111 112..119 120..127 +*/ + +#if UNIT_BITS == 64 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_lb(n,r,x) r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \ + | (n ? x[n-1] >> 49 : 0)) & MASK(0x80)) +#define f4_lb(n,r,x) r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \ + | (n ? x[n-1] >> 52 : 0)) & MASK(0xf0)) +#define f8_lb(n,r,x) r[n] = (x[n] << 8) | (n ? x[n-1] >> 56 : 0) +#else +#define f1_lb(n,r,x) r[n] = (x[n] >> 1) | (n ? x[n-1] << 63 : 0) +#define f4_lb(n,r,x) r[n] = (x[n] >> 4) | (n ? x[n-1] << 60 : 0) +#define f8_lb(n,r,x) x[n] = (x[n] >> 8) | (n ? 
x[n-1] << 56 : 0) +#endif + +gf_decl void gf_mulx1_lb(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[1] >> 49) & MASK(0x80)]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] << 7) & 0xff])) << 48; +#endif + rep2_d2(f1_lb, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[0] ^= _tt; +} + +gf_decl void gf_mulx4_lb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[1] >> 52) & MASK(0xf0)]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[1] << 4) & 0xff])) << 48; +#endif + rep2_d2(f4_lb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +gf_decl void gf_mulx8_lb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[UNIT_PTR(x)[1] >> 56]; +#else + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[1] & 0xff])) << 48; +#endif + rep2_d2(f8_lb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +#elif UNIT_BITS == 32 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_lb(n,r,x) r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \ + | (n ? x[n-1] >> 17 : 0)) & MASK(0x80)) +#define f4_lb(n,r,x) r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \ + | (n ? x[n-1] >> 20 : 0)) & MASK(0xf0)) +#define f8_lb(n,r,x) r[n] = (x[n] << 8) | (n ? x[n-1] >> 24 : 0) +#else +#define f1_lb(n,r,x) r[n] = (x[n] >> 1) | (n ? x[n-1] << 31 : 0) +#define f4_lb(n,r,x) r[n] = (x[n] >> 4) | (n ? x[n-1] << 28 : 0) +#define f8_lb(n,r,x) r[n] = (x[n] >> 8) | (n ? x[n-1] << 24 : 0) +#endif + +gf_decl void gf_mulx1_lb(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[3] >> 17) & MASK(0x80)]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] << 7) & 0xff])) << 16; +#endif + rep2_d4(f1_lb, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[0] ^= _tt; +} + +gf_decl void gf_mulx4_lb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[(UNIT_PTR(x)[3] >> 20) & MASK(0xf0)]; +#else + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[3] << 4) & 0xff])) << 16; +#endif + rep2_d4(f4_lb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +gf_decl void gf_mulx8_lb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = gf_tab[UNIT_PTR(x)[3] >> 24]; +#else + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[3] & 0xff])) << 16; +#endif + rep2_d4(f8_lb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[0] ^= _tt; +} + +#else + +#define f1_lb(n,r,x) r[n] = (x[n] >> 1) | (n ? x[n-1] << 7 : 0) +#define f4_lb(n,r,x) r[n] = (x[n] >> 4) | (n ? 
x[n-1] << 4 : 0) + +gf_decl void gf_mulx1_lb(gf_t r, const gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[15] << 7) & 0x80]; + rep2_d16(f1_lb, UNIT_PTR(r), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(r)[0] ^= _tt; +#else + UNIT_PTR(r)[0] ^= _tt >> 8; +#endif +} + +gf_decl void gf_mulx4_lb(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[15] << 4) & 0xf0]; + rep2_d16(f4_lb, UNIT_PTR(x), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[1] ^= _tt >> 8; + UNIT_PTR(x)[0] ^= _tt & 0xff; +#else + UNIT_PTR(x)[1] ^= _tt & 0xff; + UNIT_PTR(x)[0] ^= _tt >> 8; +#endif +} + +gf_decl void gf_mulx8_lb(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[UNIT_PTR(x)[15]]; + memmove(UNIT_PTR(x) + 1, UNIT_PTR(x), 15); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[1] ^= _tt >> 8; + UNIT_PTR(x)[0] = _tt & 0xff; +#else + UNIT_PTR(x)[1] ^= _tt & 0xff; + UNIT_PTR(x)[0] = _tt >> 8; +#endif +} + +#endif + +/* BB Mode Galois Field operations + + x[0] x[1] x[2] x[3] x[4] x[5] x[6] x[7] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +.......M ........ ........ ........ ........ ........ ........ ........ +120..127 112..119 104..111 96...103 88....95 80....87 72....79 64....71 + x[8] x[9] x[10] x[11] x[12] x[13] x[14] x[15] +ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls +........ ........ ........ ........ ........ ........ ........ 11100001 +56....63 48....55 40....47 32....39 24....31 16....23 08....15 00....07 +*/ + +#if UNIT_BITS == 64 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_bb(n,r,x) r[n] = (x[n] >> 1) | (!n ? x[n+1] << 63 : 0) +#define f4_bb(n,r,x) r[n] = (x[n] >> 4) | (!n ? x[n+1] << 60 : 0) +#define f8_bb(n,r,x) r[n] = (x[n] >> 8) | (!n ? x[n+1] << 56 : 0) +#else +#define f1_bb(n,r,x) r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \ + | (!n ? x[n+1] >> 49 : 0)) & MASK(0x80)) +#define f4_bb(n,r,x) r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \ + | (!n ? x[n+1] >> 52 : 0)) & MASK(0xf0)) +#define f8_bb(n,r,x) r[n] = (x[n] << 8) | (!n ? x[n+1] >> 56 : 0) +#endif + +gf_decl void gf_mulx1_bb(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = (( gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80])) << 48; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 49) & 0x80]; +#endif + rep2_u2(f1_bb, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[1] ^= _tt; +} + +gf_decl void gf_mulx4_bb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0])) << 48; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 52) & 0xf0]; +#endif + rep2_u2(f4_bb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[1] ^= _tt; +} + +gf_decl void gf_mulx8_bb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 48; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 56) & 0xff]; +#endif + rep2_u2(f8_bb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[1] ^= _tt; +} + +#elif UNIT_BITS == 32 + +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN +#define f1_bb(n,r,x) r[n] = (x[n] >> 1) | (n < 3 ? x[n+1] << 31 : 0) +#define f4_bb(n,r,x) r[n] = (x[n] >> 4) | (n < 3 ? x[n+1] << 28 : 0) +#define f8_bb(n,r,x) r[n] = (x[n] >> 8) | (n < 3 ? x[n+1] << 24 : 0) +#else +#define f1_bb(n,r,x) r[n] = ((x[n] >> 1) & ~MASK(0x80)) | (((x[n] << 15) \ + | (n < 3 ? x[n+1] >> 17 : 0)) & MASK(0x80)) +#define f4_bb(n,r,x) r[n] = ((x[n] >> 4) & ~MASK(0xf0)) | (((x[n] << 12) \ + | (n < 3 ? 
x[n+1] >> 20 : 0)) & MASK(0xf0)) +#define f8_bb(n,r,x) r[n] = (x[n] << 8) | (n < 3 ? x[n+1] >> 24 : 0) +#endif + +gf_decl void gf_mulx1_bb(gf_t r, const gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80])) << 16; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 17) & 0x80]; +#endif + rep2_u4(f1_bb, UNIT_PTR(r), UNIT_PTR(x)); + UNIT_PTR(r)[3] ^= _tt; +} + +gf_decl void gf_mulx4_bb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0])) << 16; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 20) & 0xf0]; +#endif + rep2_u4(f4_bb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[3] ^= _tt; +} + +gf_decl void gf_mulx8_bb(gf_t x) +{ gf_unit_t _tt; +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + _tt = ((gf_unit_t)(gf_tab[UNIT_PTR(x)[0] & 0xff])) << 16; +#else + _tt = gf_tab[(UNIT_PTR(x)[0] >> 24) & 0xff]; +#endif + rep2_u4(f8_bb, UNIT_PTR(x), UNIT_PTR(x)); + UNIT_PTR(x)[3] ^= _tt; +} + +#else + +#define f1_bb(n,r,x) r[n] = (x[n] >> 1) | (n < 15 ? x[n+1] << 7 : 0) +#define f4_bb(n,r,x) r[n] = (x[n] >> 4) | (n < 15 ? x[n+1] << 4 : 0) + +gf_decl void gf_mulx1_bb(gf_t r, const gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[0] << 7) & 0x80]; + rep2_u16(f1_bb, UNIT_PTR(r), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(r)[15] ^= _tt >> 8; +#else + UNIT_PTR(r)[15] ^= _tt; +#endif +} + +gf_decl void gf_mulx4_bb(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[(UNIT_PTR(x)[0] << 4) & 0xf0]; + rep2_u16(f4_bb, UNIT_PTR(x), UNIT_PTR(x)); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[14] ^= _tt & 0xff; + UNIT_PTR(x)[15] ^= _tt >> 8; +#else + UNIT_PTR(x)[14] ^= _tt >> 8; + UNIT_PTR(x)[15] ^= _tt & 0xff; +#endif +} + +gf_decl void gf_mulx8_bb(gf_t x) +{ uint16_t _tt; + _tt = gf_tab[UNIT_PTR(x)[0]]; + memmove(UNIT_PTR(x), UNIT_PTR(x) + 1, 15); +#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + UNIT_PTR(x)[14] ^= _tt & 0xff; + UNIT_PTR(x)[15] = _tt >> 8; +#else + UNIT_PTR(x)[14] ^= _tt >> 8; + UNIT_PTR(x)[15] = _tt & 0xff; +#endif +} + +#endif + +#endif diff --git a/crypto/aes/mode_hdr.h b/crypto/aes/mode_hdr.h new file mode 100644 index 000000000..ce9408eb4 --- /dev/null +++ b/crypto/aes/mode_hdr.h @@ -0,0 +1,329 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2014, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 18/02/2014 + +This header file is an INTERNAL file which supports mode implementation +*/ + +#ifndef _MODE_HDR_H +#define _MODE_HDR_H + +#include +#include + +#include "brg_endian.h" + +/* This define sets the units in which buffers are processed. This code + can provide significant speed gains if buffers can be processed in + 32 or 64 bit chunks rather than in bytes. 
This define sets the units + in which buffers will be accessed if possible +*/ +#if !defined( UNIT_BITS ) +# if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN +# if 0 +# define UNIT_BITS 32 +# elif 1 +# define UNIT_BITS 64 +# endif +# elif defined( _WIN64 ) +# define UNIT_BITS 64 +# else +# define UNIT_BITS 32 +# endif +#endif + +#if UNIT_BITS == 64 && !defined( NEED_UINT_64T ) +# define NEED_UINT_64T +#endif + +#include "brg_types.h" + +/* Use of inlines is preferred but code blocks can also be expanded inline + using 'defines'. But the latter approach will typically generate a LOT + of code and is not recommended. +*/ +#if 1 && !defined( USE_INLINING ) +# define USE_INLINING +#endif + +#if defined( _MSC_VER ) +# if _MSC_VER >= 1400 +# include +# include +# pragma intrinsic(memset) +# pragma intrinsic(memcpy) +# define rotl32 _rotl +# define rotr32 _rotr +# define rotl64 _rotl64 +# define rotr64 _rotl64 +# define bswap_16(x) _byteswap_ushort(x) +# define bswap_32(x) _byteswap_ulong(x) +# define bswap_64(x) _byteswap_uint64(x) +# else +# define rotl32 _lrotl +# define rotr32 _lrotr +# endif +#endif + +#if defined( USE_INLINING ) +# if defined( _MSC_VER ) +# define mh_decl __inline +# elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# define mh_decl static inline +# else +# define mh_decl static +# endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define UI8_PTR(x) UPTR_CAST(x, 8) +#define UI16_PTR(x) UPTR_CAST(x, 16) +#define UI32_PTR(x) UPTR_CAST(x, 32) +#define UI64_PTR(x) UPTR_CAST(x, 64) +#define UNIT_PTR(x) UPTR_CAST(x, UNIT_BITS) + +#define UI8_VAL(x) UNIT_CAST(x, 8) +#define UI16_VAL(x) UNIT_CAST(x, 16) +#define UI32_VAL(x) UNIT_CAST(x, 32) +#define UI64_VAL(x) UNIT_CAST(x, 64) +#define UNIT_VAL(x) UNIT_CAST(x, UNIT_BITS) + +#define BUF_INC (UNIT_BITS >> 3) +#define BUF_ADRMASK ((UNIT_BITS >> 3) - 1) + +#define rep2_u2(f,r,x) f( 0,r,x); f( 1,r,x) +#define rep2_u4(f,r,x) f( 0,r,x); f( 1,r,x); f( 2,r,x); f( 3,r,x) +#define rep2_u16(f,r,x) f( 0,r,x); f( 1,r,x); f( 2,r,x); f( 3,r,x); \ + f( 4,r,x); f( 5,r,x); f( 6,r,x); f( 7,r,x); \ + f( 8,r,x); f( 9,r,x); f(10,r,x); f(11,r,x); \ + f(12,r,x); f(13,r,x); f(14,r,x); f(15,r,x) + +#define rep2_d2(f,r,x) f( 1,r,x); f( 0,r,x) +#define rep2_d4(f,r,x) f( 3,r,x); f( 2,r,x); f( 1,r,x); f( 0,r,x) +#define rep2_d16(f,r,x) f(15,r,x); f(14,r,x); f(13,r,x); f(12,r,x); \ + f(11,r,x); f(10,r,x); f( 9,r,x); f( 8,r,x); \ + f( 7,r,x); f( 6,r,x); f( 5,r,x); f( 4,r,x); \ + f( 3,r,x); f( 2,r,x); f( 1,r,x); f( 0,r,x) + +#define rep3_u2(f,r,x,y,c) f( 0,r,x,y,c); f( 1,r,x,y,c) +#define rep3_u4(f,r,x,y,c) f( 0,r,x,y,c); f( 1,r,x,y,c); f( 2,r,x,y,c); f( 3,r,x,y,c) +#define rep3_u16(f,r,x,y,c) f( 0,r,x,y,c); f( 1,r,x,y,c); f( 2,r,x,y,c); f( 3,r,x,y,c); \ + f( 4,r,x,y,c); f( 5,r,x,y,c); f( 6,r,x,y,c); f( 7,r,x,y,c); \ + f( 8,r,x,y,c); f( 9,r,x,y,c); f(10,r,x,y,c); f(11,r,x,y,c); \ + f(12,r,x,y,c); f(13,r,x,y,c); f(14,r,x,y,c); f(15,r,x,y,c) + +#define rep3_d2(f,r,x,y,c) f( 1,r,x,y,c); f( 0,r,x,y,c) +#define rep3_d4(f,r,x,y,c) f( 3,r,x,y,c); f( 2,r,x,y,c); f( 1,r,x,y,c); f( 0,r,x,y,c) +#define rep3_d16(f,r,x,y,c) f(15,r,x,y,c); f(14,r,x,y,c); f(13,r,x,y,c); f(12,r,x,y,c); \ + f(11,r,x,y,c); f(10,r,x,y,c); f( 9,r,x,y,c); f( 8,r,x,y,c); \ + f( 7,r,x,y,c); f( 6,r,x,y,c); f( 5,r,x,y,c); f( 4,r,x,y,c); \ + f( 3,r,x,y,c); f( 2,r,x,y,c); f( 1,r,x,y,c); f( 0,r,x,y,c) + +/* function pointers might be used for fast XOR operations */ + +typedef void (*xor_function)(void* r, const void* p, const void* q); + +/* left and right rotates on 32 and 64 bit variables */ 
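/* Illustrative example (not part of the original source): with the rotate
   helpers defined below, rotl32(0x80000001, 1) == 0x00000003 and
   rotr32(0x00000003, 1) == 0x80000001, i.e. the bit shifted out at one end
   re-enters at the other. */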

#if !defined( rotl32 )  /* NOTE: 0 <= n <= 32 ASSUMED */
mh_decl uint32_t rotl32(uint32_t x, int n)
{
    return (((x) << n) | ((x) >> (32 - n)));
}
#endif

#if !defined( rotr32 )  /* NOTE: 0 <= n <= 32 ASSUMED */
mh_decl uint32_t rotr32(uint32_t x, int n)
{
    return (((x) >> n) | ((x) << (32 - n)));
}
#endif

#if ( UNIT_BITS == 64 ) && !defined( rotl64 )  /* NOTE: 0 <= n <= 64 ASSUMED */
mh_decl uint64_t rotl64(uint64_t x, int n)
{
    return (((x) << n) | ((x) >> (64 - n)));
}
#endif

#if ( UNIT_BITS == 64 ) && !defined( rotr64 )  /* NOTE: 0 <= n <= 64 ASSUMED */
mh_decl uint64_t rotr64(uint64_t x, int n)
{
    return (((x) >> n) | ((x) << (64 - n)));
}
#endif

/* byte order inversions for 16, 32 and 64 bit variables */

#if !defined(bswap_16)
mh_decl uint16_t bswap_16(uint16_t x)
{
    return (uint16_t)((x >> 8) | (x << 8));
}
#endif

#if !defined(bswap_32)
mh_decl uint32_t bswap_32(uint32_t x)
{
    return ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00));
}
#endif

#if ( UNIT_BITS == 64 ) && !defined(bswap_64)
mh_decl uint64_t bswap_64(uint64_t x)
{
    return bswap_32((uint32_t)(x >> 32)) | ((uint64_t)bswap_32((uint32_t)x) << 32);
}
#endif

/* support for fast aligned buffer move, xor and byte swap operations -
   source and destination buffers for move and xor operations must not
   overlap, those for byte order reversal must either not overlap or
   must be identical
*/
#define f_copy(n,p,q) p[n] = q[n]
#define f_xor(n,r,p,q,c) r[n] = c(p[n] ^ q[n])

mh_decl void copy_block(void* p, const void* q)
{
    memcpy(p, q, 16);
}

mh_decl void copy_block_aligned(void *p, const void *q)
{
#if UNIT_BITS == 8
    memcpy(p, q, 16);
#elif UNIT_BITS == 32
    rep2_u4(f_copy,UNIT_PTR(p),UNIT_PTR(q));
#else
    rep2_u2(f_copy,UNIT_PTR(p),UNIT_PTR(q));
#endif
}

mh_decl void xor_block(void *r, const void* p, const void* q)
{
    rep3_u16(f_xor, UI8_PTR(r), UI8_PTR(p), UI8_PTR(q), UI8_VAL);
}

mh_decl void xor_block_aligned(void *r, const void *p, const void *q)
{
#if UNIT_BITS == 8
    rep3_u16(f_xor, UNIT_PTR(r), UNIT_PTR(p), UNIT_PTR(q), UNIT_VAL);
#elif UNIT_BITS == 32
    rep3_u4(f_xor, UNIT_PTR(r), UNIT_PTR(p), UNIT_PTR(q), UNIT_VAL);
#else
    rep3_u2(f_xor, UNIT_PTR(r), UNIT_PTR(p), UNIT_PTR(q), UNIT_VAL);
#endif
}

/* byte swap within 32-bit words in a 16 byte block; don't move 32-bit words */
mh_decl void bswap32_block(void *d, const void* s)
{
#if UNIT_BITS == 8
    uint8_t t;
    t = UNIT_PTR(s)[ 0]; UNIT_PTR(d)[ 0] = UNIT_PTR(s)[ 3]; UNIT_PTR(d)[ 3] = t;
    t = UNIT_PTR(s)[ 1]; UNIT_PTR(d)[ 1] = UNIT_PTR(s)[ 2]; UNIT_PTR(d)[ 2] = t;
    t = UNIT_PTR(s)[ 4]; UNIT_PTR(d)[ 4] = UNIT_PTR(s)[ 7]; UNIT_PTR(d)[ 7] = t;
    t = UNIT_PTR(s)[ 5]; UNIT_PTR(d)[ 5] = UNIT_PTR(s)[ 6]; UNIT_PTR(d)[ 6] = t;
    t = UNIT_PTR(s)[ 8]; UNIT_PTR(d)[ 8] = UNIT_PTR(s)[11]; UNIT_PTR(d)[11] = t;
    t = UNIT_PTR(s)[ 9]; UNIT_PTR(d)[ 9] = UNIT_PTR(s)[10]; UNIT_PTR(d)[10] = t;
    t = UNIT_PTR(s)[12]; UNIT_PTR(d)[12] = UNIT_PTR(s)[15]; UNIT_PTR(d)[15] = t;
    t = UNIT_PTR(s)[13]; UNIT_PTR(d)[13] = UNIT_PTR(s)[14]; UNIT_PTR(d)[14] = t;
#elif UNIT_BITS == 32
    UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[1] = bswap_32(UNIT_PTR(s)[1]);
    UNIT_PTR(d)[2] = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[3] = bswap_32(UNIT_PTR(s)[3]);
#else
    UI32_PTR(d)[0] = bswap_32(UI32_PTR(s)[0]); UI32_PTR(d)[1] = bswap_32(UI32_PTR(s)[1]);
    UI32_PTR(d)[2] = bswap_32(UI32_PTR(s)[2]); UI32_PTR(d)[3] = bswap_32(UI32_PTR(s)[3]);
#endif
}

/* byte swap within 64-bit words in a 16 byte
block; don't move 64-bit words */
mh_decl void bswap64_block(void *d, const void* s)
{
#if UNIT_BITS == 8
    uint8_t t;
    t = UNIT_PTR(s)[ 0]; UNIT_PTR(d)[ 0] = UNIT_PTR(s)[ 7]; UNIT_PTR(d)[ 7] = t;
    t = UNIT_PTR(s)[ 1]; UNIT_PTR(d)[ 1] = UNIT_PTR(s)[ 6]; UNIT_PTR(d)[ 6] = t;
    t = UNIT_PTR(s)[ 2]; UNIT_PTR(d)[ 2] = UNIT_PTR(s)[ 5]; UNIT_PTR(d)[ 5] = t;
    t = UNIT_PTR(s)[ 3]; UNIT_PTR(d)[ 3] = UNIT_PTR(s)[ 4]; UNIT_PTR(d)[ 4] = t;
    t = UNIT_PTR(s)[ 8]; UNIT_PTR(d)[ 8] = UNIT_PTR(s)[15]; UNIT_PTR(d)[15] = t;
    t = UNIT_PTR(s)[ 9]; UNIT_PTR(d)[ 9] = UNIT_PTR(s)[14]; UNIT_PTR(d)[14] = t;
    t = UNIT_PTR(s)[10]; UNIT_PTR(d)[10] = UNIT_PTR(s)[13]; UNIT_PTR(d)[13] = t;
    t = UNIT_PTR(s)[11]; UNIT_PTR(d)[11] = UNIT_PTR(s)[12]; UNIT_PTR(d)[12] = t;
#elif UNIT_BITS == 32
    uint32_t t;
    t = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[1]); UNIT_PTR(d)[1] = t;
    t = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[2] = bswap_32(UNIT_PTR(s)[3]); UNIT_PTR(d)[3] = t;
#else
    UNIT_PTR(d)[0] = bswap_64(UNIT_PTR(s)[0]); UNIT_PTR(d)[1] = bswap_64(UNIT_PTR(s)[1]);
#endif
}

mh_decl void bswap128_block(void *d, const void* s)
{
#if UNIT_BITS == 8
    uint8_t t;
    t = UNIT_PTR(s)[0]; UNIT_PTR(d)[0] = UNIT_PTR(s)[15]; UNIT_PTR(d)[15] = t;
    t = UNIT_PTR(s)[1]; UNIT_PTR(d)[1] = UNIT_PTR(s)[14]; UNIT_PTR(d)[14] = t;
    t = UNIT_PTR(s)[2]; UNIT_PTR(d)[2] = UNIT_PTR(s)[13]; UNIT_PTR(d)[13] = t;
    t = UNIT_PTR(s)[3]; UNIT_PTR(d)[3] = UNIT_PTR(s)[12]; UNIT_PTR(d)[12] = t;
    t = UNIT_PTR(s)[4]; UNIT_PTR(d)[4] = UNIT_PTR(s)[11]; UNIT_PTR(d)[11] = t;
    t = UNIT_PTR(s)[5]; UNIT_PTR(d)[5] = UNIT_PTR(s)[10]; UNIT_PTR(d)[10] = t;
    t = UNIT_PTR(s)[6]; UNIT_PTR(d)[6] = UNIT_PTR(s)[ 9]; UNIT_PTR(d)[ 9] = t;
    t = UNIT_PTR(s)[7]; UNIT_PTR(d)[7] = UNIT_PTR(s)[ 8]; UNIT_PTR(d)[ 8] = t;
#elif UNIT_BITS == 32
    uint32_t t;
    t = bswap_32(UNIT_PTR(s)[0]); UNIT_PTR(d)[0] = bswap_32(UNIT_PTR(s)[3]); UNIT_PTR(d)[3] = t;
    t = bswap_32(UNIT_PTR(s)[1]); UNIT_PTR(d)[1] = bswap_32(UNIT_PTR(s)[2]); UNIT_PTR(d)[2] = t;
#else
    uint64_t t;
    t = bswap_64(UNIT_PTR(s)[0]); UNIT_PTR(d)[0] = bswap_64(UNIT_PTR(s)[1]); UNIT_PTR(d)[1] = t;
#endif
}

/* platform byte order to big or little endian order for 16, 32 and 64 bit variables */

#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN

# define uint16_t_to_le(x) (x) = bswap_16((x))
# define uint32_t_to_le(x) (x) = bswap_32((x))
# define uint64_t_to_le(x) (x) = bswap_64((x))
# define uint16_t_to_be(x)
# define uint32_t_to_be(x)
# define uint64_t_to_be(x)

#else

# define uint16_t_to_le(x)
# define uint32_t_to_le(x)
# define uint64_t_to_le(x)
# define uint16_t_to_be(x) (x) = bswap_16((x))
# define uint32_t_to_be(x) (x) = bswap_32((x))
# define uint64_t_to_be(x) (x) = bswap_64((x))

#endif

#if defined(__cplusplus)
}
#endif

#endif
diff --git a/tools/style.c.exclude b/tools/style.c.exclude
index e643d7575..5ef29b507 100644
--- a/tools/style.c.exclude
+++ b/tools/style.c.exclude
@@ -1,5 +1,5 @@
 ^\./core/embed/bootloader/protob/
-^\./crypto/aes/aes\(\|crypt\|key\|_modes\|opt\|tab\|tst\)\.
+^\./crypto/aes/
 ^\./crypto/chacha20poly1305/
 ^\./crypto/ed25519-donna/
 ^\./crypto/gui/
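The per-byte reduction table that gf_uint16_xor() expands above can be reproduced
with a few lines of standalone C as a quick sanity check. The sketch below is
illustrative only and is not part of the patch; gf_tab_entry_ll is an invented
name, and the computation assumes the BL/LL ordering, in which bit k of the
overflow byte contributes the 16-bit value 0x0087 << k (0x87 being the low byte
of the GCM field polynomial in that representation).

#include <stdint.h>
#include <stdio.h>

/* recompute one BL/LL table entry: xor together 0x0087 << k for each set bit k */
static uint16_t gf_tab_entry_ll(uint8_t i)
{
    uint16_t w = 0;
    for (int k = 0; k < 8; k++)
        if (i & (1u << k))
            w ^= (uint16_t)(0x0087u << k);
    return w;
}

int main(void)
{
    /* single-bit inputs should give 0x0087, 0x010e, ..., 0x4380,
       matching the ord(..) pairs used in gf_uint16_xor() */
    printf("%04x %04x %04x\n",
           gf_tab_entry_ll(0x01), gf_tab_entry_ll(0x02), gf_tab_entry_ll(0x80));
    return 0;
}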